Posted to commits@mxnet.apache.org by ak...@apache.org on 2021/10/13 20:50:39 UTC

[incubator-mxnet] branch master updated: [submodule] Remove soon to be obsolete dnnl nomenclature from mxnet (#20606)

This is an automated email from the ASF dual-hosted git repository.

akarbown pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 5e04608  [submodule] Remove soon to be obsolete dnnl nomenclature from mxnet (#20606)
5e04608 is described below

commit 5e04608ea5fc88ef54d0e3b4e1e0a170335e1671
Author: bartekkuncer <ba...@intel.com>
AuthorDate: Wed Oct 13 22:48:10 2021 +0200

    [submodule] Remove soon to be obsolete dnnl nomenclature from mxnet (#20606)
    
    * Tests directory
    
    * Docs
    
    * subgraph
    
    * src/operator/subgraph/dnnl/dnnl_subgraph_property.cc
    
    * src/operator/nn/mkldnn -> src/operator/nn/dnnl
    
    * src/operator/nn
    
    * src/operator/quantization
    
    * src/operator/tensor
    
    * Other files
    
    * mkl_mem -> dnnl_mem
    
    * Fix sanity
    
    * Fix miscellaneous
    
    * Apply clang
    
    * Fix license
    
    * Fix linkcheck
    
    * Remove unnecessary onednn header files
    
    * review changes
    
    * fix sanity
    
    * dnnl -> oneDNN/ONEDNN/onednn for functions/variables visible from outside
    
    * fix quantization.py
    
    * Fix contributors
    
    * Apply clang-format
---
 .licenserc.yaml                                    |   4 +-
 CMakeLists.txt                                     |  12 +-
 CONTRIBUTORS.md                                    |   2 +-
 MKLDNN_README.md => DNNL_README.md                 |   2 +-
 LICENSE                                            |   2 +-
 README.md                                          |   2 +-
 cd/python/docker/test_python_image.sh              |   2 +-
 cd/python/pypi/pypi_package.sh                     |   3 +-
 ci/docker/runtime_functions.sh                     |   4 +-
 cpp-package/example/inference/README.md            |   2 +-
 docs/python_docs/python/tutorials/index.rst        |   4 +-
 .../mkldnn_readme.md => dnnl/dnnl_readme.md}       |  14 +-
 .../performance/backend/{mkldnn => dnnl}/index.rst |  14 +-
 .../python/tutorials/performance/backend/index.rst |   8 +-
 .../tutorials/performance/backend/profiler.md      |  14 +-
 .../python/tutorials/performance/index.rst         |   6 +-
 docs/static_site/src/pages/api/faq/env_var.md      |   4 +-
 docs/static_site/src/pages/api/faq/perf.md         |   2 +-
 include/mxnet/base.h                               |   2 +-
 include/mxnet/ndarray.h                            |  78 ++--
 include/onednn/mkldnn.h                            |   1 -
 include/onednn/mkldnn.hpp                          |   1 -
 include/onednn/mkldnn_config.h                     |   1 -
 include/onednn/mkldnn_debug.h                      |   1 -
 include/onednn/mkldnn_dnnl_mangling.h              |   1 -
 include/onednn/mkldnn_types.h                      |   1 -
 include/onednn/mkldnn_version.h                    |   1 -
 python/mxnet/amp/lists/symbol_bf16.py              |   4 +-
 python/mxnet/amp/lists/symbol_fp16.py              |   8 +-
 python/mxnet/contrib/quantization.py               |  13 +-
 src/c_api/c_api.cc                                 |  18 +-
 src/common/exec_utils.h                            |   4 +-
 src/common/utils.h                                 |   6 +-
 src/imperative/attach_op_execs_pass.cc             |   8 +-
 src/imperative/imperative_utils.h                  |  35 +-
 src/ndarray/ndarray.cc                             | 367 ++++++++--------
 src/operator/contrib/batch_norm_relu.cc            |  32 +-
 src/operator/leaky_relu.cc                         |  32 +-
 src/operator/nn/activation.cc                      |  32 +-
 src/operator/nn/batch_norm.cc                      |  32 +-
 src/operator/nn/concat.cc                          |  36 +-
 src/operator/nn/convolution.cc                     |  28 +-
 src/operator/nn/deconvolution.cc                   |  28 +-
 src/operator/nn/dnnl/dnnl_act-inl.h                | 114 +++++
 src/operator/nn/dnnl/dnnl_act.cc                   | 321 ++++++++++++++
 .../mkldnn_base-inl.h => dnnl/dnnl_base-inl.h}     | 425 +++++++++---------
 .../{mkldnn/mkldnn_base.cc => dnnl/dnnl_base.cc}   | 404 +++++++++--------
 .../dnnl_batch_dot-inl.h}                          |  31 +-
 src/operator/nn/dnnl/dnnl_batch_dot.cc             | 127 ++++++
 .../dnnl_batch_norm-inl.h}                         | 256 ++++++-----
 .../mkldnn_concat-inl.h => dnnl/dnnl_concat-inl.h} |  35 +-
 src/operator/nn/dnnl/dnnl_concat.cc                | 128 ++++++
 src/operator/nn/dnnl/dnnl_convolution-inl.h        | 171 ++++++++
 .../dnnl_convolution.cc}                           | 457 ++++++++++----------
 .../{mkldnn/mkldnn_copy.cc => dnnl/dnnl_copy.cc}   |  28 +-
 .../dnnl_deconvolution-inl.h}                      | 230 +++++-----
 .../dnnl_deconvolution.cc}                         | 158 ++++---
 src/operator/nn/dnnl/dnnl_fully_connected-inl.h    | 142 ++++++
 src/operator/nn/dnnl/dnnl_fully_connected.cc       | 327 ++++++++++++++
 .../dnnl_layer_norm-inl.h}                         |  57 ++-
 .../dnnl_layer_norm.cc}                            | 174 ++++----
 src/operator/nn/dnnl/dnnl_log_softmax.cc           | 210 +++++++++
 src/operator/nn/dnnl/dnnl_lrn-inl.h                | 262 +++++++++++
 src/operator/nn/dnnl/dnnl_ops-inl.h                | 197 +++++++++
 .../dnnl_pooling-inl.h}                            | 114 +++--
 src/operator/nn/dnnl/dnnl_pooling.cc               | 401 +++++++++++++++++
 .../dnnl_reshape-inl.h}                            |  31 +-
 src/operator/nn/dnnl/dnnl_reshape.cc               | 145 +++++++
 .../mkldnn_rnn-inl.h => dnnl/dnnl_rnn-inl.h}       | 297 +++++++------
 .../nn/{mkldnn/mkldnn_rnn.cc => dnnl/dnnl_rnn.cc}  | 477 ++++++++++-----------
 .../mkldnn_slice-inl.h => dnnl/dnnl_slice-inl.h}   |  43 +-
 .../{mkldnn/mkldnn_slice.cc => dnnl/dnnl_slice.cc} |  68 +--
 src/operator/nn/dnnl/dnnl_softmax.cc               | 213 +++++++++
 src/operator/nn/dnnl/dnnl_softmax_output.cc        | 124 ++++++
 src/operator/nn/dnnl/dnnl_sum.cc                   | 135 ++++++
 .../mkldnn_transpose.cc => dnnl/dnnl_transpose.cc} |  79 ++--
 src/operator/nn/fully_connected.cc                 |  32 +-
 src/operator/nn/layer_norm.cc                      |  30 +-
 src/operator/nn/log_softmax.cc                     |  28 +-
 src/operator/nn/lrn.cc                             |  34 +-
 src/operator/nn/mkldnn/mkldnn_act-inl.h            | 114 -----
 src/operator/nn/mkldnn/mkldnn_act.cc               | 325 --------------
 src/operator/nn/mkldnn/mkldnn_batch_dot.cc         | 127 ------
 src/operator/nn/mkldnn/mkldnn_concat.cc            | 129 ------
 src/operator/nn/mkldnn/mkldnn_convolution-inl.h    | 172 --------
 .../nn/mkldnn/mkldnn_fully_connected-inl.h         | 143 ------
 src/operator/nn/mkldnn/mkldnn_fully_connected.cc   | 328 --------------
 src/operator/nn/mkldnn/mkldnn_log_softmax.cc       | 214 ---------
 src/operator/nn/mkldnn/mkldnn_lrn-inl.h            | 267 ------------
 src/operator/nn/mkldnn/mkldnn_ops-inl.h            | 197 ---------
 src/operator/nn/mkldnn/mkldnn_pooling.cc           | 405 -----------------
 src/operator/nn/mkldnn/mkldnn_reshape.cc           | 148 -------
 src/operator/nn/mkldnn/mkldnn_softmax.cc           | 214 ---------
 src/operator/nn/mkldnn/mkldnn_softmax_output.cc    | 126 ------
 src/operator/nn/mkldnn/mkldnn_sum.cc               | 137 ------
 src/operator/nn/pooling-inl.h                      |   4 +-
 src/operator/nn/pooling.cc                         |  55 ++-
 src/operator/nn/pooling.cu                         |   2 +-
 src/operator/nn/softmax.cc                         |  28 +-
 src/operator/numpy/np_matrix_op.cc                 |   9 +-
 src/operator/operator_common.h                     |  12 +-
 src/operator/quantization/dequantize.cc            |   8 +-
 .../dnnl_dequantize-inl.h}                         |  79 ++--
 .../dnnl_quantize-inl.h}                           |  66 +--
 .../dnnl_quantize_v2-inl.h}                        |  80 ++--
 .../dnnl_quantized_act.cc}                         |  22 +-
 .../dnnl_quantized_batch_norm.cc}                  |  70 +--
 .../dnnl_quantized_concat.cc}                      |  60 +--
 .../dnnl_quantized_conv.cc}                        |  49 ++-
 .../dnnl_quantized_elemwise_add.cc}                |  98 +++--
 .../dnnl_quantized_flatten.cc}                     |  22 +-
 .../dnnl_quantized_fully_connected.cc}             |  50 +--
 .../dnnl_quantized_ops-inl.h}                      |  20 +-
 .../dnnl_quantized_pooling.cc}                     |  22 +-
 .../dnnl_requantize-inl.h}                         |  58 +--
 src/operator/quantization/quantize.cc              |   6 +-
 src/operator/quantization/quantize_graph_pass.cc   |   2 +-
 src/operator/quantization/quantize_v2.cc           |   8 +-
 src/operator/quantization/quantized_batch_norm.cc  |   4 +-
 src/operator/quantization/quantized_conv.cc        |   8 +-
 .../quantization/quantized_elemwise_add.cc         |   4 +-
 .../quantization/quantized_fully_connected.cc      |  10 +-
 src/operator/quantization/quantized_pooling.cc     |  10 +-
 src/operator/quantization/requantize.cc            |   6 +-
 src/operator/rnn.cc                                |  22 +-
 src/operator/softmax_output.cc                     |  16 +-
 .../dnnl_bn_relu_property.h}                       |  29 +-
 .../{mkldnn/mkldnn_common.h => dnnl/dnnl_common.h} |  60 +--
 .../mkldnn_conv-inl.h => dnnl/dnnl_conv-inl.h}     |  36 +-
 .../{mkldnn/mkldnn_conv.cc => dnnl/dnnl_conv.cc}   | 419 +++++++++---------
 .../dnnl_conv_property.h}                          |  41 +-
 .../dnnl_elemwisemul_post_quantize_property.h}     |  15 +-
 .../{mkldnn/mkldnn_fc-inl.h => dnnl/dnnl_fc-inl.h} |  38 +-
 .../{mkldnn/mkldnn_fc.cc => dnnl/dnnl_fc.cc}       | 318 +++++++-------
 .../dnnl_fc_post_quantize_property.h}              |  31 +-
 .../dnnl_fc_property.h}                            |  41 +-
 .../dnnl_post_quantize_align_scale_property.h}     |  23 +-
 .../dnnl_post_quantize_property.h}                 |  39 +-
 .../dnnl_subgraph_base-inl.h}                      |   8 +-
 .../subgraph/dnnl/dnnl_subgraph_property.cc        |  63 +++
 .../dnnl_transformer-inl.h}                        |  10 +-
 .../dnnl_transformer.cc}                           | 269 ++++++------
 .../dnnl_transformer_post_quantize_property.h}     |  39 +-
 .../dnnl_transformer_qk_property.h}                |  33 +-
 .../dnnl_transformer_valatt_property.h}            |  33 +-
 .../subgraph/mkldnn/mkldnn_subgraph_property.cc    |  63 ---
 .../partitioner/custom_subgraph_property.h         |   8 +-
 src/operator/tensor/amp_cast.cc                    |  64 +--
 src/operator/tensor/cast_storage-inl.h             |  18 +-
 src/operator/tensor/dot.cc                         |  16 +-
 src/operator/tensor/elemwise_binary_op_basic.cc    |  26 +-
 src/operator/tensor/elemwise_sum.cc                |  17 +-
 src/operator/tensor/elemwise_unary_op.h            |   2 +-
 src/operator/tensor/elemwise_unary_op_basic.cc     |  13 +-
 src/operator/tensor/elemwise_unary_op_logexp.cc    |   7 +-
 src/operator/tensor/elemwise_unary_op_pow.cc       |   7 +-
 src/operator/tensor/matrix_op-inl.h                |   6 +-
 src/operator/tensor/matrix_op.cc                   |  58 +--
 src/serialization/cnpy.cc                          |   4 +-
 src/storage/cpu_device_storage.h                   |   4 +-
 src/storage/storage_manager_helpers.h              |   4 +-
 tests/cpp/include/{test_mkldnn.h => test_dnnl.h}   | 290 ++++++-------
 ...ldnn_operator_test.cc => dnnl_operator_test.cc} | 105 +++--
 .../cpp/operator/{mkldnn_test.cc => dnnl_test.cc}  | 201 +++++----
 tests/cpp/storage/storage_test.cc                  |  10 +-
 tests/nightly/test_large_array.py                  |   2 +-
 tests/nightly/test_np_large_array.py               |   2 +-
 .../data/test_dnnl_test_dnnl_model_model1.json}    |   0
 .../{mkl => dnnl}/subgraphs/subgraph_common.py     |  18 +-
 .../{mkl => dnnl}/subgraphs/test_conv_subgraph.py  |  14 +-
 .../{mkl => dnnl}/subgraphs/test_fc_subgraph.py    |   0
 .../subgraphs/test_transformer_subgraph.py         |   2 +-
 tests/python/{mkl => dnnl}/test_amp.py             |   0
 tests/python/{mkl => dnnl}/test_bf16_operator.py   |   0
 .../{mkl/test_mkldnn.py => dnnl/test_dnnl.py}      |  30 +-
 .../test_quantization_dnnl.py}                     |   0
 tests/python/gpu/test_gluon_model_zoo_gpu.py       |   4 +-
 tests/python/gpu/test_kvstore_gpu.py               |   4 +-
 tests/python/quantization/test_quantization.py     |  20 +-
 tests/python/unittest/test_numpy_gluon.py          |   2 +-
 tests/python/unittest/test_sparse_operator.py      |   2 +-
 tests/tutorials/test_sanity_tutorials.py           |   6 +-
 tests/tutorials/test_tutorials.py                  |   4 +-
 tools/license_header.py                            |   2 +-
 184 files changed, 6893 insertions(+), 7019 deletions(-)

diff --git a/.licenserc.yaml b/.licenserc.yaml
index f7e262a..dc6e8fa 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -35,7 +35,7 @@ header:
     - 'src/operator/contrib/multi_proposal.cu'
     - 'src/operator/contrib/psroi_pooling.cc'
     - 'src/operator/contrib/psroi_pooling.cu'
-    - 'src/operator/nn/mkldnn/mkldnn_base-inl.h'
+    - 'src/operator/nn/dnnl/dnnl_base-inl.h'
     # files licensed under boost license
     - 'cmake/Modules/FindJeMalloc.cmake'
     # files licensed under bsd 3-clause
@@ -64,7 +64,7 @@ header:
     - 'include/mshadow' # symlink to 3rdparty/mshadow/mshadow
     - 'include/onednn' # symlinks to 3rdparty/onednn
     # test/build data
-    - 'tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json'
+    - 'tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json'
 
 
   comment: on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 882e8b0..6966920 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -273,16 +273,16 @@ if(USE_ONEDNN)
   endif()
 
   function(load_onednn)
-    set(MKLDNN_BUILD_TESTS OFF CACHE INTERNAL "" FORCE)
-    set(MKLDNN_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE)
-    set(MKLDNN_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE)
-    set(MKLDNN_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE)
-    set(MKLDNN_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE)
+    set(DNNL_BUILD_TESTS OFF CACHE INTERNAL "" FORCE)
+    set(DNNL_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE)
+    set(DNNL_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE)
+    set(DNNL_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE)
+    set(DNNL_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE)
     set(DNNL_ENABLE_CONCURRENT_EXEC ON CACHE INTERNAL "" FORCE)
     set(DNNL_ENABLE_PRIMITIVE_CACHE ON CACHE INTERNAL "" FORCE)
 
     if(NOT USE_OPENMP)
-      set(MKLDNN_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE)
+      set(DNNL_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE)
     endif()
 
     set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/onednn")
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index c02414c..47b491d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -104,7 +104,7 @@ healthy project. The PPMC actively seeks to appoint new committers from the list
 * [Aaron Markham](https://github.com/aaronmarkham)
 * [Alex Zai](https://github.com/azai91)
 * [Anirudh Acharya](https://github.com/anirudhacharya)
-* [Anna Karbownik]((https://github.com/akarbown)
+* [Anna Karbownik](https://github.com/akarbown)
 * [Aston Zhang](https://github.com/astonzhang)
 * [Chaitanya Bapat](https://github.com/ChaiBapchya)
 * [Ciyong Chen](https://github.com/ciyongch)
diff --git a/MKLDNN_README.md b/DNNL_README.md
similarity index 83%
rename from MKLDNN_README.md
rename to DNNL_README.md
index 795e502..d9d9d35 100644
--- a/MKLDNN_README.md
+++ b/DNNL_README.md
@@ -18,4 +18,4 @@
   ~
 -->
 
-File is moved to [docs/tutorials/mkldnn/MKLDNN_README.md](docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md).
+File is moved to [docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md](docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md).
diff --git a/LICENSE b/LICENSE
index 80abbe1..d06d50f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -252,7 +252,7 @@
     src/operator/contrib/multi_proposal.cu
     src/operator/contrib/psroi_pooling.cc
     src/operator/contrib/psroi_pooling.cu
-    src/operator/nn/mkldnn/mkldnn_base-inl.h
+    src/operator/nn/dnnl/dnnl_base-inl.h
 
     =======================================================================================
     MIT license
diff --git a/README.md b/README.md
index baf1e5e..638987f 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ What's New
 
 ### Ecosystem News
 
-* [ONEDNN for Faster CPU Performance](docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md)
+* [oneDNN for Faster CPU Performance](docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md)
 * [MXNet Memory Monger, Training Deeper Nets with Sublinear Memory Cost](https://github.com/dmlc/mxnet-memonger)
 * [Tutorial for NVidia GTC 2016](https://github.com/dmlc/mxnet-gtc-tutorial)
 * [MXNet.js: Javascript Package for Deep Learning in Browser (without server)](https://github.com/dmlc/mxnet.js/)
diff --git a/cd/python/docker/test_python_image.sh b/cd/python/docker/test_python_image.sh
index be4f9dc..88f21db 100755
--- a/cd/python/docker/test_python_image.sh
+++ b/cd/python/docker/test_python_image.sh
@@ -33,7 +33,7 @@ fi
 
 # Execute tests
 if [[ $mxnet_variant != native ]]; then
-    python3 tests/python/mkl/test_mkldnn.py
+    python3 tests/python/dnnl/test_dnnl.py
 fi
 
 # TODO: Add more tests (18549)
diff --git a/cd/python/pypi/pypi_package.sh b/cd/python/pypi/pypi_package.sh
index 076f85a..26626ef 100755
--- a/cd/python/pypi/pypi_package.sh
+++ b/cd/python/pypi/pypi_package.sh
@@ -22,11 +22,10 @@ set -ex
 export mxnet_variant=${1:?"Please specify the mxnet variant"}
 
 # Due to this PR: https://github.com/apache/incubator-mxnet/pull/14899
-# The setup.py expects that mkldnn_version.h be present in
+# The setup.py expects that dnnl_version.h be present in
 # mxnet-build/3rdparty/onednn/build/install/include
 # The artifact repository stores this file in the dependencies
 # and CD unpacks it to a directory called cd_misc
-# Nov. 2019 Update: With v1.1, MKL-DNN is renaming to DNNL. Hence changing the prefix of file name.
 if [ -f "cd_misc/dnnl_version.h" ]; then
   mkdir -p 3rdparty/onednn/include/oneapi/dnnl
   cp cd_misc/dnnl_version.h 3rdparty/onednn/include/oneapi/dnnl/.
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index be0f1ca..3345ee7 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -801,7 +801,7 @@ cd_unittest_ubuntu() {
     fi
 
     if [[ ${mxnet_variant} = *mkl ]]; then
-        OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -n 4 --durations=50 --verbose tests/python/mkl
+        OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -n 4 --durations=50 --verbose tests/python/dnnl
     fi
 }
 
@@ -841,7 +841,7 @@ unittest_ubuntu_python3_cpu_onednn() {
     MXNET_ENGINE_TYPE=NaiveEngine \
                      OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
-    pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl
+    pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/dnnl
 }
 
 unittest_array_api_standardization() {
diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md
index ddc8a19..fc81dea 100644
--- a/cpp-package/example/inference/README.md
+++ b/cpp-package/example/inference/README.md
@@ -30,7 +30,7 @@ This directory contains following examples. In order to run the examples, ensure
 
 ## [imagenet_inference.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/imagenet_inference.cpp>)
 
-This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. Now this script also supports inference with quantized CNN models generated by Intel® MKL-DNN (see this [quantization flow](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md)). By using C++ API, the latency of most models will be reduced to some extent compared with current Python implementation.
+This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. Now this script also supports inference with quantized CNN models generated by oneDNN (see this [quantization flow](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md)). By using C++ API, the latency of most models will be reduced to some extent compared with current Python implementation.
 
 Most of CNN models have been tested on Linux systems. And 50000 images are used to collect accuracy numbers. Please refer to this [README](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md) for  more details about accuracy.
 
diff --git a/docs/python_docs/python/tutorials/index.rst b/docs/python_docs/python/tutorials/index.rst
index 2e0de42..e9a61be 100644
--- a/docs/python_docs/python/tutorials/index.rst
+++ b/docs/python_docs/python/tutorials/index.rst
@@ -84,10 +84,10 @@ Performance
       How to use int8 in your model to boost training speed.
 
    .. card::
-      :title: MKL-DNN
+      :title: oneDNN
       :link: performance/backend/mkldnn/index.html
 
-      How to get the most from your CPU by using Intel's MKL-DNN.
+      How to get the most from your CPU by using oneDNN.
 
    .. card::
       :title: TVM
diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md b/docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md
similarity index 96%
rename from docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md
rename to docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md
index 8ff92fe..e68dc53 100644
--- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_readme.md
+++ b/docs/python_docs/python/tutorials/performance/backend/dnnl/dnnl_readme.md
@@ -208,11 +208,11 @@ o = exe.outputs[0]
 t = o.asnumpy()
 ```
 
-More detailed debugging and profiling information can be logged by setting the environment variable 'MKLDNN_VERBOSE':
+More detailed debugging and profiling information can be logged by setting the environment variable 'DNNL_VERBOSE':
 ```
-export MKLDNN_VERBOSE=1
+export DNNL_VERBOSE=1
 ```
-For example, by running above code snippet, the following debugging logs providing more insights on ONEDNN primitives `convolution` and `reorder`. That includes: Memory layout, infer shape and the time cost of primitive execution.
+For example, by running the above code snippet, the following debugging logs provide more insights on the oneDNN primitives `convolution` and `reorder`, including memory layout, inferred shape and the time cost of primitive execution.
 ```
 dnnl_verbose,info,DNNL v1.1.2 (commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d)
 dnnl_verbose,info,Detected ISA is Intel AVX-512 with Intel DL Boost
@@ -281,7 +281,7 @@ MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1
 Graph optimization with subgraph is available and enabled by default in master branch. For MXNet release v1.5, you can manually enable it by:
 
 ```
-export MXNET_SUBGRAPH_BACKEND=MKLDNN
+export MXNET_SUBGRAPH_BACKEND=ONEDNN
 ```
 
 This limitations of this experimental feature are:
@@ -293,7 +293,7 @@ This limitations of this experimental feature are:
 
 <h2 id="7">Quantization and Inference with INT8</h2>
 
-Benefiting from Intel ONEDNN, MXNet built with Intel ONEDNN brings outstanding performance improvement on quantization and inference with INT8 Intel CPU Platform on Intel Xeon Scalable Platform.
+Benefiting from oneDNN, MXNet built with oneDNN brings outstanding performance improvement on quantization and inference with INT8 on the Intel Xeon Scalable platform.
 
 - [CNN Quantization Examples](https://github.com/apache/incubator-mxnet/tree/master/example/quantization).
 
@@ -303,6 +303,6 @@ Benefiting from Intel ONEDNN, MXNet built with Intel ONEDNN brings outstanding p
 
 - For questions or support specific to MKL, visit the [Intel MKL](https://software.intel.com/en-us/mkl) website.
 
-- For questions or support specific to ONEDNN, visit the [Intel ONEDNN](https://github.com/oneapi-src/oneDNN) website.
+- For questions or support specific to oneDNN, visit the [oneDNN](https://github.com/oneapi-src/oneDNN) website.
 
-- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with ONEDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN).
+- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with oneDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN).
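
For context, a minimal Python sketch (not part of this commit) of how the renamed `DNNL_VERBOSE` flag described above can be exercised, assuming an MXNet build with oneDNN enabled; the small convolution is only there to trigger some verbose output:

```
import os
os.environ['DNNL_VERBOSE'] = '1'  # set before the library is loaded

import mxnet as mx
from mxnet.runtime import Features

print(Features().is_enabled('ONEDNN'))  # True on an oneDNN-enabled build

# A small convolution is enough to trigger dnnl_verbose output on stdout.
data   = mx.nd.random.uniform(shape=(1, 3, 224, 224))
weight = mx.nd.random.uniform(shape=(8, 3, 3, 3))
out = mx.nd.Convolution(data=data, weight=weight, no_bias=True,
                        kernel=(3, 3), num_filter=8)
out.wait_to_read()
```
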
diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst b/docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst
similarity index 78%
rename from docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst
rename to docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst
index ec85855..116458c 100644
--- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/index.rst
+++ b/docs/python_docs/python/tutorials/performance/backend/dnnl/index.rst
@@ -15,22 +15,22 @@
    specific language governing permissions and limitations
    under the License.
 
-Intel MKL-DNN
+oneDNN
 =============
 
 .. container:: cards
 
    .. card::
-      :title: MKL-DNN Installation and Verification
-      :link: mkldnn_readme
+      :title: oneDNN Installation and Verification
+      :link: dnnl_readme
 
-      A guide on using MKL-DNN with MXNet.
+      A guide on using oneDNN with MXNet.
 
    .. card::
-      :title: MKL-DNN Quantization
-      :link: mkldnn_quantization
+      :title: oneDNN Quantization
+      :link: dnnl_quantization
 
-      How to perform quantization with MKLDNN
+      How to perform quantization with oneDNN
 
 .. toctree::
    :hidden:
diff --git a/docs/python_docs/python/tutorials/performance/backend/index.rst b/docs/python_docs/python/tutorials/performance/backend/index.rst
index 942f399..d9b2947 100644
--- a/docs/python_docs/python/tutorials/performance/backend/index.rst
+++ b/docs/python_docs/python/tutorials/performance/backend/index.rst
@@ -22,10 +22,10 @@ The following tutorials will help you learn how to use backend tools to boost pe
 .. container:: cards
 
   .. card::
-     :title: MKL-DNN
-     :link: mkldnn/index.html
+     :title: oneDNN
+     :link: dnnl/index.html
 
-     How to get the most from your CPU by using Intel's MKL-DNN.
+     How to get the most from your CPU by using oneDNN.
 
   .. card::
      :title: TVM
@@ -50,7 +50,7 @@ The following tutorials will help you learn how to use backend tools to boost pe
    :hidden:
    :maxdepth: 1
 
-   mkldnn/index
+   dnnl/index
    tvm
    profiler
    amp
diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md
index f935e46..e560194 100644
--- a/docs/python_docs/python/tutorials/performance/backend/profiler.md
+++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md
@@ -211,11 +211,11 @@ Let's zoom in to check the time taken by operators
 The above picture visualizes the sequence in which the operators were executed and the time taken by each operator.
 
 ### Profiling ONEDNN Operators
-Reagrding ONEDNN operators, the library has already provided the internal profiling tool. Firstly, you need set `MKLDNN_VERBOSE=1` to enable internal profiler.
+Regarding oneDNN operators, the library already provides an internal profiling tool. First, set `DNNL_VERBOSE=1` to enable the internal profiler.
 
-`$ MKLDNN_VERBOSE=1 python my_script.py > mkldnn_verbose.log`
+`$ DNNL_VERBOSE=1 python my_script.py > dnnl_verbose.log`
 
-Now, the detailed profiling insights of each ONEDNN prmitive are saved into `mkldnn_verbose.log` (like below).
+Now, the detailed profiling insights of each oneDNN primitive are saved into `dnnl_verbose.log` (like below).
 
 ```
 dnnl_verbose,info,DNNL v1.1.2 (commit cb2cc7ac17ff4e2ef50805c7048d33256d82be4d)
@@ -225,13 +225,13 @@ dnnl_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::b
 
 For example, if you want to calculate the total executing time of `convolution` primitive, you can just run:
 
-`$ cat mkldnn_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
+`$ cat dnnl_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
 
-Moreover, you can set `MKLDNN_VERBOSE=2` to collect both creating and executing time of each primitive.
+Moreover, you can set `DNNL_VERBOSE=2` to collect both creating and executing time of each primitive.
 
-`$ cat mkldnn_verbose.log | grep "create,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
+`$ cat dnnl_verbose.log | grep "create,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
 
-`$ cat mkldnn_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
+`$ cat dnnl_verbose.log | grep "exec,cpu,convolution" | awk 'BEGIN{FS=","} {SUM+=$11} END {print SUM}'`
 
 
 ### Profiling Custom Operators
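
For context, a hypothetical Python equivalent of the awk one-liners above, assuming the DNNL v1.x verbose format shown in the log excerpt (the time is the 11th comma-separated field):

```
def total_time_ms(log_path, phase='exec', primitive='convolution'):
    """Sum the time column (field 11, as in the awk commands) for matching lines."""
    total = 0.0
    with open(log_path) as log:
        for line in log:
            fields = line.strip().split(',')
            if len(fields) >= 11 and fields[1] == phase and fields[3] == primitive:
                total += float(fields[10])
    return total

print(total_time_ms('dnnl_verbose.log'))
```
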
diff --git a/docs/python_docs/python/tutorials/performance/index.rst b/docs/python_docs/python/tutorials/performance/index.rst
index 43c548e..f4491db 100644
--- a/docs/python_docs/python/tutorials/performance/index.rst
+++ b/docs/python_docs/python/tutorials/performance/index.rst
@@ -76,10 +76,10 @@ Accelerated Backend
    ..
       TBD Content
       .. card::
-         :title: MKL-DNN
-         :link: backend/mkldnn/mkldnn_readme
+         :title: oneDNN
+         :link: backend/dnnl/dnnl_readme
 
-         How to get the most from your CPU by using Intel's MKL-DNN.
+         How to get the most from your CPU by using oneDNN.
 
       .. card::
          :title: TVM
diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md
index a4b4915..eed6cf3 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -375,9 +375,9 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operator.
 
 * MXNET_SUBGRAPH_BACKEND
-  - Values: String ```(default="MKLDNN")``` if ONEDNN is avaliable, otherwise ```(default="")```
+  - Values: String ```(default="ONEDNN")``` if oneDNN is available, otherwise ```(default="")```
   - This variable controls the subgraph partitioning in MXNet.
-  - This variable is used to perform ONEDNN FP32 operator fusion and quantization. Please refer to the [ONEDNN operator list](https://github.com/apache/incubator-mxnet/blob/v1.5.x/docs/tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes.
+  - This variable is used to perform oneDNN FP32 operator fusion and quantization. Please refer to the [oneDNN operator list](https://github.com/apache/incubator-mxnet/blob/v1.5.x/docs/tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes.
   - Set ```MXNET_SUBGRAPH_BACKEND=NONE``` to disable subgraph backend.
 
 * MXNET_SAFE_ACCUMULATION
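
For context, a small hypothetical sketch of toggling the renamed subgraph backend from Python instead of the shell; the value must be set before `mxnet` is imported, and `NONE` disables the backend entirely:

```
import os

# The default on oneDNN-enabled builds is now "ONEDNN".
os.environ['MXNET_SUBGRAPH_BACKEND'] = 'NONE'      # disable the subgraph backend
# os.environ['MXNET_SUBGRAPH_BACKEND'] = 'ONEDNN'  # or select the oneDNN backend explicitly

import mxnet as mx
```
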
diff --git a/docs/static_site/src/pages/api/faq/perf.md b/docs/static_site/src/pages/api/faq/perf.md
index 083ef69..0759afc 100644
--- a/docs/static_site/src/pages/api/faq/perf.md
+++ b/docs/static_site/src/pages/api/faq/perf.md
@@ -58,7 +58,7 @@ We also find that setting the following environment variables can help:
 | :-------- | :---------- |
 | `OMP_NUM_THREADS`            | Suggested value: `vCPUs / 2` in which `vCPUs` is the number of virtual CPUs. For more information, please see the guide for [setting the number of threads using an OpenMP environment variable](https://software.intel.com/en-us/mkl-windows-developer-guide-setting-the-number-of-threads-using-an-openmp-environment-variable) |
 | `KMP_AFFINITY`               | Suggested value: `granularity=fine,compact,1,0`.  For more information, please see the guide for [Thread Affinity Interface (Linux* and Windows*)](https://software.intel.com/en-us/node/522691). |
-| `MXNET_SUBGRAPH_BACKEND` | Set to ONEDNN to enable the [subgraph feature](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) for better performance. For more information please see [Build/Install MXNet with ONEDNN](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_readme.html)|
+| `MXNET_SUBGRAPH_BACKEND` | Set to ONEDNN to enable the [subgraph feature](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN) for better performance. For more information please see [Build/Install MXNet with oneDNN](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/dnnl/dnnl_readme.html)|
 
 Note that _MXNet_ treats all CPUs on a single machine as a single device.
 So whether you specify `cpu(0)` or `cpu()`, _MXNet_ will use all CPU cores on the machine.
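
For context, a hypothetical Python sketch applying the suggested settings from the table above before importing MXNet; the `vCPUs / 2` heuristic and the `KMP_AFFINITY` value are taken directly from that table:

```
import multiprocessing
import os

vcpus = multiprocessing.cpu_count()
os.environ['OMP_NUM_THREADS'] = str(max(1, vcpus // 2))      # suggested: vCPUs / 2
os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
os.environ['MXNET_SUBGRAPH_BACKEND'] = 'ONEDNN'              # enable subgraph fusion

import mxnet as mx  # import after the environment is configured
```
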
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 12b083c..e374523 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -541,7 +541,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
 
 
 #if MXNET_USE_ONEDNN == 1 || MXNET_USE_INTGEMM == 1
-constexpr size_t kMKLDNNAlign = 64;
+constexpr size_t kDNNLAlign = 64;
 #endif
 
 }  // namespace mxnet
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 25c5850..5e6af4d 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -37,7 +37,7 @@
 #include <string>
 #include <vector>
 #if MXNET_USE_ONEDNN == 1
-#include <mkldnn.hpp>
+#include <dnnl.hpp>
 #endif
 #include "./base.h"
 #include "./engine.h"
@@ -73,7 +73,7 @@ enum NDArrayFormatErr {
   kRSPIdxErr,     // indices error for row sparse
 };
 
-class MKLDNNMemory;
+class DNNLMemory;
 
 /*!
  * \brief ndarray interface
@@ -217,7 +217,7 @@ class NDArray {
   /*
    * This indicates whether an array is a view of another array (created by
    * reshape or slice). If an array is a view and the data is stored in
-   * MKLDNN format, we need to convert the data to the default format when
+   * DNNL format, we need to convert the data to the default format when
    * data in the view is accessed.
    */
   inline bool IsView() const {
@@ -729,20 +729,20 @@ class NDArray {
 
 #if MXNET_USE_ONEDNN == 1
   /*
-   * Create NDArray from mkldnn memory.
-   * mkldnn_mem The mkldnn memory to be managed.
+   * Create NDArray from dnnl memory.
+   * dnnl_mem The dnnl memory to be managed.
    */
-  explicit NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem);
+  explicit NDArray(const std::shared_ptr<dnnl::memory>& dnnl_mem);
   /*
-   * Create NDArray from mkldnn memory descriptor.
-   * mem_pd The mkldnn memory descriptor to be created.
+   * Create NDArray from dnnl memory descriptor.
+   * mem_pd The dnnl memory descriptor to be created.
    */
-  explicit NDArray(const mkldnn::memory::desc& md);
+  explicit NDArray(const dnnl::memory::desc& md);
   /*
-   * Test if the data is stored in one of special MKLDNN format.
+   * Test if the data is stored in one of special DNNL format.
    */
-  bool IsMKLDNNData() const {
-    return ptr_->IsMKLDNN();
+  bool IsDNNLData() const {
+    return ptr_->IsDNNL();
   }
   /*
    * Test if the data is stored in one of default MXNet formats.
@@ -751,37 +751,37 @@ class NDArray {
     return ptr_->IsDefault();
   }
   /*
-   * All functions below return a raw pointer to mkldnn memory. Actually there
-   * is a shared pointer that hold the memory either in NDArray or in MKLDNN
+   * All functions below return a raw pointer to dnnl memory. Actually there
+   * is a shared pointer that hold the memory either in NDArray or in DNNL
    * stream. As long as we call these functions inside an operator, the return
    * memory is always valid.
    */
 
   /*
-   * This function returns mkldnn::memory with the default primitive_desc.
+   * This function returns dnnl::memory with the default primitive_desc.
    */
-  const mkldnn::memory* GetMKLDNNData() const;
+  const dnnl::memory* GetDNNLData() const;
   /*
-   * This function returns mkldnn::memory with the given primitive_desc
+   * This function returns dnnl::memory with the given primitive_desc
    * as long as the array size meets the required size in the given primitive_desc.
    */
-  const mkldnn::memory* GetMKLDNNData(const mkldnn::memory::desc& md) const;
+  const dnnl::memory* GetDNNLData(const dnnl::memory::desc& md) const;
   /*
-   * This function returns mkldnn::memory with the given primitive_desc.
-   * The returned mkldnn::memory will have the same physical layout as
+   * This function returns dnnl::memory with the given primitive_desc.
+   * The returned dnnl::memory will have the same physical layout as
    * the given primitive_desc.
    */
-  const mkldnn::memory* GetMKLDNNDataReorder(const mkldnn::memory::desc& md) const;
+  const dnnl::memory* GetDNNLDataReorder(const dnnl::memory::desc& md) const;
 
   /*
-   * This function copies data from mkldnn memory.
+   * This function copies data from dnnl memory.
    */
-  void CopyFrom(const mkldnn::memory& mem);
+  void CopyFrom(const dnnl::memory& mem);
   /*
-   * This function allocates memory for array and creates mkldnn memory
+   * This function allocates memory for array and creates dnnl memory
    * with the specified format.
    */
-  mkldnn::memory* CreateMKLDNNData(const mkldnn::memory::desc& md);
+  dnnl::memory* CreateDNNLData(const dnnl::memory::desc& md);
 
   /*
    * These are the async version of the methods above.
@@ -789,7 +789,7 @@ class NDArray {
    * the array are complete.
    */
   void Reorder2DefaultAsync() const;
-  void MKLDNNDataReorderAsync(const mkldnn::memory::desc& md) const;
+  void DNNLDataReorderAsync(const dnnl::memory::desc& md) const;
 
   /*
    * This creates a new NDArray with the reordered data.
@@ -803,7 +803,7 @@ class NDArray {
    */
   NDArray Reorder2DefaultFloatFormat() const;
 
-  void InvalidateMKLDNNData();
+  void InvalidateDNNLData();
 
   /*
    * This function is used inside operators to reshape an array.
@@ -815,12 +815,12 @@ class NDArray {
    * which can be expensive.
    * It's used by FullyConnected right now.
    */
-  NDArray MKLDNNDataReshape(const mxnet::TShape& shape) const;
+  NDArray DNNLDataReshape(const mxnet::TShape& shape) const;
 
   /*!
-   * \ Fix mkldnn memory descriptor mismatch from NDArray.
+   * \ Fix dnnl memory descriptor mismatch from NDArray.
    */
-  void UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc);
+  void UpdateDNNLMemDesc(const dnnl::memory::desc& desc);
 #endif
 
   /*!
@@ -857,9 +857,9 @@ class NDArray {
     std::vector<Storage::Handle> aux_handles;
 
 #if MXNET_USE_ONEDNN == 1
-    /*! This is created when data is stored in MKLDNN format.
+    /*! This is created when data is stored in DNNL format.
      */
-    std::shared_ptr<MKLDNNMemory> mkl_mem_;
+    std::shared_ptr<DNNLMemory> dnnl_mem_;
 #endif
     /*! \brief variable from engine */
     Engine::VarHandle var;
@@ -1035,7 +1035,7 @@ class NDArray {
       if (delay_alloc) {
         Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_ONEDNN == 1
-        mkl_mem_ = nullptr;
+        dnnl_mem_ = nullptr;
 #endif
         delay_alloc = false;
       }
@@ -1051,7 +1051,7 @@ class NDArray {
         shandle.size = dbytes;
         Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_ONEDNN == 1
-        mkl_mem_ = nullptr;
+        dnnl_mem_ = nullptr;
 #endif
         delay_alloc = false;
       } else if (shandle.size < dbytes) {
@@ -1061,7 +1061,7 @@ class NDArray {
         shandle.size = dbytes;
         Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_ONEDNN == 1
-        mkl_mem_ = nullptr;
+        dnnl_mem_ = nullptr;
 #endif
       }
     }
@@ -1099,14 +1099,14 @@ class NDArray {
 
 #if MXNET_USE_ONEDNN == 1
     // Have MKL memory reference to the data in the default storage
-    // or create memory for MKLDNN.
+    // or create memory for DNNL.
     void SetMKLMem(const mxnet::TShape& shape, int dtype);
-    // If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
+    // If the data is stored in DNNL layout, we reorder data in dnnl_mem_ and
     // save the result in shandle.
     void Reorder2Default();
     // Reroder data to a specified layout.
-    void MKLDNNDataReorder(const mkldnn::memory::desc& md);
-    bool IsMKLDNN() const;
+    void DNNLDataReorder(const dnnl::memory::desc& md);
+    bool IsDNNL() const;
     bool IsDefault() const;
 #endif
 
diff --git a/include/onednn/mkldnn.h b/include/onednn/mkldnn.h
deleted file mode 120000
index ef19407..0000000
--- a/include/onednn/mkldnn.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn.h
\ No newline at end of file
diff --git a/include/onednn/mkldnn.hpp b/include/onednn/mkldnn.hpp
deleted file mode 120000
index e7f56e9..0000000
--- a/include/onednn/mkldnn.hpp
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn.hpp
\ No newline at end of file
diff --git a/include/onednn/mkldnn_config.h b/include/onednn/mkldnn_config.h
deleted file mode 120000
index 714a586..0000000
--- a/include/onednn/mkldnn_config.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn_config.h
\ No newline at end of file
diff --git a/include/onednn/mkldnn_debug.h b/include/onednn/mkldnn_debug.h
deleted file mode 120000
index ca0e6b9..0000000
--- a/include/onednn/mkldnn_debug.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn_debug.h
\ No newline at end of file
diff --git a/include/onednn/mkldnn_dnnl_mangling.h b/include/onednn/mkldnn_dnnl_mangling.h
deleted file mode 120000
index 67bf8d0..0000000
--- a/include/onednn/mkldnn_dnnl_mangling.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn_dnnl_mangling.h
\ No newline at end of file
diff --git a/include/onednn/mkldnn_types.h b/include/onednn/mkldnn_types.h
deleted file mode 120000
index 334078b..0000000
--- a/include/onednn/mkldnn_types.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn_types.h
\ No newline at end of file
diff --git a/include/onednn/mkldnn_version.h b/include/onednn/mkldnn_version.h
deleted file mode 120000
index ed35758..0000000
--- a/include/onednn/mkldnn_version.h
+++ /dev/null
@@ -1 +0,0 @@
-../../3rdparty/onednn/include/mkldnn_version.h
\ No newline at end of file
diff --git a/python/mxnet/amp/lists/symbol_bf16.py b/python/mxnet/amp/lists/symbol_bf16.py
index 2990429..86f5b0a 100644
--- a/python/mxnet/amp/lists/symbol_bf16.py
+++ b/python/mxnet/amp/lists/symbol_bf16.py
@@ -360,8 +360,8 @@ FP32_FUNCS = [
     'uniform',
     'unravel_index',
     'zeros_like',
-    '_sg_mkldnn_conv',
-    '_sg_mkldnn_fully_connected',
+    '_sg_onednn_conv',
+    '_sg_onednn_fully_connected',
     'broadcast_mul',
     'Convolution_v1',
     'IdentityAttachKLSparseReg',
diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py
index 009586e..b561b33 100644
--- a/python/mxnet/amp/lists/symbol_fp16.py
+++ b/python/mxnet/amp/lists/symbol_fp16.py
@@ -611,10 +611,10 @@ FP32_FUNCS = [
 
 if Features().is_enabled('ONEDNN'):
     FP32_FUNCS.extend([
-        '_sg_mkldnn_conv',
-        '_sg_mkldnn_fully_connected',
-        '_sg_mkldnn_selfatt_qk',
-        '_sg_mkldnn_selfatt_valatt',
+        '_sg_onednn_conv',
+        '_sg_onednn_fully_connected',
+        '_sg_onednn_selfatt_qk',
+        '_sg_onednn_selfatt_valatt',
     ])
 
 # Functions that have to be cast to FP32 only for
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index b7ff517..4444c4b 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -529,14 +529,13 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',),
 
     return qsym, qarg_params, aux_params
 
-
-def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',),
+def quantize_model_onednn(sym, arg_params, aux_params, data_names=('data',),
                           ctx=cpu(), excluded_sym_names=None, excluded_op_names=None,
                           calib_mode='entropy', calib_data=None, num_calib_batches=None,
                           quantized_dtype='int8', quantize_mode='smart',
                           quantize_granularity='tensor-wise', logger=None):
     """User-level API for generating a fusion + quantized model from a FP32 model
-    w/ or w/o calibration with Intel MKL-DNN.
+    w/ or w/o calibration with oneDNN.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
 
@@ -555,9 +554,9 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',),
         raise ValueError('currently only supports single ctx, while received %s' % str(ctx))
     if ctx.device_type != 'cpu':
         raise ValueError(
-            'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend')
+            'quantize_model_onednn only support Intel cpu platform with oneDNN Backend')
 
-    sym = sym.optimize_for(backend='MKLDNN_QUANTIZE')
+    sym = sym.optimize_for(backend='ONEDNN_QUANTIZE')
 
     qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
                                                    data_names=data_names, ctx=ctx,
@@ -568,7 +567,7 @@ def quantize_model_mkldnn(sym, arg_params, aux_params, data_names=('data',),
                                                    quantized_dtype=quantized_dtype, quantize_mode=quantize_mode,
                                                    quantize_granularity=quantize_granularity, logger=logger)
 
-    qsym = qsym.optimize_for(backend='MKLDNN_QUANTIZE')
+    qsym = qsym.optimize_for(backend='ONEDNN_QUANTIZE')
 
     return qsym, qarg_params, aux_params
 
@@ -824,7 +823,7 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize
 
     if ctx != mx.cpu():
         raise ValueError('Quantization currently supports only CPU context')
-    backend = 'MKLDNN_QUANTIZE'
+    backend = 'ONEDNN_QUANTIZE'
 
     network.hybridize(static_alloc=False, static_shape=False)
     data_types = None
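
For context, a hypothetical usage sketch of the renamed `quantize_model_onednn` API from the hunk above; the tiny model and random parameters below are placeholders standing in for a real pre-trained network, not part of this commit:

```
import mxnet as mx
from mxnet.contrib.quantization import quantize_model_onednn

# A tiny FP32 model standing in for a real pre-trained network.
data = mx.sym.Variable('data')
sym  = mx.sym.softmax(mx.sym.FullyConnected(data, num_hidden=10, name='fc'))
arg_params = {'fc_weight': mx.nd.random.uniform(shape=(10, 64)),
              'fc_bias':   mx.nd.zeros((10,))}
aux_params = {}

qsym, qarg_params, qaux_params = quantize_model_onednn(
    sym, arg_params, aux_params,
    data_names=('data',),
    ctx=mx.cpu(),           # the oneDNN quantization path is CPU-only
    calib_mode='none',      # no calibration in this sketch
    quantized_dtype='int8',
    quantize_mode='smart')
```
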
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index f73cc18..edd2e55 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -163,7 +163,7 @@ void CustomFComputeDispatcher(const std::string op_name,
   std::vector<size_t> in_verIDs, out_verIDs;
   std::vector<const char*> in_dev_type, out_dev_type;
   std::vector<int> in_dev_id, out_dev_id;
-  std::vector<NDArray> conv_mkl;  // converted NDArrays from MKLDNN format
+  std::vector<NDArray> conv_mkl;  // converted NDArrays from DNNL format
 
   // Extra data for sparse inputs and outputs.
   std::vector<int> in_stypes(inputs.size(), 0), out_stypes(outputs.size(), 0);
@@ -176,9 +176,9 @@ void CustomFComputeDispatcher(const std::string op_name,
   for (size_t i = 0; i < inputs.size(); i++) {
     NDArray const* in_nd = &(inputs[i]);
 #if MXNET_USE_ONEDNN == 1
-    // reorder data if in MKLDNN format
-    if (in_nd->IsMKLDNNData()) {
-      // convert from MKLDNN
+    // reorder data if in DNNL format
+    if (in_nd->IsDNNLData()) {
+      // convert from DNNL
       conv_mkl.push_back(in_nd->Reorder2Default());
       in_nd = &(conv_mkl.back());
     }
@@ -1642,8 +1642,8 @@ void registerPasses(void* lib,
           const NDArray& in_arg = *(in_args_ptr[i]);
 
 #if MXNET_USE_ONEDNN == 1
-          // reorder data if in MKLDNN format
-          if (in_arg.IsMKLDNNData()) {
+          // reorder data if in DNNL format
+          if (in_arg.IsDNNLData()) {
             in_arg.Reorder2DefaultAsync();
             in_arg.WaitToRead();
           }
@@ -1668,8 +1668,8 @@ void registerPasses(void* lib,
           const auto& in_aux = *(in_aux_ptr[i]);
 
 #if MXNET_USE_ONEDNN == 1
-          // reorder data if in MKLDNN format
-          if (in_aux.IsMKLDNNData()) {
+          // reorder data if in DNNL format
+          if (in_aux.IsDNNLData()) {
             in_aux.Reorder2DefaultAsync();
             in_aux.WaitToRead();
           }
@@ -2557,7 +2557,7 @@ int MXNDArrayGetData(NDArrayHandle handle, void** out_pdata) {
   API_BEGIN();
   NDArray* arr = static_cast<NDArray*>(handle);
 #if MXNET_USE_ONEDNN == 1
-  if (arr->IsMKLDNNData()) {
+  if (arr->IsDNNLData()) {
     arr->Reorder2DefaultAsync();
     arr->WaitToRead();
   }
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index ec2aa7c..21a9713 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -94,7 +94,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
     const auto& nd = src[i];
 
 #if MXNET_USE_ONEDNN == 1
-    if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
+    if (req->at(i) == kWriteInplace && nd.IsDNNLData())
       // If it's write inplace and the output array doesn't use the default
       // layout, we'll generate a temporary output array below, which means
       // the input array and the output array are no longer the same array.
@@ -108,7 +108,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
       if (bufs != nullptr) {
         temp = bufs->at(i);
       } else if (kAddTo == req->at(i)) {
-        temp = nd.IsMKLDNNData() ? nd.Reorder2Default() : nd;
+        temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd;
       } else {
         temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
       }
diff --git a/src/common/utils.h b/src/common/utils.h
index 710cc61..c62fafa 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -49,7 +49,7 @@
 
 #include "../operator/mxnet_op.h"
 #if MXNET_USE_ONEDNN == 1
-#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../operator/nn/dnnl/dnnl_base-inl.h"
 #endif
 
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
@@ -513,11 +513,11 @@ inline void LogStorageFallback(const nnvm::NodeAttrs& attrs,
   os << "\nStorage type fallback detected:\n" << op_str << warning;
   LogOnce(os.str());
 #if MXNET_USE_ONEDNN == 1
-  if (!MKLDNNEnvSet())
+  if (!DNNLEnvSet())
     common::LogOnce(
         "MXNET_ONEDNN_ENABLED flag is off. "
         "You can re-enable by setting MXNET_ONEDNN_ENABLED=1");
-  if (GetMKLDNNCacheSize() != -1)
+  if (GetDNNLCacheSize() != -1)
     common::LogOnce(
         "MXNET_ONEDNN_CACHE_NUM is set."
         "Should only be set if "
diff --git a/src/imperative/attach_op_execs_pass.cc b/src/imperative/attach_op_execs_pass.cc
index a9402b7..4a8c51d 100644
--- a/src/imperative/attach_op_execs_pass.cc
+++ b/src/imperative/attach_op_execs_pass.cc
@@ -36,10 +36,10 @@ namespace mxnet {
 namespace exec {
 
 #if MXNET_USE_ONEDNN == 1
-#define CREATE_DEFAULT_INPUTS_MKLDNN(in_array, in_array_fallback, attrs) \
+#define CREATE_DEFAULT_INPUTS_DNNL(in_array, in_array_fallback, attrs) \
   CREATE_DEFAULT_INPUTS(true, attrs, CreateDefaultInputs(in_array, in_array_fallback))
 #else
-#define CREATE_DEFAULT_INPUTS_MKLDNN(in_array, in_array_fallback, attrs)  // empty macro
+#define CREATE_DEFAULT_INPUTS_DNNL(in_array, in_array_fallback, attrs)  // empty macro
 #endif
 
 // abstract OpExecutor which provides storage fallback procedure on
@@ -168,7 +168,7 @@ class StatefulComputeExExecutor : public OpExecutor {
     op_ctx.run_ctx = rctx;
     INVALIDATE_OUTPUTS(out_array, req);
     std::vector<NDArray>* pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
+    CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_);
     fcompute_(state_, op_ctx, *pInArray, req, out_array);
   }
 
@@ -240,7 +240,7 @@ class FComputeExExecutor : public OpExecutor {
     op_ctx.run_ctx = rctx;
     INVALIDATE_OUTPUTS(out_array, req);
     std::vector<NDArray>* pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
+    CREATE_DEFAULT_INPUTS_DNNL(in_array, pInArray = &in_array_fallback, attrs_);
     fcompute_(attrs_, op_ctx, *pInArray, req, out_array);
   }
 
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 5944b0a..7d506fa 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -16,20 +16,23 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+// The first two includes below need to be in unalphabetical order for the miscellaneous CI check to pass.
 #include <mxnet/operator.h>
 #include <mxnet/imperative.h>
 #include <nnvm/pass_functions.h>
-#include <utility>
+
 #include <algorithm>
-#include <vector>
 #include <map>
 #include <string>
-#include "./exec_pass.h"
+#include <utility>
+#include <vector>
+
 #include "../c_api/c_api_common.h"
-#include "../common/utils.h"
 #include "../common/exec_utils.h"
-#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../common/utils.h"
+#include "../operator/nn/dnnl/dnnl_base-inl.h"
 #include "../operator/operator_common.h"
+#include "./exec_pass.h"
 
 #ifndef MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_
 #define MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_
@@ -51,7 +54,7 @@ void InvalidateOutputs(const std::vector<T>* pArrs, const std::vector<OpReqType>
   auto arrs = *pArrs;
   for (size_t i = 0; i < arrs.size(); i++) {
     if (reqs[i] == kWriteTo || reqs[i] == kNullOp)
-      pntr(arrs[i])->InvalidateMKLDNNData();
+      pntr(arrs[i])->InvalidateDNNLData();
   }
 }
 
@@ -60,7 +63,7 @@ static inline void CreateDefaultInputs(const std::vector<NDArray>& arrs,
                                        std::vector<NDArray>* out_arrs) {
   out_arrs->clear();
   for (size_t i = 0; i < arrs.size(); ++i) {
-    if (arrs[i].IsMKLDNNData())
+    if (arrs[i].IsDNNLData())
       out_arrs->push_back(arrs[i].Reorder2Default());
     else
       out_arrs->push_back(arrs[i]);
@@ -77,7 +80,7 @@ static inline void CreateDefaultInputs(std::vector<NDArray>* pArrs) {
 #define INVALIDATE_OUTPUTS(outputs, req) InvalidateOutputs(&outputs, req)
 // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
 // its FCcomputeEx, but AsyncPush the copy operation to engine.
-// So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B
+// So for the case that A is holding dnnl memory, and then copy A to B, and then copy B
 // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time,
 // copying A to B may not happen, and will corrupt A's memory.
 #define INVALIDATE_OUTPUTS_COND(cond, outputs, req) \
@@ -85,12 +88,12 @@ static inline void CreateDefaultInputs(std::vector<NDArray>* pArrs) {
     INVALIDATE_OUTPUTS(outputs, req);               \
   }
 
-// add for mkldnn OP + no mkldnn OP
-#define CREATE_DEFAULT_INPUTS(cond, attrs, func_call)      \
-  if (cond) {                                              \
-    const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN"); \
-    if (!is_mkldnn.get(attrs.op, false))                   \
-      func_call;                                           \
+// add for dnnl OP + no dnnl OP
+#define CREATE_DEFAULT_INPUTS(cond, attrs, func_call)  \
+  if (cond) {                                          \
+    const auto is_dnnl = Op::GetAttr<bool>("TIsDNNL"); \
+    if (!is_dnnl.get(attrs.op, false))                 \
+      func_call;                                       \
   }
 
 #else
@@ -573,7 +576,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray*>& src,
     const auto& nd = *src[i];
 
 #if MXNET_USE_ONEDNN == 1
-    if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
+    if (req->at(i) == kWriteInplace && nd.IsDNNLData())
       // If it's write inplace and the output array doesn't use the default
       // layout, we'll generate a temporary output array below, which means
       // the input array and the output array are no longer the same array.
@@ -586,7 +589,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray*>& src,
       if (bufs != nullptr) {
         temp = bufs->at(i);
       } else if (kAddTo == req->at(i)) {
-        temp = nd.IsMKLDNNData() ? nd.Reorder2Default() : nd;
+        temp = nd.IsDNNLData() ? nd.Reorder2Default() : nd;
       } else {
         temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
       }
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 3e64a8d..d927ff8 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -25,20 +25,18 @@
 #include <dmlc/logging.h>
 #include <dmlc/memory_io.h>
 #include <dmlc/registry.h>
+#include <mshadow/tensor.h>
 #include <mxnet/base.h>
 #include <mxnet/imperative.h>
 #include <mxnet/ndarray.h>
 #include <mxnet/resource.h>
 
-#include <mshadow/tensor.h>
-
-#include "./ndarray_function.h"
-
 #include "../common/utils.h"
-#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../operator/nn/dnnl/dnnl_base-inl.h"
 #include "../operator/tensor/init_op.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../profiler/storage_profiler.h"
+#include "./ndarray_function.h"
 
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -119,7 +117,7 @@ struct ChunkMem {
   Storage::Handle h;
   std::vector<Storage::Handle> aux_h;
 #if MXNET_USE_ONEDNN == 1
-  std::shared_ptr<MKLDNNMemory> mem;
+  std::shared_ptr<DNNLMemory> mem;
 #endif
 };
 
@@ -129,8 +127,8 @@ NDArray::Chunk::~Chunk() {
   mem.h     = this->shandle;
   mem.aux_h = this->aux_handles;
 #if MXNET_USE_ONEDNN == 1
-  // We want to delete mkldnn memory after deleting the variable.
-  mem.mem = this->mkl_mem_;
+  // We want to delete dnnl memory after deleting the variable.
+  mem.mem = this->dnnl_mem_;
 #endif
   if (auto engine = engine_ref_.lock()) {
     engine->DeleteVariable(
@@ -168,7 +166,7 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) {
     shandle.size = dbytes;
     Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_ONEDNN == 1
-    mkl_mem_ = nullptr;
+    dnnl_mem_ = nullptr;
 #endif
   }
   // init shape
@@ -198,34 +196,34 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
 
 #if MXNET_USE_ONEDNN == 1
 
-NDArray::NDArray(const mkldnn::memory::desc& md)
+NDArray::NDArray(const dnnl::memory::desc& md)
     : storage_type_(kDefaultStorage), autograd_entry_(nullptr) {
   shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims);
   dtype_ = get_mxnet_type(md.data.data_type);
   ptr_   = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
   ptr_->CheckAndAlloc(md.get_size());
-  ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(md, ptr_->shandle.dptr);
+  ptr_->dnnl_mem_ = std::make_shared<DNNLMemory>(md, ptr_->shandle.dptr);
 }
 
-NDArray::NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem)
+NDArray::NDArray(const std::shared_ptr<dnnl::memory>& dnnl_mem)
     : storage_type_(kDefaultStorage), autograd_entry_(nullptr) {
-  auto mem_desc      = mkldnn_mem->get_desc();
+  auto mem_desc      = dnnl_mem->get_desc();
   shape_             = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
   dtype_             = get_mxnet_type(mem_desc.data.data_type);
   ptr_               = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
-  ptr_->shandle.dptr = mkldnn_mem->get_data_handle();
+  ptr_->shandle.dptr = dnnl_mem->get_data_handle();
   ptr_->shandle.size = mem_desc.get_size();
   ptr_->delay_alloc  = false;
-  ptr_->mkl_mem_     = std::make_shared<MKLDNNMemory>(mkldnn_mem);
+  ptr_->dnnl_mem_    = std::make_shared<DNNLMemory>(dnnl_mem);
   ptr_->static_data  = true;
 }
 
-NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const {
+NDArray NDArray::DNNLDataReshape(const mxnet::TShape& shape) const {
   CHECK(!is_none()) << "NDArray is not initialized";
   CHECK_GE(shape_.Size(), shape.Size())
       << "NDArray.Reshape: target shape size is larger current shape";
   CHECK_EQ(storage_type(), kDefaultStorage);
-  if (!IsMKLDNNData()) {
+  if (!IsDNNLData()) {
     NDArray ret = this->Detach();
     ret.shape_  = shape;
     return ret;
@@ -233,22 +231,22 @@ NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const {
     NDArray ret(shape, ctx(), true, dtype());
     // We shouldn't submit the reorder primitive here because submit will
     // be called in operators.
-    mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
-    CHECK(ptr_->IsMKLDNN());
-    mkldnn::memory::desc def_desc            = ptr_->mkl_mem_->GetDesc(format);
-    mkldnn::memory* def_mem                  = TmpMemMgr::Get()->Alloc(def_desc);
-    MKLDNNStream* stream                     = MKLDNNStream::Get();
-    std::shared_ptr<mkldnn::memory> curr_mem = ptr_->mkl_mem_->GetMem();
+    dnnl_format_tag_t format = ptr_->dnnl_mem_->GetDefaultFormat();
+    CHECK(ptr_->IsDNNL());
+    dnnl::memory::desc def_desc            = ptr_->dnnl_mem_->GetDesc(format);
+    dnnl::memory* def_mem                  = TmpMemMgr::Get()->Alloc(def_desc);
+    DNNLStream* stream                     = DNNLStream::Get();
+    std::shared_ptr<dnnl::memory> curr_mem = ptr_->dnnl_mem_->GetMem();
     stream->RegisterMem(curr_mem);
-    std::unordered_map<int, mkldnn::memory> args(
-        {{MKLDNN_ARG_FROM, *curr_mem}, {MKLDNN_ARG_TO, *def_mem}});
-    stream->RegisterPrimArgs(mkldnn::reorder(*curr_mem, *def_mem), args);
+    std::unordered_map<int, dnnl::memory> args(
+        {{DNNL_ARG_FROM, *curr_mem}, {DNNL_ARG_TO, *def_mem}});
+    stream->RegisterPrimArgs(dnnl::reorder(*curr_mem, *def_mem), args);
     // def_mem points to a memory region in the temp space. It's only valid
     // inside an operator. As such, the returned NDArray can only be valid
     // inside an operator and the shared pointer doesn't need to do anything
     // when it's destroyed.
-    auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory* mem) {});
-    ret.ptr_->mkl_mem_.reset(new MKLDNNMemory(tmp));
+    auto tmp = std::shared_ptr<dnnl::memory>(def_mem, [](dnnl::memory* mem) {});
+    ret.ptr_->dnnl_mem_.reset(new DNNLMemory(tmp));
     ret.ptr_->shandle.dptr = def_mem->get_data_handle();
     ret.ptr_->shandle.size = def_mem->get_desc().get_size();
     ret.ptr_->delay_alloc  = false;
@@ -500,185 +498,185 @@ void NDArray::set_fresh_out_grad(bool state) const {
 
 #if MXNET_USE_ONEDNN == 1
 
-bool NDArray::Chunk::IsMKLDNN() const {
+bool NDArray::Chunk::IsDNNL() const {
   if (storage_type != kDefaultStorage)
     return false;
-  if (mkl_mem_ == nullptr)
+  if (dnnl_mem_ == nullptr)
     return false;
-  return mkl_mem_->IsMKLDNN();
+  return dnnl_mem_->IsDNNL();
 }
 
 bool NDArray::Chunk::IsDefault() const {
   if (storage_type != kDefaultStorage)
     return false;
-  // If we don't have mkldnn memory yet, we just assume it's not the default
+  // If we don't have dnnl memory yet, we just assume it's in the default
   // format.
-  if (mkl_mem_ == nullptr)
+  if (dnnl_mem_ == nullptr)
     return true;
-  return !mkl_mem_->IsMKLDNN();
+  return !dnnl_mem_->IsDNNL();
 }
 
 void NDArray::Chunk::Reorder2Default() {
-  if (mkl_mem_ == nullptr)
+  if (dnnl_mem_ == nullptr)
     return;
 
   if (IsDefault())
     return;
 
-  mkldnn_format_tag_t format    = mkl_mem_->GetDefaultFormat();
-  mkldnn::memory::desc def_desc = mkl_mem_->GetDesc(format);
-  mkldnn_mem_ptr def_mem(new mkldnn::memory(def_desc, CpuEngine::Get()->get_engine()));
-  mkl_mem_->ReorderTo(def_mem.get());
+  dnnl_format_tag_t format    = dnnl_mem_->GetDefaultFormat();
+  dnnl::memory::desc def_desc = dnnl_mem_->GetDesc(format);
+  dnnl_mem_ptr def_mem(new dnnl::memory(def_desc, CpuEngine::Get()->get_engine()));
+  dnnl_mem_->ReorderTo(def_mem.get());
 
   CHECK(shandle.size >= def_desc.get_size());
   CheckAndAlloc(def_desc.get_size());
   // TODO(zhengda) We need to avoid memory copy here.
   memcpy(shandle.dptr, def_mem->get_data_handle(), def_desc.get_size());
-  mkl_mem_ = nullptr;
+  dnnl_mem_ = nullptr;
 }
 
-void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc& md) {
+void NDArray::Chunk::DNNLDataReorder(const dnnl::memory::desc& md) {
   // If the memory already uses the specified layout, don't do anything.
-  if (mkl_mem_ != nullptr && mkl_mem_->SameFormat(md))
+  if (dnnl_mem_ != nullptr && dnnl_mem_->SameFormat(md))
     return;
 
   // If the memory is default, don't do anything.
-  if (!mxnet::IsMKLDNN(md) && IsDefault())
+  if (!mxnet::IsDNNL(md) && IsDefault())
     return;
-  if (!mxnet::IsMKLDNN(md)) {
+  if (!mxnet::IsDNNL(md)) {
     // If the specified layout is default, we should use Reorder2Default.
     Reorder2Default();
     return;
   }
   auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::stream s(engine);
+  dnnl::stream s(engine);
 
-  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(md, engine));
-  std::shared_ptr<mkldnn::memory> old_mem;
+  std::shared_ptr<dnnl::memory> new_mem(new dnnl::memory(md, engine));
+  std::shared_ptr<dnnl::memory> old_mem;
   if (IsDefault()) {
-    mkldnn_format_tag_t def_format = GetDefaultFormat(md);
-    mkldnn::memory::desc def_desc  = GetDesc(md, def_format);
-    old_mem.reset(new mkldnn::memory(def_desc, engine, shandle.dptr));
+    dnnl_format_tag_t def_format = GetDefaultFormat(md);
+    dnnl::memory::desc def_desc  = GetDesc(md, def_format);
+    old_mem.reset(new dnnl::memory(def_desc, engine, shandle.dptr));
   } else {
-    old_mem = this->mkl_mem_->GetMem();
+    old_mem = this->dnnl_mem_->GetMem();
   }
   CHECK(old_mem->get_desc().data.ndims == md.data.ndims);
 
-  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
-  mkldnn::reorder(*old_mem, *new_mem).execute(s, *old_mem, *new_mem);
+  // This may be called in DNNL operators. We can't use DNNLStream here.
+  dnnl::reorder(*old_mem, *new_mem).execute(s, *old_mem, *new_mem);
 
   CHECK(shandle.size >= md.get_size());
   CheckAndAlloc(md.get_size());
   // TODO(zhengda) We need to avoid memory copy here.
   memcpy(shandle.dptr, new_mem->get_data_handle(), md.get_size());
-  mkl_mem_.reset(new MKLDNNMemory(md, shandle.dptr));
+  dnnl_mem_.reset(new DNNLMemory(md, shandle.dptr));
 }
 
 void NDArray::Chunk::SetMKLMem(const mxnet::TShape& shape, int dtype) {
   // The shape of the array and that of the MKL memory may not match.
   // For example, if the array stores parameters, the MKL memory may store data
   // in 5 dimensions while the NDArray stores data in 4 dimensions.
-  if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr && mkl_mem_->SameFormat(shape, dtype)) {
+  if (dnnl_mem_ && dnnl_mem_->GetDataHandle() == shandle.dptr &&
+      dnnl_mem_->SameFormat(shape, dtype)) {
     return;
   }
 
-  mkldnn::memory::dims dims;
-  // These are shapes supprted by MKLDNN.
+  dnnl::memory::dims dims;
+  // These are the shapes supported by DNNL.
   if (shape.ndim() >= 1 && shape.ndim() <= 6) {
     dims.resize(shape.ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape[i];
   } else {
-    LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
+    LOG(FATAL) << "DNNL doesn't support " << shape.ndim() << " dimensions";
   }
-  mkldnn::memory::format_tag layout = mkldnn::memory::format_tag::undef;
+  dnnl::memory::format_tag layout = dnnl::memory::format_tag::undef;
   switch (dims.size()) {
     case 1:
-      layout = mkldnn::memory::format_tag::a;
+      layout = dnnl::memory::format_tag::a;
       break;
     case 2:
-      layout = mkldnn::memory::format_tag::ab;
+      layout = dnnl::memory::format_tag::ab;
       break;
     case 3:
-      layout = mkldnn::memory::format_tag::abc;
+      layout = dnnl::memory::format_tag::abc;
       break;
     case 4:
-      layout = mkldnn::memory::format_tag::abcd;
+      layout = dnnl::memory::format_tag::abcd;
       break;
     case 5:
-      layout = mkldnn::memory::format_tag::abcde;
+      layout = dnnl::memory::format_tag::abcde;
       break;
     case 6:
-      layout = mkldnn::memory::format_tag::abcdef;
+      layout = dnnl::memory::format_tag::abcdef;
       break;
     default:
-      LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN";
+      LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for DNNL";
   }
-  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
+  dnnl::memory::desc data_md{dims, get_dnnl_type(dtype), layout};
   if (shandle.dptr == nullptr) {
     CHECK(delay_alloc);
     CheckAndAlloc();
   }
   CHECK(shandle.size >= data_md.get_size());
-  mkl_mem_.reset(new MKLDNNMemory(data_md, shandle.dptr));
+  dnnl_mem_.reset(new DNNLMemory(data_md, shandle.dptr));
 }
 
-const mkldnn::memory* NDArray::GetMKLDNNData(const mkldnn::memory::desc& desc) const {
+const dnnl::memory* NDArray::GetDNNLData(const dnnl::memory::desc& desc) const {
   if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
-    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    LOG(FATAL) << "The size of NDArray doesn't match the requested DNNL memory desc";
     return nullptr;
   }
-  const mkldnn::memory* mem  = GetMKLDNNData();
-  mkldnn::memory::desc desc1 = mem->get_desc();
+  const dnnl::memory* mem  = GetDNNLData();
+  dnnl::memory::desc desc1 = mem->get_desc();
   // If the MKL memory has the same format and shape as required,
   // or if both use the default format, we can return the MKL memory.
-  if (desc1 == desc || ((!mxnet::IsMKLDNN(desc1)) && (!mxnet::IsMKLDNN(desc)))) {
-    return GetMKLDNNExact(mem, desc);
+  if (desc1 == desc || ((!mxnet::IsDNNL(desc1)) && (!mxnet::IsDNNL(desc)))) {
+    return GetDNNLExact(mem, desc);
   } else {
     return nullptr;
   }
 }
 
-const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& new_desc) const {
+const dnnl::memory* NDArray::GetDNNLDataReorder(const dnnl::memory::desc& new_desc) const {
   CHECK(storage_type() == kDefaultStorage);
 
-  const mkldnn::memory* mem = GetMKLDNNData();
+  const dnnl::memory* mem = GetDNNLData();
   // If the memory descriptor matches, it's easy.
-  MKLDNNStream* stream = MKLDNNStream::Get();
+  DNNLStream* stream = DNNLStream::Get();
   if (mem->get_desc() == new_desc) {
-    return GetMKLDNNExact(mem, new_desc);
+    return GetDNNLExact(mem, new_desc);
   }
 
-  mkldnn::memory::desc old_desc = mem->get_desc();
+  dnnl::memory::desc old_desc = mem->get_desc();
   // Now we need to determine if we should reorder the memory.
   // If both use the default formats, we think we don't need to reorder.
-  if ((!mxnet::IsMKLDNN(old_desc)) && (!mxnet::IsMKLDNN(new_desc))) {
-    mkldnn_mem_ptr ret(
-        new mkldnn::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+  if ((!mxnet::IsDNNL(old_desc)) && (!mxnet::IsDNNL(new_desc))) {
+    dnnl_mem_ptr ret(
+        new dnnl::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
     stream->RegisterMem(ret);
     return ret.get();
   } else if (same_shape(old_desc, new_desc)) {
     // If they have the same shape, we can reorder data directly.
-    mkldnn::memory* ret = TmpMemMgr::Get()->Alloc(new_desc);
-    std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
-    stream->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
+    dnnl::memory* ret = TmpMemMgr::Get()->Alloc(new_desc);
+    std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, *mem}, {DNNL_ARG_TO, *ret}});
+    stream->RegisterPrimArgs(dnnl::reorder(*mem, *ret), args);
     return ret;
   } else {
     // If they have different shapes, we need to reshape the array first.
     // Since this method will only be used inside an operator, we can call
-    // MKLDNNDataReshape to reshape an array.
+    // DNNLDataReshape to reshape an array.
     mxnet::TShape required_shape(new_desc.data.ndims, -1);
     for (int i = 0; i < new_desc.data.ndims; i++)
       required_shape[i] = new_desc.data.dims[i];
-    NDArray reshaped          = MKLDNNDataReshape(required_shape);
-    const mkldnn::memory* ret = reshaped.GetMKLDNNData();
+    NDArray reshaped        = DNNLDataReshape(required_shape);
+    const dnnl::memory* ret = reshaped.GetDNNLData();
     if (ret->get_desc() == new_desc) {
-      return GetMKLDNNExact(ret, new_desc);
+      return GetDNNLExact(ret, new_desc);
     } else {
-      mkldnn::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc);
-      std::unordered_map<int, mkldnn::memory> args(
-          {{MKLDNN_ARG_FROM, *ret}, {MKLDNN_ARG_TO, *ret2}});
-      stream->RegisterPrimArgs(mkldnn::reorder(*ret, *ret2), args);
+      dnnl::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc);
+      std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, *ret}, {DNNL_ARG_TO, *ret2}});
+      stream->RegisterPrimArgs(dnnl::reorder(*ret, *ret2), args);
       return ret2;
     }
   }
@@ -687,22 +685,22 @@ const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc&
 NDArray NDArray::Reorder2Default() const {
   CHECK(storage_type() == kDefaultStorage);
 
-  if (ptr_->mkl_mem_ == nullptr)
+  if (ptr_->dnnl_mem_ == nullptr)
     return *this;
-  if (!ptr_->mkl_mem_->IsMKLDNN())
+  if (!ptr_->dnnl_mem_->IsDNNL())
     return *this;
 
-  // create new ndarray from  mkldnn layout
-  mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetDesc();
+  // create a new ndarray from the dnnl layout
+  dnnl::memory::desc from_desc = ptr_->dnnl_mem_->GetDesc();
   mxnet::TShape tshape(from_desc.data.ndims, -1);
   for (int i = 0; i < from_desc.data.ndims; i++)
     tshape[i] = from_desc.data.dims[i];
   NDArray ret(tshape, ctx(), false, dtype());
-  mkldnn_format_tag_t format    = ptr_->mkl_mem_->GetDefaultFormat();
-  mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
+  dnnl_format_tag_t format    = ptr_->dnnl_mem_->GetDefaultFormat();
+  dnnl::memory::desc def_desc = ptr_->dnnl_mem_->GetDesc(format);
   CHECK(ret.ptr_->shandle.size >= def_desc.get_size());
-  mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr);
-  ptr_->mkl_mem_->ReorderTo(&def_mem);
+  dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr);
+  ptr_->dnnl_mem_->ReorderTo(&def_mem);
   // reshape as needed
   ret.shape_       = shape_;
   ret.byte_offset_ = byte_offset_;
@@ -711,17 +709,17 @@ NDArray NDArray::Reorder2Default() const {
 }
 
 void NDArray::SelfReorder2Default() {
-  if (!IsMKLDNNData())
+  if (!IsDNNLData())
     return;
 
   CHECK(storage_type() == kDefaultStorage);
 
-  const auto mkl_mem = ptr_->mkl_mem_;
-  if (mkl_mem == nullptr || !mkl_mem->IsMKLDNN())
+  const auto dnnl_mem = ptr_->dnnl_mem_;
+  if (dnnl_mem == nullptr || !dnnl_mem->IsDNNL())
     return;
 
-  // create new ndarray from  mkldnn layout
-  mkldnn::memory::desc from_desc = mkl_mem->GetDesc();
+  // create a new ndarray from the dnnl layout
+  dnnl::memory::desc from_desc = dnnl_mem->GetDesc();
   mxnet::TShape tshape(from_desc.data.ndims, -1);
   for (int i = 0; i < from_desc.data.ndims; i++)
     tshape[i] = from_desc.data.dims[i];
@@ -730,11 +728,11 @@ void NDArray::SelfReorder2Default() {
   const auto saved_byte_offset = byte_offset_;
   this->ReInit(kDefaultStorage, tshape, ctx(), dtype(), false);
 
-  mkldnn_format_tag_t format    = mkl_mem->GetDefaultFormat();
-  mkldnn::memory::desc def_desc = mkl_mem->GetDesc(format);
+  dnnl_format_tag_t format    = dnnl_mem->GetDefaultFormat();
+  dnnl::memory::desc def_desc = dnnl_mem->GetDesc(format);
   CHECK(ptr_->shandle.size >= def_desc.get_size());
-  mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr);
-  mkl_mem->ReorderTo(&def_mem);
+  dnnl::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr);
+  dnnl_mem->ReorderTo(&def_mem);
   // reshape as needed
   shape_       = saved_shape;
   byte_offset_ = saved_byte_offset;
@@ -765,14 +763,14 @@ NDArray NDArray::Reorder2DefaultFloatFormat() const {
     return Reorder2Default();
   }
   NDArray ret(shape(), ctx(), false, mshadow::DataType<float>::kFlag);
-  auto src_mem = GetMKLDNNData();
-  auto dst_mem = ret.GetMKLDNNData();
+  auto src_mem = GetDNNLData();
+  auto dst_mem = ret.GetDNNLData();
   ReorderTo(src_mem, dst_mem);
 
   return ret;
 }
 
-void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
+void NDArray::DNNLDataReorderAsync(const dnnl::memory::desc& desc) const {
   std::vector<Engine::VarHandle> const_vars;
   std::vector<Engine::VarHandle> mutable_vars(1, this->var());
   NDArray tmp        = *this;
@@ -782,7 +780,7 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
         // MXNet will try to reuse NDArray from memory planning, so we need to ensure
         // the NDArray is still holding the original trunk data.
         if (tmp.version() == version) {
-          tmp.ptr_->MKLDNNDataReorder(desc);
+          tmp.ptr_->DNNLDataReorder(desc);
         }
         on_complete();
       },
@@ -794,120 +792,119 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
       "Reorder");
 }
 
-const mkldnn::memory* NDArray::GetMKLDNNData() const {
+const dnnl::memory* NDArray::GetDNNLData() const {
   CHECK(storage_type() == kDefaultStorage);
   const auto is_view = IsView();
-  if (IsMKLDNNData()) {
-    // If this array uses MKLDNN layout, we have to make sure it's not a view.
+  if (IsDNNLData()) {
+    // If this array uses DNNL layout, we have to make sure it's not a view.
     // Otherwise, we'll have to change the layout inside the array.
     CHECK(!is_view);
-    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-    // If this array uses MKLDNN format, we should return now. Otherwise,
-    // SetMKLMem may mess up mkl_mem_.
-    return ptr_->mkl_mem_->GetRaw();
+    DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
+    // If this array uses DNNL format, we should return now. Otherwise,
+    // SetMKLMem may mess up dnnl_mem_.
+    return ptr_->dnnl_mem_->GetRaw();
   }
 
   CheckAndAlloc();
   if (is_view) {
-    // If this is a view, we can't create a MKLDNN memory for the chunk
+    // If this is a view, we can't create a DNNL memory for the chunk
     // because we don't have the complete data type and shape information for
     // the chunk.
     void* off_addr = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_;
-    // Create the primitive desc for the new mkldnn memory.
-    mkldnn::memory::dims dims(shape().ndim());
+    // Create the primitive desc for the new dnnl memory.
+    dnnl::memory::dims dims(shape().ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape()[i];
 
-    const auto cpp_format =
-        static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(shape().ndim()));
-    mkldnn::memory::desc data_md(dims, get_mkldnn_type(dtype_), cpp_format);
-    std::shared_ptr<mkldnn::memory> ret(
-        new mkldnn::memory(data_md, CpuEngine::Get()->get_engine(), off_addr));
-    MKLDNNStream::Get()->RegisterMem(ret);
+    const auto cpp_format = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(shape().ndim()));
+    dnnl::memory::desc data_md(dims, get_dnnl_type(dtype_), cpp_format);
+    std::shared_ptr<dnnl::memory> ret(
+        new dnnl::memory(data_md, CpuEngine::Get()->get_engine(), off_addr));
+    DNNLStream::Get()->RegisterMem(ret);
     return ret.get();
   }
 
-  // If this isn't a view, we can create a MKLDNN memory and store it in the chunk
+  // If this isn't a view, we can create a DNNL memory and store it in the chunk
   ptr_->SetMKLMem(shape_, dtype_);
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-  return ptr_->mkl_mem_->GetRaw();
+  DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
+  return ptr_->dnnl_mem_->GetRaw();
 }
 
-void NDArray::InvalidateMKLDNNData() {
-  // Removing mkl_mem_ means the NDArray will store data in the default format.
-  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->IsMKLDNN())
-    ptr_->mkl_mem_ = nullptr;
+void NDArray::InvalidateDNNLData() {
+  // Removing dnnl_mem_ means the NDArray will store data in the default format.
+  if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->IsDNNL())
+    ptr_->dnnl_mem_ = nullptr;
 }
 
-void NDArray::CopyFrom(const mkldnn::memory& mem) {
+void NDArray::CopyFrom(const dnnl::memory& mem) {
   CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
-  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem)
+  if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetRaw() == &mem)
     return;
 
   CHECK(mem.get_desc().get_size() == shape().Size() * GetTypeSize(dtype_))
-      << "The size of NDArray doesn't match the requested MKLDNN memory desc";
-  // If this array uses MKLDNN layout, we have to make sure it's not a view.
+      << "The size of NDArray doesn't match the requested DNNL memory desc";
+  // If this array uses DNNL layout, we have to make sure it's not a view.
   // Otherwise, we'll have to change the layout inside the array.
 
-  if (IsMKLDNNData() && IsView())
+  if (IsDNNLData() && IsView())
     ptr_->Reorder2Default();
 
-  const mkldnn::memory* this_mem = GetMKLDNNData();
-  MKLDNNMemoryCopy(mem, this_mem);
+  const dnnl::memory* this_mem = GetDNNLData();
+  DNNLMemoryCopy(mem, this_mem);
 }
 
-mkldnn::memory* NDArray::CreateMKLDNNData(const mkldnn::memory::desc& desc) {
+dnnl::memory* NDArray::CreateDNNLData(const dnnl::memory::desc& desc) {
   if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
-    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc. "
-               << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
+    LOG(FATAL) << "The size of NDArray doesn't match the requested DNNL memory desc. "
+               << "DNNL memory requests for " << desc.get_size() << " bytes, but got "
                << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
     return nullptr;
   }
   bool isDefaultFormat = IsDefaultFormat(desc);
   if (isDefaultFormat && !IsView()) {
     ptr_->SetMKLMem(shape_, dtype_);
-    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-    return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc);
+    DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
+    return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc);
   } else if (isDefaultFormat) {
     ptr_->CheckAndAlloc();
     CHECK(ptr_->shandle.dptr);
     // When this is a view and a user wants the default layout, we can simply
-    // create a new mkldnn memory that points to the right memory.
-    std::shared_ptr<mkldnn::memory> mem(
-        new mkldnn::memory(desc,
-                           CpuEngine::Get()->get_engine(),
-                           static_cast<char*>(ptr_->shandle.dptr) + byte_offset_));
-    MKLDNNStream::Get()->RegisterMem(mem);
+    // create a new dnnl memory that points to the right memory.
+    std::shared_ptr<dnnl::memory> mem(
+        new dnnl::memory(desc,
+                         CpuEngine::Get()->get_engine(),
+                         static_cast<char*>(ptr_->shandle.dptr) + byte_offset_));
+    DNNLStream::Get()->RegisterMem(mem);
     return mem.get();
   } else if (IsView()) {
     // If this is a view and a user wants to write data to it with a special
-    // a MKLDNN format, we should reorder the data in the array and return NULL.
+    // DNNL format, we should reorder the data in the array and return NULL.
     // In this way, the user will create a new NDArray for the special format
     // and copy data back.
     ptr_->Reorder2Default();
     return nullptr;
   }
 
-  if (ptr_->mkl_mem_)
-    CHECK(ptr_->mkl_mem_->GetDataHandle() == ptr_->shandle.dptr);
-  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetDesc() == desc) {
-    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-    return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc);
+  if (ptr_->dnnl_mem_)
+    CHECK(ptr_->dnnl_mem_->GetDataHandle() == ptr_->shandle.dptr);
+  if (ptr_->dnnl_mem_ && ptr_->dnnl_mem_->GetDesc() == desc) {
+    DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
+    return GetDNNLExact(ptr_->dnnl_mem_->GetRaw(), desc);
   }
 
   CHECK(ptr_->shandle.size >= desc.get_size());
   ptr_->CheckAndAlloc(desc.get_size());
-  ptr_->mkl_mem_.reset(new MKLDNNMemory(desc, ptr_->shandle.dptr));
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-  return ptr_->mkl_mem_->GetRaw();
+  ptr_->dnnl_mem_.reset(new DNNLMemory(desc, ptr_->shandle.dptr));
+  DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
+  return ptr_->dnnl_mem_->GetRaw();
 }
 
-void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc) {
+void NDArray::UpdateDNNLMemDesc(const dnnl::memory::desc& desc) {
   auto new_desc           = desc;
-  auto this_dtype         = get_mkldnn_type(dtype());
-  new_desc.data.data_type = static_cast<mkldnn_data_type_t>(this_dtype);
-  ptr_->mkl_mem_.reset(new MKLDNNMemory(new_desc, ptr_->shandle.dptr));
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
+  auto this_dtype         = get_dnnl_type(dtype());
+  new_desc.data.data_type = static_cast<dnnl_data_type_t>(this_dtype);
+  ptr_->dnnl_mem_.reset(new DNNLMemory(new_desc, ptr_->shandle.dptr));
+  DNNLStream::Get()->RegisterMem(ptr_->dnnl_mem_->GetMem());
 }
 
 #endif
@@ -919,8 +916,8 @@ void NDArray::SetTBlob() const {
   auto stype          = storage_type();
   if (stype == kDefaultStorage) {
 #if MXNET_USE_ONEDNN == 1
-    CHECK(!IsMKLDNNData()) << "We can't generate TBlob for MKLDNN data. "
-                           << "Please use Reorder2Default() to generate a new NDArray first";
+    CHECK(!IsDNNLData()) << "We can't generate TBlob for DNNL data. "
+                         << "Please use Reorder2Default() to generate a new NDArray first";
 #endif
     dptr += byte_offset_;
   } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
@@ -1316,38 +1313,38 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
 template <typename from_xpu, typename to_xpu>
 inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
 #if MXNET_USE_ONEDNN == 1
-  // If neither is MKLDNN, we can copy data normally.
-  if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) {
+  // If neither is DNNL, we can copy data normally.
+  if (!from.IsDNNLData() && !to.IsDNNLData()) {
 #endif
     using namespace mshadow;
     CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
     TBlob tmp = to.data();
     ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp, from.ctx(), to.ctx(), ctx);
 #if MXNET_USE_ONEDNN == 1
-  } else if (SupportMKLDNN(from.dtype(), from.shape()) && SupportMKLDNN(to.dtype(), to.shape()) &&
+  } else if (SupportDNNL(from.dtype(), from.shape()) && SupportDNNL(to.dtype(), to.shape()) &&
              from.ctx().dev_mask() == cpu::kDevMask && to.ctx().dev_mask() == cpu::kDevMask) {
     // If we copy data directly, we need to make sure both NDArrays are supported
-    // by MKLDNN.
-    auto from_mem = from.GetMKLDNNData();
-    auto to_mem   = to.GetMKLDNNData();
+    // by DNNL.
+    auto from_mem = from.GetDNNLData();
+    auto to_mem   = to.GetDNNLData();
     if (from_mem->get_desc() == to_mem->get_desc()) {
       size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size());
       memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
     } else {
       const_cast<NDArray&>(to).CopyFrom(*from_mem);
-      MKLDNNStream::Get()->Submit();
+      DNNLStream::Get()->Submit();
     }
   } else {
-    // In this case, one of the NDArray isn't supported by MKLDNN, we need
-    // to convert the MKLDNN array to the default format first and copy data
+    // In this case, one of the NDArrays isn't supported by DNNL, so we need
+    // to convert the DNNL array to the default format first and copy data
     // with Copy().
     NDArray tmp_from = from;
-    if (tmp_from.IsMKLDNNData()) {
+    if (tmp_from.IsDNNLData()) {
       // TODO(zhengda) tmp_from should be cached.
       tmp_from     = NDArray(from.shape(), from.ctx(), false, from.dtype());
-      auto tmp_mem = from.GetMKLDNNData();
+      auto tmp_mem = from.GetDNNLData();
       tmp_from.CopyFrom(*tmp_mem);
-      MKLDNNStream::Get()->Submit();
+      DNNLStream::Get()->Submit();
     }
     CHECK(tmp_from.IsDefaultData());
     CHECK(to.IsDefaultData());
@@ -1896,7 +1893,7 @@ void NDArray::Save(dmlc::Stream* strm) const {
     this->WaitToRead();
     nd_cpu = *this;
 #if MXNET_USE_ONEDNN == 1
-    if (nd_cpu.IsMKLDNNData())
+    if (nd_cpu.IsDNNLData())
       nd_cpu = nd_cpu.Reorder2Default();
 #endif
     save_data = nd_cpu.data();
@@ -2346,7 +2343,7 @@ void NDArray::SyncCopyToCPU(void* data, size_t size) const {
     RunContext rctx{this->ctx(), nullptr, nullptr, false};
     NDArray src = *this;
 #if MXNET_USE_ONEDNN == 1
-    if (src.IsMKLDNNData())
+    if (src.IsDNNLData())
       src = this->Reorder2Default();
 #endif
     ndarray::Copy<cpu, cpu>(src.data(), &dst, Context::CPU(), Context::CPU(), rctx);
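
A minimal sketch (not part of this commit) of the dense CPU-to-CPU copy path that
CopyFromToDnsImpl above takes after the rename. It assumes both arrays are oneDNN-supported
CPU arrays and that the internal header below is reachable as it is from src/ndarray/;
GetDNNLData(), CopyFrom() and DNNLStream are the renamed members shown in this diff.

#include <cstring>

#include <mxnet/ndarray.h>

#include "../operator/nn/dnnl/dnnl_base-inl.h"  // DNNLStream, dnnl::memory (path as used in ndarray.cc)

static void CopyDenseCPU(const mxnet::NDArray& from, const mxnet::NDArray& to) {
  const dnnl::memory* from_mem = from.GetDNNLData();
  const dnnl::memory* to_mem   = to.GetDNNLData();
  if (from_mem->get_desc() == to_mem->get_desc()) {
    // Identical descriptors: a raw copy of the underlying buffers is enough.
    std::memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(),
                from_mem->get_desc().get_size());
  } else {
    // Different layouts: register a reorder on the destination and submit the stream.
    const_cast<mxnet::NDArray&>(to).CopyFrom(*from_mem);
    mxnet::DNNLStream::Get()->Submit();
  }
}
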
diff --git a/src/operator/contrib/batch_norm_relu.cc b/src/operator/contrib/batch_norm_relu.cc
index 93ccbd9..d223c65 100644
--- a/src/operator/contrib/batch_norm_relu.cc
+++ b/src/operator/contrib/batch_norm_relu.cc
@@ -28,7 +28,7 @@
 #include "../elemwise_op_common.h"
 #include "../operator_common.h"
 #if MXNET_USE_ONEDNN == 1
-#include "../nn/mkldnn/mkldnn_batch_norm-inl.h"
+#include "../nn/dnnl/dnnl_batch_norm-inl.h"
 #endif
 
 namespace mxnet {
@@ -130,7 +130,7 @@ static bool BatchNormWithReLUType(const nnvm::NodeAttrs& attrs,
 }
 
 #if MXNET_USE_ONEDNN == 1
-static inline bool SupportMKLDNNBNReLU(const NDArray& input, const BatchNormParam& param) {
+static inline bool SupportDNNLBNReLU(const NDArray& input, const BatchNormParam& param) {
   if (mxnet::op::batchnorm::disable_mkl)
     return false;
   const mxnet::TShape shape = input.shape();
@@ -139,7 +139,7 @@ static inline bool SupportMKLDNNBNReLU(const NDArray& input, const BatchNormPara
     return false;
   const int dtype = input.dtype();
   return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
-         SupportStorageMKLDNN(input.storage_type());
+         SupportStorageDNNL(input.storage_type());
 }
 
 void BatchNormWithReLUComputeExCPU(const nnvm::NodeAttrs& attrs,
@@ -150,15 +150,15 @@ void BatchNormWithReLUComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 5U);
   const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
   bool fuse_relu              = true;
-  if (SupportMKLDNNBNReLU(inputs[0], param)) {
+  if (SupportDNNLBNReLU(inputs[0], param)) {
     CHECK_GT(outputs.size(), 3U);
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
-      MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNL_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
+      DNNLBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
     });
     return;
   }
-  LOG(FATAL) << "BatchNormWithReLU operator only supports MKL-DNN Backend.";
+  LOG(FATAL) << "BatchNormWithReLU operator only supports DNNL Backend.";
 }
 
 void BatchNormWithReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs,
@@ -168,13 +168,13 @@ void BatchNormWithReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                        const std::vector<NDArray>& outputs) {
   const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
   bool fuse_relu              = true;
-  if (SupportMKLDNNBNReLU(inputs[0], param)) {
+  if (SupportDNNLBNReLU(inputs[0], param)) {
     CHECK_EQ(inputs.size(), 9U);
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
     return;
   }
-  LOG(FATAL) << "BatchNormWithReLU operator only supports MKL-DNN Backend.";
+  LOG(FATAL) << "BatchNormWithReLU operator only supports DNNL Backend.";
 }
 #endif
 
@@ -188,9 +188,9 @@ static inline bool BatchNormWithReLUStorageType(const nnvm::NodeAttrs& attrs,
   bool dispatched = false;
 #if MXNET_USE_ONEDNN == 1
   if (!dispatched) {
-    dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+    dispatched = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
   }
-  if (!MKLDNNEnvSet()) {
+  if (!DNNLEnvSet()) {
     *dispatch_mode = DispatchMode::kFComputeFallback;
   }
 #else
@@ -288,7 +288,7 @@ An extended operator of Batch normalization which can fuse ReLU activation.
 #endif
     .set_attr<nnvm::FGradient>("FGradient", BatchNormWithReLUGrad)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FResourceRequest>("FResourceRequest",
                                 [](const NodeAttrs& n) {
                                   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
@@ -322,7 +322,7 @@ NNVM_REGISTER_OP(_backward_contrib_BatchNormWithReLU)
                                 [](const NodeAttrs& n) {
                                   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                 })
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormWithReLUGradComputeExCPU)
 #endif
     .set_attr_parser(ParamParser<BatchNormParam>);
diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc
index 799311b..dd331ad 100644
--- a/src/operator/leaky_relu.cc
+++ b/src/operator/leaky_relu.cc
@@ -25,8 +25,8 @@
 
 #include "./leaky_relu-inl.h"
 #if MXNET_USE_ONEDNN == 1
-#include "./nn/mkldnn/mkldnn_base-inl.h"
-#include "./nn/mkldnn/mkldnn_ops-inl.h"
+#include "./nn/dnnl/dnnl_base-inl.h"
+#include "./nn/dnnl/dnnl_ops-inl.h"
 #endif  // MXNET_USE_ONEDNN == 1
 
 #include <nnvm/op_attr_types.h>
@@ -95,10 +95,10 @@ static void LeakyReLUComputeExCPU(const nnvm::NodeAttrs& attrs,
   const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
   size_t expected             = param.act_type == leakyrelu::kPReLU ? 2 : 1;
   CHECK_EQ(inputs.size(), expected);
-  if (SupportMKLDNNLeakyRelu(param, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNLeakyReluForward, attrs, ctx, inputs[0], req[0], outputs[0]);
-    MKLDNN_OPCHECK_RUN(LeakyReLUCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLLeakyRelu(param, inputs[0])) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLLeakyReluForward, attrs, ctx, inputs[0], req[0], outputs[0]);
+    DNNL_OPCHECK_RUN(LeakyReLUCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(LeakyReLUCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -112,11 +112,11 @@ void LeakyReLUGradComputeExCPU(const nnvm::NodeAttrs& attrs,
   if (inputs[0].shape().Size() == 0U)
     return;
   const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
-  if (SupportMKLDNNLeakyRelu(param, inputs[0])) {
+  if (SupportDNNLLeakyRelu(param, inputs[0])) {
     std::vector<NDArray> in_data{inputs[0], inputs[1]};
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNLeakyReluBackward, attrs, ctx, in_data, req, outputs);
-    MKLDNN_OPCHECK_RUN(LeakyReLUGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLLeakyReluBackward, attrs, ctx, in_data, req, outputs);
+    DNNL_OPCHECK_RUN(LeakyReLUGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(LeakyReLUGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -130,8 +130,8 @@ inline static bool LeakyReLUStorageType(const nnvm::NodeAttrs& attrs,
   const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
   size_t expected             = param.act_type == leakyrelu::kPReLU ? 2 : 1;
   CHECK_EQ(in_attrs->size(), expected);
-  return MKLDNNStorageType(
-      attrs, dev_mask, SupportMKLDNNLeakyRelu(param), dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(
+      attrs, dev_mask, SupportDNNLLeakyRelu(param), dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardLeakyReLUStorageType(const nnvm::NodeAttrs& attrs,
@@ -140,8 +140,8 @@ inline static bool BackwardLeakyReLUStorageType(const nnvm::NodeAttrs& attrs,
                                                 std::vector<int>* in_attrs,
                                                 std::vector<int>* out_attrs) {
   const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
-  return MKLDNNStorageType(
-      attrs, dev_mask, SupportMKLDNNLeakyRelu(param), dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(
+      attrs, dev_mask, SupportDNNLLeakyRelu(param), dispatch_mode, in_attrs, out_attrs);
 }
 #endif  // MXNET_USE_ONEDNN == 1
 
@@ -197,7 +197,7 @@ The following modified ReLU Activation functions are supported:
     .set_attr<nnvm::FInferType>("FInferType", LeakyReLUType)
     .set_attr<FCompute>("FCompute<cpu>", LeakyReLUCompute<cpu>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", LeakyReLUComputeExCPU)
 #endif
     .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_LeakyReLU"})
@@ -248,7 +248,7 @@ NNVM_REGISTER_OP(_backward_LeakyReLU)
                                 })
     .set_attr_parser(ParamParser<LeakyReLUParam>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", LeakyReLUGradComputeExCPU)
 #endif
     .set_attr<FCompute>("FCompute<cpu>", LeakyReLUGradCompute<cpu>);
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 4efe4cd..a228bf8 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -26,8 +26,8 @@
 #include "../mshadow_op.h"
 #include "../tensor/elemwise_unary_op.h"
 #if MXNET_USE_ONEDNN == 1
-#include "./mkldnn/mkldnn_base-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./dnnl/dnnl_base-inl.h"
+#include "./dnnl/dnnl_ops-inl.h"
 #endif  // MXNET_USE_ONEDNN == 1
 #include "../operator_common.h"
 #include "../../common/utils.h"
@@ -112,10 +112,10 @@ static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs,
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
-  if (SupportMKLDNNAct(param, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNActivationForward, attrs, ctx, inputs[0], req[0], outputs[0]);
-    MKLDNN_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLAct(param, inputs[0])) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLActivationForward, attrs, ctx, inputs[0], req[0], outputs[0]);
+    DNNL_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ActivationComputeImpl<cpu>, attrs, ctx, inputs, req, outputs);
@@ -128,10 +128,10 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
-  if (SupportMKLDNNAct(param, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNActivationBackward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLAct(param, inputs[0])) {
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLActivationBackward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ActivationGradComputeImpl<cpu>, attrs, ctx, inputs, req, outputs);
@@ -145,8 +145,8 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  return MKLDNNStorageType(
-      attrs, dev_mask, SupportMKLDNNAct(param), dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(
+      attrs, dev_mask, SupportDNNLAct(param), dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
@@ -156,8 +156,8 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           std::vector<int>* out_attrs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(in_attrs->size(), activation::GradNumInputs(param.act_type));
-  return MKLDNNStorageType(
-      attrs, dev_mask, SupportMKLDNNAct(param), dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(
+      attrs, dev_mask, SupportDNNLAct(param), dispatch_mode, in_attrs, out_attrs);
 }
 #endif  // MXNET_USE_ONEDNN == 1
 
@@ -186,7 +186,7 @@ The following activation functions are supported:
                                       })
     .set_attr<FCompute>("FCompute<cpu>", ActivationCompute<cpu>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", ActivationComputeExCPU)
 #endif
     .set_attr<nnvm::FGradient>("FGradient", ActivationGrad{"_backward_Activation"})
@@ -216,7 +216,7 @@ NNVM_REGISTER_OP(_backward_Activation)
 #endif
     .set_attr_parser(ParamParser<ActivationParam>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", ActivationGradComputeExCPU)
 #endif
     .set_attr<FCompute>("FCompute<cpu>", ActivationGradCompute<cpu>);
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index 5a18363..d3502b9 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -30,7 +30,7 @@
 
 #include "batch_norm-inl.h"
 #if MXNET_USE_ONEDNN == 1
-#include "./mkldnn/mkldnn_batch_norm-inl.h"
+#include "./dnnl/dnnl_batch_norm-inl.h"
 #endif
 
 namespace mxnet {
@@ -446,7 +446,7 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs,
 }
 
 #if MXNET_USE_ONEDNN == 1
-static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& param) {
+static inline bool SupportDNNLBN(const NDArray& input, const BatchNormParam& param) {
   if (mxnet::op::batchnorm::disable_mkl)
     return false;
   const mxnet::TShape shape = input.shape();
@@ -455,7 +455,7 @@ static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& p
     return false;
   const int dtype = input.dtype();
   return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
-         SupportStorageMKLDNN(input.storage_type());
+         SupportStorageDNNL(input.storage_type());
 }
 
 void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs,
@@ -466,12 +466,12 @@ void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 5U);
   const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
   bool fuse_relu              = false;
-  if (SupportMKLDNNBN(inputs[0], param)) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
-      MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
+  if (SupportDNNLBN(inputs[0], param)) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNL_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
+      DNNLBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
     });
-    MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -484,10 +484,10 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                const std::vector<NDArray>& outputs) {
   const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
   bool fuse_relu              = false;
-  if (SupportMKLDNNBN(inputs[0], param)) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
-    MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLBN(inputs[0], param)) {
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
+    DNNL_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -504,9 +504,9 @@ static inline bool BatchNormStorageType(const nnvm::NodeAttrs& attrs,
   bool dispatched = false;
 #if MXNET_USE_ONEDNN == 1
   if (!dispatched) {
-    dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+    dispatched = DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
   }
-  if (!MKLDNNEnvSet()) {
+  if (!DNNLEnvSet()) {
     *dispatch_mode = DispatchMode::kFComputeFallback;
   }
 #else
@@ -648,7 +648,7 @@ then set ``gamma`` to 1 and its gradient to 0.
 #endif
     .set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
 #endif
     .set_attr<FResourceRequest>("FResourceRequest",
                                 [](const NodeAttrs& n) {
@@ -687,7 +687,7 @@ NNVM_REGISTER_OP(_backward_BatchNorm)
                                 })
     .set_attr_parser(ParamParser<BatchNormParam>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
 #endif
     .set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
index 5bfe8cf..580183f 100644
--- a/src/operator/nn/concat.cc
+++ b/src/operator/nn/concat.cc
@@ -23,10 +23,10 @@
  * \author Bing Xu
  */
 
-#include "./concat-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
 #include "../../common/utils.h"
+#include "./concat-inl.h"
+#include "./dnnl/dnnl_base-inl.h"
+#include "./dnnl/dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -212,7 +212,7 @@ inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs,
     dispatched = dispatch_fallback(out_attrs, dispatch_mode);
   }
 #if MXNET_USE_ONEDNN == 1
-  if (!MKLDNNEnvSet())
+  if (!DNNLEnvSet())
     *dispatch_mode = DispatchMode::kFComputeFallback;
 #endif  // MXNET_USE_ONEDNN == 1
   return dispatched;
@@ -234,13 +234,13 @@ inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs,
 #endif  // MXNET_USE_ONEDNN == 1
     wanted_mode = DispatchMode::kFCompute;
 #if MXNET_USE_ONEDNN == 1
-  if (!MKLDNNEnvSet())
+  if (!DNNLEnvSet())
     wanted_mode = DispatchMode::kFComputeFallback;
 #endif  // MXNET_USE_ONEDNN == 1
   return storage_type_assign(out_attrs, mxnet::kDefaultStorage, dispatch_mode, wanted_mode);
 }
 #if MXNET_USE_ONEDNN == 1
-bool SupportMKLDNNConcat(const std::vector<NDArray>& arrs) {
+bool SupportDNNLConcat(const std::vector<NDArray>& arrs) {
   for (auto& arr : arrs) {
     if (arr.IsView())
       return false;
@@ -250,8 +250,8 @@ bool SupportMKLDNNConcat(const std::vector<NDArray>& arrs) {
     if (arr.shape().Size() == 0)
       return false;
     int ndim               = arr.shape().ndim();
-    const int mkldnn_ndims = arr.GetMKLDNNData()->get_desc().data.ndims;
-    if (!(ndim == 2 || ndim == 4) || ndim != mkldnn_ndims)
+    const int dnnl_ndims   = arr.GetDNNLData()->get_desc().data.ndims;
+    if (!(ndim == 2 || ndim == 4) || ndim != dnnl_ndims)
       return false;
   }
   return true;
@@ -271,10 +271,10 @@ static void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs,
       outputs[0].storage_type() == kCSRStorage) {
     ConcatCSRImpl<cpu>(attrs, op_ctx, inputs, req, outputs);
 #if MXNET_USE_ONEDNN == 1
-  } else if (SupportMKLDNNConcat(inputs)) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNConcatForward, attrs, op_ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
+  } else if (SupportDNNLConcat(inputs)) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLConcatForward, attrs, op_ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
   } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {
     FallBackCompute(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
 #endif  // MXNET_USE_ONEDNN == 1
@@ -289,10 +289,10 @@ static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                    const std::vector<NDArray>& inputs,
                                    const std::vector<OpReqType>& req,
                                    const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConcat(inputs)) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNConcatBackward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLConcat(inputs)) {
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLConcatBackward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -390,7 +390,7 @@ Example::
                                   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                 })
     .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
 #endif  // MXNET_USE_ONEDNN == 1
         CONCAT_FORWARD_ATTRS.set_attr<mxnet::FInferShape>("FInferShape", ConcatShape)
     .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
@@ -419,7 +419,7 @@ NNVM_REGISTER_OP(_backward_Concat)
     .set_attr<nnvm::TIsBackward>("TIsBackward", true)
     .set_attr<FInferStorageType>("FInferStorageType", BackwardConcatStorageType)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", ConcatGradComputeExCPU)
 #endif  // MXNET_USE_ONEDNN == 1
     .set_attr<FCompute>("FCompute<cpu>", ConcatGradCompute<cpu>);
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 7dcc203..0e054c0 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -27,8 +27,8 @@
 #include "../elemwise_op_common.h"
 #include "../operator_common.h"
 #if MXNET_USE_ONEDNN == 1
-#include "./mkldnn/mkldnn_base-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./dnnl/dnnl_base-inl.h"
+#include "./dnnl/dnnl_ops-inl.h"
 #endif  // MXNET_USE_ONEDNN
 
 namespace mxnet {
@@ -54,10 +54,10 @@ static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
   const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
-  if (SupportMKLDNNConv(params, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNConvolutionForward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLConv(params, inputs[0])) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLConvolutionForward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -69,10 +69,10 @@ static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                         const std::vector<OpReqType>& req,
                                         const std::vector<NDArray>& outputs) {
   const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
-  if (SupportMKLDNNConv(params, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNConvolutionBackward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLConv(params, inputs[0])) {
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLConvolutionBackward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -319,7 +319,7 @@ inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), 1);
 
-  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
@@ -333,7 +333,7 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), out_expected);
 
-  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
 }
 #endif
 
@@ -507,7 +507,7 @@ There are other options to tune the performance.
 #endif
     .set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
 #endif
     .set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
@@ -540,7 +540,7 @@ NNVM_REGISTER_OP(_backward_Convolution)
                                 })
     .set_attr_parser(ConvolutionParamParser)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
 #endif
     .set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>);
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 2c71672..86cde82 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -27,8 +27,8 @@
 #include "../operator_common.h"
 #include "../../common/utils.h"
 #if MXNET_USE_ONEDNN == 1
-#include "./mkldnn/mkldnn_base-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./dnnl/dnnl_base-inl.h"
+#include "./dnnl/dnnl_ops-inl.h"
 #endif  // MXNET_USE_ONEDNN
 
 namespace mxnet {
@@ -41,10 +41,10 @@ static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
                                       const std::vector<OpReqType>& req,
                                       const std::vector<NDArray>& outputs) {
   const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  if (SupportMKLDNNDeconv(params, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNDeconvolutionForward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLDeconv(params, inputs[0])) {
+    DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLDeconvolutionForward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -56,10 +56,10 @@ static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                           const std::vector<OpReqType>& req,
                                           const std::vector<NDArray>& outputs) {
   const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  if (SupportMKLDNNDeconv(params, inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNDeconvolutionBackward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  if (SupportDNNLDeconv(params, inputs[0])) {
+    DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLDeconvolutionBackward, attrs, ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -75,7 +75,7 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), 1);
 
-  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
@@ -89,7 +89,7 @@ inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), out_expected);
 
-  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
 }
 #endif
 
@@ -436,7 +436,7 @@ NNVM_REGISTER_OP(Deconvolution)
     .set_attr<FCompute>("FCompute<cpu>", DeconvolutionCompute<cpu>)
     .set_attr<nnvm::FGradient>("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FInferStorageType>("FInferStorageType", DeconvStorageType)
     .set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionComputeExCPU)
 #endif
@@ -464,7 +464,7 @@ NNVM_REGISTER_OP(_backward_Deconvolution)
                                 })
     .set_attr_parser(DeconvolutionParamParser)
 #if MXNET_USE_ONEDNN == 1
-    .set_attr<bool>("TIsMKLDNN", true)
+    .set_attr<bool>("TIsDNNL", true)
     .set_attr<FInferStorageType>("FInferStorageType", BackwardDeconvStorageType)
     .set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionGradComputeExCPU)
 #endif
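
Each of the concat, convolution, and deconvolution hunks above rewires the same FComputeEx dispatch shape: probe the inputs with a SupportDNNL* predicate, run the oneDNN kernel through DNNLRun with the DNNL_OPCHECK_* debug hooks around it, and otherwise fall back to the reference CPU kernel via FallBackCompute. The following is a minimal, self-contained sketch of that shape; every type and function name in it is a placeholder chosen for illustration, not MXNet's actual API.

// Hypothetical, self-contained illustration of the dispatch shape used in the
// hunks above; all names here are placeholders, not MXNet code.
#include <iostream>
#include <vector>

struct Tensor { bool dense = true; };  // stand-in for NDArray

static bool SupportDNNLFoo(const std::vector<Tensor>& inputs) {
  for (const auto& t : inputs)
    if (!t.dense) return false;        // e.g. only default storage qualifies
  return !inputs.empty();
}

static void DNNLFooForward(const std::vector<Tensor>& in, std::vector<Tensor>* out) {
  std::cout << "oneDNN path\n";
}

static void FooFallback(const std::vector<Tensor>& in, std::vector<Tensor>* out) {
  std::cout << "reference CPU fallback\n";
}

// Mirrors the *ComputeExCPU functions: try the oneDNN kernel, else fall back.
static void FooComputeExCPU(const std::vector<Tensor>& in, std::vector<Tensor>* out) {
  if (SupportDNNLFoo(in)) {
    DNNLFooForward(in, out);           // DNNLRun(...) in the real code
    return;
  }
  FooFallback(in, out);                // FallBackCompute(...) in the real code
}

int main() {
  std::vector<Tensor> in(1), out(1);
  FooComputeExCPU(in, &out);
  return 0;
}
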
diff --git a/src/operator/nn/dnnl/dnnl_act-inl.h b/src/operator/nn/dnnl/dnnl_act-inl.h
new file mode 100644
index 0000000..3c8c16b
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_act-inl.h
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_act-inl.h
+ * \brief DNNL Activation operator
+ * \author Zhiyuan Huang
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+#include <utility>
+#include <vector>
+
+#include "../../leaky_relu-inl.h"
+#include "../activation-inl.h"
+
+namespace mxnet {
+namespace op {
+
+struct DNNLActParam {
+  dnnl::algorithm alg;
+  float slope = 0.f;
+
+  bool operator==(const DNNLActParam& other) const {
+    return this->alg == other.alg && this->slope == other.slope;
+  }
+};
+
+dnnl::algorithm GetDNNLActAlgo(const ActivationParam& param);
+dnnl::algorithm GetDNNLActAlgo(const LeakyReLUParam& param);
+
+dnnl::eltwise_forward::primitive_desc GetActFwdDescImpl(const DNNLActParam& param,
+                                                        bool is_train,
+                                                        const dnnl::memory& input_mem);
+
+class DNNLActForward {
+ public:
+  const dnnl::eltwise_forward::primitive_desc fwd_pd;
+
+  DNNLActForward(const DNNLActParam& param,
+                 bool is_train,
+                 const NDArray& data,
+                 const dnnl::memory& mem)
+      : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) {
+    fwd_ = std::make_shared<dnnl::eltwise_forward>(fwd_pd);
+  }
+  const inline dnnl::eltwise_forward& GetFwd() const;
+
+ private:
+  std::shared_ptr<dnnl::eltwise_forward> fwd_;
+};
+
+typedef ParamOpSign<DNNLActParam> DNNLActSignature;
+DNNLActForward& GetActForward(const DNNLActParam& param,
+                              const OpContext& ctx,
+                              const NDArray& in_data,
+                              const dnnl::memory& in_mem);
+
+dnnl::eltwise_backward::primitive_desc GetActBwdDescImpl(const DNNLActParam& param,
+                                                         const dnnl::memory& input_mem,
+                                                         const dnnl::memory& diff_dst_memory);
+
+class DNNLActBackward {
+ public:
+  const dnnl::eltwise_backward::primitive_desc bwd_pd;
+
+  explicit DNNLActBackward(const DNNLActParam& param,
+                           const NDArray& data,
+                           const dnnl::memory& mem,
+                           const dnnl::memory& diff_dst_memory)
+      : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) {
+    bwd_prim_ = std::make_shared<dnnl::eltwise_backward>(bwd_pd);
+  }
+  const inline dnnl::eltwise_backward& GetBwd() const;
+
+ private:
+  std::shared_ptr<dnnl::eltwise_backward> bwd_prim_;
+};
+}  // namespace op
+}  // namespace mxnet
+
+namespace std {
+template <>
+struct hash<mxnet::op::DNNLActParam> {
+  size_t operator()(const mxnet::op::DNNLActParam& val) {
+    size_t ret = 0;
+    ret        = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
+    ret        = dmlc::HashCombine(ret, val.slope);
+    return ret;
+  }
+};
+}  // namespace std
+
+#endif  // MXNET_USE_ONEDNN == 1
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_ACT_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_act.cc b/src/operator/nn/dnnl/dnnl_act.cc
new file mode 100644
index 0000000..90a8fd0
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_act.cc
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_act.cc
+ * \brief
+ * \author Da Zheng
+ */
+
+#if MXNET_USE_ONEDNN == 1
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "../../operator_common.h"
+#include "./dnnl_base-inl.h"
+#include "dnnl_act-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool SupportDNNLAct(const ActivationParam& param) {
+  return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid ||
+         param.act_type == activation::kLogSigmoid || param.act_type == activation::kMish ||
+         param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh;
+}
+
+bool SupportDNNLAct(const ActivationParam& param, const NDArray& input) {
+  // DNNL Activation supports 1d, 2d, 3d, 4d and 5d data layout
+  if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
+      !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
+    return false;
+  return SupportDNNLAct(param);
+}
+
+bool SupportDNNLLeakyRelu(const LeakyReLUParam& param) {
+  return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU ||
+         param.act_type == leakyrelu::kGELU;
+}
+
+bool SupportDNNLLeakyRelu(const LeakyReLUParam& param, const NDArray& input) {
+  // DNNL Activation supports 1d, 2d, 3d, 4d and 5d data layout
+  if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
+      !(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
+    return false;
+  return SupportDNNLLeakyRelu(param);
+}
+
+bool SupportQuantizedDNNLAct(const ActivationParam& param) {
+  // TODO(zhennan): Add more activation types when dnnl supports them.
+  //                Remove this when it is identical to SupportDNNLAct.
+  return param.act_type == activation::kReLU;
+}
+
+dnnl::algorithm GetDNNLActAlgo(const ActivationParam& param) {
+  switch (param.act_type) {
+    case activation::kReLU:
+      return dnnl::algorithm::eltwise_relu;
+    case activation::kSigmoid:
+      return dnnl::algorithm::eltwise_logistic;
+    case activation::kLogSigmoid:
+      return dnnl::algorithm::eltwise_logsigmoid;
+    case activation::kMish:
+      return dnnl::algorithm::eltwise_mish;
+    case activation::kTanh:
+      return dnnl::algorithm::eltwise_tanh;
+    case activation::kSoftReLU:
+      return dnnl::algorithm::eltwise_soft_relu;
+    default:
+      LOG(FATAL) << "unknown activation type";
+      return dnnl::algorithm::eltwise_relu;
+  }
+}
+
+dnnl::algorithm GetDNNLActAlgo(const LeakyReLUParam& param) {
+  switch (param.act_type) {
+    case leakyrelu::kLeakyReLU:
+      return dnnl::algorithm::eltwise_relu;
+    case leakyrelu::kELU:
+      return dnnl::algorithm::eltwise_elu;
+    case leakyrelu::kGELU:
+      return dnnl::algorithm::eltwise_gelu_erf;
+    default:
+      LOG(FATAL) << "unknown activation type for LeakyReLU: " << param.act_type;
+      return dnnl::algorithm::eltwise_relu;
+  }
+}
+
+dnnl::eltwise_forward::primitive_desc GetActFwdDescImpl(const DNNLActParam& param,
+                                                        bool is_train,
+                                                        const dnnl::memory& input_mem) {
+  dnnl::memory::desc data_md = input_mem.get_desc();
+  auto cpu_engine            = CpuEngine::Get()->get_engine();
+  auto alg                   = param.alg;
+
+  auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring;
+  auto desc = dnnl::eltwise_forward::desc(prop, alg, data_md, param.slope);
+  return dnnl::eltwise_forward::primitive_desc(desc, cpu_engine);
+}
+
+const inline dnnl::eltwise_forward& DNNLActForward::GetFwd() const {
+  return *fwd_;
+}
+
+DNNLActForward& GetActForward(const DNNLActParam& param,
+                              const OpContext& ctx,
+                              const NDArray& in_data,
+                              const dnnl::memory& in_mem) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLActSignature, DNNLActForward, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLActSignature, DNNLActForward, OpHash> fwds;
+#endif
+  DNNLActSignature key(param);
+  key.AddSign(ctx.is_train);
+  key.AddSign(static_cast<int>(param.alg));
+  key.AddSign(param.slope);
+  key.AddSign(in_data);
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    DNNLActForward fwd(param, ctx.is_train, in_data, in_mem);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLActivationForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const NDArray& in_data,
+                           const OpReqType& req,
+                           const NDArray& out_data) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  DNNLActParam param_;
+  param_.alg               = GetDNNLActAlgo(param);
+  const NDArray& in_buffer = in_data;
+  DNNLStream* stream       = DNNLStream::Get();
+  auto input_mem           = in_buffer.GetDNNLData();
+  DNNLActForward& fwd      = GetActForward(param_, ctx, in_buffer, *input_mem);
+  auto out_mem_t           = CreateDNNLMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+  stream->RegisterPrimArgs(fwd.GetFwd(),
+                           {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST, *out_mem_t.second}});
+  CommitOutput(out_data, out_mem_t);
+  stream->Submit();
+}
+
+void DNNLLeakyReluForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const NDArray& in_data,
+                          const OpReqType& req,
+                          const NDArray& out_data) {
+  const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
+  DNNLActParam param_;
+  param_.alg   = GetDNNLActAlgo(param);
+  param_.slope = param.slope;
+
+  NDArray in_buffer  = in_data;
+  DNNLStream* stream = DNNLStream::Get();
+
+  if (in_data.IsView() && in_data.IsDNNLData())
+    in_buffer = in_data.Reorder2Default();
+
+  auto input_mem      = in_buffer.GetDNNLData();
+  DNNLActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+  auto out_mem_t      = CreateDNNLMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+  stream->RegisterPrimArgs(fwd.GetFwd(),
+                           {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST, *out_mem_t.second}});
+  CommitOutput(out_data, out_mem_t);
+  stream->Submit();
+}
+
+dnnl::eltwise_backward::primitive_desc GetActBwdDescImpl(const DNNLActParam& param,
+                                                         const dnnl::memory& input_mem,
+                                                         const dnnl::memory& diff_dst_memory) {
+  dnnl::memory::desc data_md = input_mem.get_desc();
+  dnnl::memory::desc diff_md = diff_dst_memory.get_desc();
+  auto cpu_engine            = CpuEngine::Get()->get_engine();
+  auto alg                   = param.alg;
+
+  dnnl::eltwise_forward::desc fw_desc(dnnl::prop_kind::forward_training, alg, data_md, param.slope);
+  dnnl::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
+  dnnl::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope);
+  dnnl::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);
+  return bw_pdesc;
+}
+
+const inline dnnl::eltwise_backward& DNNLActBackward::GetBwd() const {
+  return *bwd_prim_;
+}
+
+static inline DNNLActBackward& GetActBackward(const DNNLActParam& param,
+                                              const OpContext& ctx,
+                                              const NDArray& in_data,
+                                              const NDArray& out_grad,
+                                              const dnnl::memory& in_mem) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLActSignature, DNNLActBackward, OpHash> bwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLActSignature, DNNLActBackward, OpHash> bwds;
+#endif
+  DNNLActSignature key(param);
+  key.AddSign(in_data);
+  key.AddSign(out_grad);
+
+  auto it = bwds.find(key);
+  if (it == bwds.end()) {
+    DNNLActBackward bwd(param, in_data, in_mem, *out_grad.GetDNNLData());
+    it = AddToCache(&bwds, key, bwd);
+  }
+  return it->second;
+}
+
+// For backward relu activation, it's okay to pass "out_data" as "in_data" to this
+// function, since the computation only involves non-zeros.
+void DNNLActivationBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& outputs) {
+  if (req[0] == kNullOp) {
+    return;
+  }
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  // XXX: for y = relu(x), y is passed as "in_data" to Backward()
+  const bool relu           = param.act_type == activation::kReLU;
+  const NDArray& out_buffer = inputs[0];
+  const NDArray& in_buffer  = relu ? inputs[1] : inputs[2];
+  const NDArray& in_grad    = outputs[0];
+  DNNLActParam param_;
+  param_.alg = GetDNNLActAlgo(param);
+  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
+  auto diff_dst_memory = out_buffer.GetDNNLData();
+  auto input_mem       = in_buffer.GetDNNLData();
+  // We need to make sure the two inputs to eltwise_backward have the same memory
+  // descriptor. Otherwise, performance will suffer.
+  if (input_mem->get_desc() != diff_dst_memory->get_desc()) {
+    input_mem = in_buffer.GetDNNLDataReorder(diff_dst_memory->get_desc());
+  }
+
+  DNNLActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+  DNNLStream* stream   = DNNLStream::Get();
+  dnnl_args_map_t args = {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DIFF_DST, *diff_dst_memory}};
+  if (req[0] != kAddTo) {
+    // req[0] is kWriteTo or kWriteInplace
+    auto diff_src_memory = const_cast<NDArray&>(in_grad).CreateDNNLData(bwd.bwd_pd.diff_src_desc());
+    args.insert({DNNL_ARG_DIFF_SRC, *diff_src_memory});
+    stream->RegisterPrimArgs(bwd.GetBwd(), args);
+    stream->Submit();
+  } else {
+    auto diff_src_memory = CreateDNNLMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
+    args.insert({DNNL_ARG_DIFF_SRC, *diff_src_memory.second});
+    stream->RegisterPrimArgs(bwd.GetBwd(), args);
+    CommitOutput(in_grad, diff_src_memory);
+    stream->Submit();
+  }
+}
+
+void DNNLLeakyReluBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs) {
+  if (req[0] == kNullOp) {
+    return;
+  }
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  const NDArray& out_buffer = inputs[0];
+  const NDArray& in_buffer  = inputs[1];
+  const NDArray& output     = outputs[0];
+
+  const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
+  DNNLActParam param_;
+  param_.alg   = GetDNNLActAlgo(param);
+  param_.slope = param.slope;
+
+  TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]);
+  auto diff_dst_memory = out_buffer.GetDNNLData();
+  auto input_mem       = in_buffer.GetDNNLData();
+  // We need to make sure the two inputs to eltwise_backward have the same memory
+  // descriptor. Otherwise, performance will suffer.
+  if (input_mem->get_desc() != diff_dst_memory->get_desc())
+    input_mem = in_buffer.GetDNNLDataReorder(diff_dst_memory->get_desc());
+  DNNLActBackward& bwd          = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+  DNNLStream* stream            = DNNLStream::Get();
+  dnnl_output_t diff_src_memory = CreateDNNLMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
+  dnnl_args_map_t args          = {
+      {DNNL_ARG_SRC, *input_mem},
+      {DNNL_ARG_DIFF_DST, *diff_dst_memory},
+      {DNNL_ARG_DIFF_SRC, *diff_src_memory.second},
+  };
+  stream->RegisterPrimArgs(bwd.GetBwd(), args);
+  CommitOutput(output, diff_src_memory);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
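
Stripped of MXNet's NDArray plumbing, DNNLActivationForward above is the standard oneDNN eltwise flow: build an eltwise_forward::desc, turn it into a primitive_desc on the CPU engine, and execute the cached primitive on a stream with DNNL_ARG_SRC/DNNL_ARG_DST arguments. For reference, a minimal standalone program doing the same ReLU directly against oneDNN looks roughly as follows; this is a sketch assuming a oneDNN 2.x installation (the same API generation used in the code above), compiled and linked with -ldnnl, and is not part of this patch.

// Minimal standalone oneDNN (2.x API) eltwise ReLU, mirroring the primitive
// flow that DNNLActivationForward wraps: desc -> primitive_desc -> execute.
#include <iostream>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);   // analog of CpuEngine::get_engine()
  dnnl::stream strm(eng);                         // analog of DNNLStream

  // A 2x3 f32 buffer whose storage is managed by oneDNN.
  dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32,
                        dnnl::memory::format_tag::nc);
  dnnl::memory mem(md, eng);
  float* data = static_cast<float*>(mem.get_data_handle());
  for (int i = 0; i < 6; ++i)
    data[i] = static_cast<float>(i - 3);          // include some negative values

  // forward_inference corresponds to the !is_train branch in GetActFwdDescImpl.
  dnnl::eltwise_forward::desc desc(dnnl::prop_kind::forward_inference,
                                   dnnl::algorithm::eltwise_relu, md, 0.f);
  dnnl::eltwise_forward::primitive_desc pd(desc, eng);
  dnnl::eltwise_forward relu(pd);

  // In-place execution; the real operator registers args on DNNLStream instead.
  relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
  strm.wait();

  for (int i = 0; i < 6; ++i)
    std::cout << data[i] << (i == 5 ? '\n' : ' ');
  return 0;
}
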
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/dnnl/dnnl_base-inl.h
similarity index 52%
rename from src/operator/nn/mkldnn/mkldnn_base-inl.h
rename to src/operator/nn/dnnl/dnnl_base-inl.h
index cf7c9b1..d0a4871 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/dnnl/dnnl_base-inl.h
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
- * \file mkldnn_base-inl.h
+ * \file dnnl_base-inl.h
  * \brief
  * \author young.jin.kim@intel.com
  *         ashok.emani@intel.com
@@ -24,8 +24,8 @@
  *
  *******************************************************************************/
 
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 #include <algorithm>
@@ -36,13 +36,13 @@
 #include <utility>
 #include <vector>
 
-#include "mkldnn.hpp"
+#include "dnnl.hpp"
 #include "mxnet/graph_attr_types.h"
 #include "mxnet/ndarray.h"
 #include "mxnet/op_attr_types.h"
 #include "mxnet/resource.h"
 
-#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
+#define DNNL_REAL_TYPE_SWITCH(type, DType, ...)   \
   switch (type) {                                 \
     case mshadow::kFloat32: {                     \
       typedef float DType;                        \
@@ -64,7 +64,7 @@ class CpuEngine {
  public:
   static CpuEngine* Get() {
     // It's thread-safe in C++11.
-    // ensure same mkldnn engine is used across threads
+    // ensure same dnnl engine is used across threads
     static CpuEngine myInstance;
     return &myInstance;
   }
@@ -73,16 +73,16 @@ class CpuEngine {
   CpuEngine& operator=(CpuEngine const&) = delete;  // Copy assign
   CpuEngine& operator=(CpuEngine&&) = delete;       // Move assign
 
-  mkldnn::engine& get_engine() {
+  dnnl::engine& get_engine() {
     return _cpu_engine;
   }
 
  protected:
-  CpuEngine() : _cpu_engine(mkldnn::engine::kind::cpu, 0) {}
+  CpuEngine() : _cpu_engine(dnnl::engine::kind::cpu, 0) {}
   ~CpuEngine() {}
 
  private:
-  mkldnn::engine _cpu_engine;
+  dnnl::engine _cpu_engine;
 };
 
 // type enumerator
@@ -91,30 +91,30 @@ struct data_type_enum {};
 
 template <>
 struct data_type_enum<float> {
-  enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::f32) };
+  enum { type = static_cast<unsigned int>(dnnl::memory::data_type::f32) };
 };
 
 template <>
 struct data_type_enum<mshadow::bfloat::bf16_t> {
-  enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::bf16) };
+  enum { type = static_cast<unsigned int>(dnnl::memory::data_type::bf16) };
 };
 
 template <>
 struct data_type_enum<int32_t> {
-  enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::s32) };
+  enum { type = static_cast<unsigned int>(dnnl::memory::data_type::s32) };
 };
 
 template <>
 struct data_type_enum<int8_t> {
-  enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::s8) };
+  enum { type = static_cast<unsigned int>(dnnl::memory::data_type::s8) };
 };
 
 template <>
 struct data_type_enum<uint8_t> {
-  enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::u8) };
+  enum { type = static_cast<unsigned int>(dnnl::memory::data_type::u8) };
 };
 
-static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) {
+static inline bool SupportDNNLArray(int dtype, const mxnet::TShape& shape) {
   int ndim     = shape.ndim();
   bool support = ndim == 1 || ndim == 2 || ndim == 4;
   support      = support &&
@@ -123,37 +123,37 @@ static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) {
   return support;
 }
 
-static inline bool SupportStorageMKLDNN(int stype) {
+static inline bool SupportStorageDNNL(int stype) {
   return stype == kDefaultStorage;
 }
 
-static inline bool SupportMKLDNN(int dtype, const mxnet::TShape& shape) {
+static inline bool SupportDNNL(int dtype, const mxnet::TShape& shape) {
   int ndim = shape.ndim();
   if (ndim == 0 || shape.Size() == 0) {
-    // MKLDNN currently does not support 0-dim Tensor and 0-size Tensor
+    // DNNL currently does not support 0-dim Tensor and 0-size Tensor
     return false;
   }
   return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
          (ndim == 1 || ndim == 2 || ndim == 4);
 }
 
-static inline bool IsMKLDNNType(int dtype) {
+static inline bool IsDNNLType(int dtype) {
   return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 ||
          dtype == mshadow::kBfloat16;
 }
 
-static inline bool SupportMKLDNN(const NDArray& input) {
-  return SupportMKLDNN(input.dtype(), input.shape()) && SupportStorageMKLDNN(input.storage_type());
+static inline bool SupportDNNL(const NDArray& input) {
+  return SupportDNNL(input.dtype(), input.shape()) && SupportStorageDNNL(input.storage_type());
 }
 
-static inline bool MKLDNNEnvSet() {
-  static bool is_mkldnn_enabled = dmlc::GetEnv("MXNET_ONEDNN_ENABLED", true);
-  return is_mkldnn_enabled;
+static inline bool DNNLEnvSet() {
+  static bool is_dnnl_enabled = dmlc::GetEnv("MXNET_ONEDNN_ENABLED", true);
+  return is_dnnl_enabled;
 }
 
-static inline int GetMKLDNNCacheSize() {
-  static int mkldnn_cache_size = dmlc::GetEnv("MXNET_ONEDNN_CACHE_NUM", -1);
-  return mkldnn_cache_size;
+static inline int GetDNNLCacheSize() {
+  static int dnnl_cache_size = dmlc::GetEnv("MXNET_ONEDNN_CACHE_NUM", -1);
+  return dnnl_cache_size;
 }
 
 // TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size during runtime
@@ -161,8 +161,8 @@ template <typename S, typename I, typename H>
 static typename std::unordered_map<S, I, H>::iterator AddToCache(std::unordered_map<S, I, H>* cache,
                                                                  const S& key,
                                                                  const I& item) {
-  int mkldnn_cache_size = GetMKLDNNCacheSize();
-  if (mkldnn_cache_size != -1 && static_cast<int>(cache->size()) > mkldnn_cache_size)
+  int dnnl_cache_size = GetDNNLCacheSize();
+  if (dnnl_cache_size != -1 && static_cast<int>(cache->size()) > dnnl_cache_size)
     cache->erase(cache->begin());
   auto ins_return = cache->insert(std::pair<S, I>(key, item));
   CHECK(ins_return.second);
@@ -184,22 +184,20 @@ struct SoftmaxOutputParam;
 struct TransposeParam;
 struct ReshapeParam;
 struct LayerNormParam;
-bool SupportMKLDNNAct(const ActivationParam& param);
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input);
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param);
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input);
-bool SupportQuantizedMKLDNNAct(const ActivationParam& param);
-bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input);
-bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input);
-bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
-bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param,
-                             const NDArray& input,
-                             const NDArray& output);
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param);
-bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data);
-bool SupportMKLDNNBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
-bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs);
-bool SupportMKLDNNReshape(const NDArray& input, const NDArray& output);
+bool SupportDNNLAct(const ActivationParam& param);
+bool SupportDNNLAct(const ActivationParam& param, const NDArray& input);
+bool SupportDNNLLeakyRelu(const LeakyReLUParam& param);
+bool SupportDNNLLeakyRelu(const LeakyReLUParam& param, const NDArray& input);
+bool SupportQuantizedDNNLAct(const ActivationParam& param);
+bool SupportDNNLConv(const ConvolutionParam& params, const NDArray& input);
+bool SupportDNNLDeconv(const DeconvolutionParam& params, const NDArray& input);
+bool SupportDNNLSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
+bool SupportDNNLLogSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
+bool SupportDNNLSoftmaxOutput(const SoftmaxOutputParam& param);
+bool SupportDNNLTranspose(const TransposeParam& param, const NDArray& data);
+bool SupportDNNLBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
+bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs);
+bool SupportDNNLReshape(const NDArray& input, const NDArray& output);
 }  // namespace op
 
 static int GetTypeSize(int dtype) {
@@ -209,64 +207,64 @@ static int GetTypeSize(int dtype) {
 }
 
 static inline size_t GetArraySize(const NDArray& arr) {
-  if (arr.IsMKLDNNData()) {
-    return arr.GetMKLDNNData()->get_desc().get_size();
+  if (arr.IsDNNLData()) {
+    return arr.GetDNNLData()->get_desc().get_size();
   }
   return arr.shape().Size() * GetTypeSize(arr.dtype());
 }
 
-static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
+static inline dnnl::memory::data_type get_dnnl_type(int dtype) {
   switch (dtype) {
     case mshadow::kFloat32:
-      return mkldnn::memory::data_type::f32;
+      return dnnl::memory::data_type::f32;
     case mshadow::kBfloat16:
-      return mkldnn::memory::data_type::bf16;
+      return dnnl::memory::data_type::bf16;
     case mshadow::kInt32:
-      return mkldnn::memory::data_type::s32;
+      return dnnl::memory::data_type::s32;
     case mshadow::kInt8:
-      return mkldnn::memory::data_type::s8;
+      return dnnl::memory::data_type::s8;
     case mshadow::kUint8:
-      return mkldnn::memory::data_type::u8;
+      return dnnl::memory::data_type::u8;
     default:
-      LOG(FATAL) << "unknown type for MKLDNN :" << static_cast<int>(dtype);
-      return mkldnn::memory::data_type::undef;
+      LOG(FATAL) << "unknown type for DNNL :" << static_cast<int>(dtype);
+      return dnnl::memory::data_type::undef;
   }
 }
 
 template <typename T>
-static inline mkldnn::memory::data_type get_mkldnn_type() {
-  return static_cast<mkldnn::memory::data_type>(data_type_enum<T>::type);
+static inline dnnl::memory::data_type get_dnnl_type() {
+  return static_cast<dnnl::memory::data_type>(data_type_enum<T>::type);
 }
 
-static inline mkldnn_data_type_t get_mkldnn_type_t(int dtype) {
-  return static_cast<mkldnn_data_type_t>(get_mkldnn_type(dtype));
+static inline dnnl_data_type_t get_dnnl_type_t(int dtype) {
+  return static_cast<dnnl_data_type_t>(get_dnnl_type(dtype));
 }
 
 template <typename T>
-static inline mkldnn_data_type_t get_mkldnn_type_t() {
-  return static_cast<mkldnn_data_type_t>(data_type_enum<T>::type);
+static inline dnnl_data_type_t get_dnnl_type_t() {
+  return static_cast<dnnl_data_type_t>(data_type_enum<T>::type);
 }
 
-static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
-  auto mkldnn_dtype = static_cast<mkldnn::memory::data_type>(dtype);
-  switch (mkldnn_dtype) {
-    case mkldnn::memory::data_type::f32:
+static inline int get_mxnet_type(dnnl_data_type_t dtype) {
+  auto dnnl_dtype = static_cast<dnnl::memory::data_type>(dtype);
+  switch (dnnl_dtype) {
+    case dnnl::memory::data_type::f32:
       return mshadow::kFloat32;
-    case mkldnn::memory::data_type::bf16:
+    case dnnl::memory::data_type::bf16:
       return mshadow::kBfloat16;
-    case mkldnn::memory::data_type::s32:
+    case dnnl::memory::data_type::s32:
       return mshadow::kInt32;
-    case mkldnn::memory::data_type::s8:
+    case dnnl::memory::data_type::s8:
       return mshadow::kInt8;
-    case mkldnn::memory::data_type::u8:
+    case dnnl::memory::data_type::u8:
       return mshadow::kUint8;
     default:
-      LOG(FATAL) << "unknown MKLDNN type";
+      LOG(FATAL) << "unknown DNNL type";
       return mshadow::kFloat32;
   }
 }
 
-static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) {
+static inline size_t GetMemDescSize(const dnnl::memory::desc& md) {
   if (md.data.ndims == 0)
     return 0;
 
@@ -279,53 +277,53 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) {
   return ret;
 }
 
-inline static mkldnn::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) {
+inline static dnnl::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) {
   int ndim = arr.shape().ndim();
-  mkldnn::memory::dims dims(ndim);
+  dnnl::memory::dims dims(ndim);
   dtype = (dtype == -1) ? arr.dtype() : dtype;
   for (size_t i = 0; i < dims.size(); i++)
     dims[i] = arr.shape()[i];
-  return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
+  return dnnl::memory::desc{dims, get_dnnl_type(dtype), dnnl::memory::format_tag::any};
 }
 
-inline static bool ChooseBRGEMMImpl(const mkldnn::memory::dims& weight_dims, size_t batch_size) {
+inline static bool ChooseBRGEMMImpl(const dnnl::memory::dims& weight_dims, size_t batch_size) {
   // Conditions based on measurement results done on CLX8280
   // https://github.com/apache/incubator-mxnet/pull/20533
   return weight_dims[0] >= 1024 && weight_dims[1] >= 1024 && batch_size >= 16384 &&
          weight_dims[0] % 64 == 0 && weight_dims[1] % 64 == 0;
 }
 
-inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray& arr,
-                                                   size_t batch_size,
-                                                   int dtype = -1) {
+inline static dnnl::memory::desc GetFCWeightDesc(const NDArray& arr,
+                                                 size_t batch_size,
+                                                 int dtype = -1) {
   int ndim = arr.shape().ndim();
-  mkldnn::memory::dims dims(ndim);
+  dnnl::memory::dims dims(ndim);
   dtype = (dtype == -1) ? arr.dtype() : dtype;
   for (size_t i = 0; i < dims.size(); i++)
     dims[i] = arr.shape()[i];
-  auto format = mkldnn::memory::format_tag::any;
+  auto format = dnnl::memory::format_tag::any;
   // for batch 256 alexnet benchmark test
   const bool force_fc_ab_format = dmlc::GetEnv("MXNET_ONEDNN_FORCE_FC_AB_FORMAT", false);
   if (dims.size() == 2) {
     if (force_fc_ab_format || !ChooseBRGEMMImpl(dims, batch_size)) {
-      format = mkldnn::memory::format_tag::ab;
+      format = dnnl::memory::format_tag::ab;
     }
   }
 
-  return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), format};
+  return dnnl::memory::desc{dims, get_dnnl_type(dtype), format};
 }
 
-inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr,
-                                                 int num_groups,
-                                                 bool quantized = false) {
+inline static dnnl::memory::desc GetWeightDesc(const NDArray& arr,
+                                               int num_groups,
+                                               bool quantized = false) {
   int dtype = quantized ? mshadow::kInt8 : arr.dtype();
   if (num_groups == 1) {
     return GetMemDesc(arr, dtype);
   } else {
     const auto ndim = arr.shape().ndim();
     CHECK((ndim == 3) || (ndim == 4) || (ndim == 5))
-        << "MKL-DNN weight currently supports 3d or 4d or 5d layout";
-    auto tz = mkldnn::memory::dims{0};
+        << "DNNL weight currently supports 3d or 4d or 5d layout";
+    auto tz = dnnl::memory::dims{0};
     int N = 0, C = 1, H = 2, W = 3;
     int D = -1;
     if (ndim == 5) {
@@ -335,39 +333,39 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr,
     }
     switch (ndim) {
       case 3:
-        tz = mkldnn::memory::dims{
+        tz = dnnl::memory::dims{
             num_groups, arr.shape()[N] / num_groups, arr.shape()[C], arr.shape()[H]};
         break;
       case 4:
-        tz = mkldnn::memory::dims{num_groups,
-                                  arr.shape()[N] / num_groups,
-                                  arr.shape()[C],
-                                  arr.shape()[H],
-                                  arr.shape()[W]};
+        tz = dnnl::memory::dims{num_groups,
+                                arr.shape()[N] / num_groups,
+                                arr.shape()[C],
+                                arr.shape()[H],
+                                arr.shape()[W]};
         break;
       case 5:
-        tz = mkldnn::memory::dims{num_groups,
-                                  arr.shape()[N] / num_groups,
-                                  arr.shape()[C],
-                                  arr.shape()[D],
-                                  arr.shape()[H],
-                                  arr.shape()[W]};
+        tz = dnnl::memory::dims{num_groups,
+                                arr.shape()[N] / num_groups,
+                                arr.shape()[C],
+                                arr.shape()[D],
+                                arr.shape()[H],
+                                arr.shape()[W]};
     }
-    return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
+    return dnnl::memory::desc{tz, get_dnnl_type(dtype), dnnl::memory::format_tag::any};
   }
 }
 
-inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray>& inputs) {
+inline static bool CheckDNNLInputArrayIsView(const std::vector<NDArray>& inputs) {
   for (const auto& in : inputs) {
-    if (in.IsView() && in.IsMKLDNNData()) {
+    if (in.IsView() && in.IsDNNLData()) {
       return true;
     }
   }
   return false;
 }
 
-typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
-typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
+typedef std::shared_ptr<dnnl::memory> dnnl_mem_ptr;
+typedef std::shared_ptr<const dnnl::memory> dnnl_mem_const_ptr;
 
 /*
  * This is to manage the temporary memory provided by MXNet for operators.
@@ -388,7 +386,7 @@ class TmpMemMgr {
   size_t curr_size;
   // This estimate the required temp memory size in an operator.
   size_t est_size;
-  const size_t alignment = kMKLDNNAlign;
+  const size_t alignment = kDNNLAlign;
 
  public:
   static TmpMemMgr* Get() {
@@ -428,26 +426,26 @@ class TmpMemMgr {
     this->est_size = 0;
   }
 
-  mkldnn::memory* Alloc(const mkldnn::memory::desc& md);
+  dnnl::memory* Alloc(const dnnl::memory::desc& md);
 };
 
-typedef std::unordered_map<int, mkldnn::memory> mkldnn_args_map_t;
-class MKLDNNStream {
-  std::vector<std::pair<mkldnn::primitive, mkldnn_args_map_t> > net_prim_args;
+typedef std::unordered_map<int, dnnl::memory> dnnl_args_map_t;
+class DNNLStream {
+  std::vector<std::pair<dnnl::primitive, dnnl_args_map_t> > net_prim_args;
   // Here we hold all memory related to the operators in the stream.
-  std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
-  mkldnn::stream s;
+  std::vector<std::shared_ptr<const dnnl::memory> > mem_holder;
+  dnnl::stream s;
 
  public:
-  static MKLDNNStream* Get();
+  static DNNLStream* Get();
 
-  MKLDNNStream() : s(CpuEngine::Get()->get_engine()) {}
+  DNNLStream() : s(CpuEngine::Get()->get_engine()) {}
 
-  void RegisterPrimArgs(const mkldnn::primitive& prim, const mkldnn_args_map_t& args) {
+  void RegisterPrimArgs(const dnnl::primitive& prim, const dnnl_args_map_t& args) {
     net_prim_args.emplace_back(prim, args);
   }
 
-  void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
+  void RegisterMem(std::shared_ptr<const dnnl::memory> mem) {
     mem_holder.push_back(mem);
   }
 
@@ -456,9 +454,9 @@ class MKLDNNStream {
   }
 
   /*
-   * After submitting mkldnn operations for execution, we need to
+   * After submitting dnnl operations for execution, we need to
    * clean up memory held by the stream. However, sometimes users
-   * might want to separate mkldnn execution and memory cleanup.
+   * might want to separate dnnl execution and memory cleanup.
    */
   void Submit(bool cleanup = true) {
     if (!net_prim_args.empty()) {
@@ -483,62 +481,61 @@ enum OutDataOp {
   AddBack,
 };
 
-typedef std::pair<OutDataOp, mkldnn::memory*> mkldnn_output_t;
-void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem);
+typedef std::pair<OutDataOp, dnnl::memory*> dnnl_output_t;
+void DNNLMemoryCopy(const dnnl::memory& mem, const dnnl::memory* this_mem);
 
 /*
- * Here we want to get MKLDNN memory whose desc is exactly the same as
+ * Here we want to get DNNL memory whose desc is exactly the same as
  * the given one. operator== can't guarantee that. == can return true even if
  * the formats are different. We need to double-check its format.
  */
-static inline mkldnn::memory* GetMKLDNNExact(const mkldnn::memory* mem,
-                                             const mkldnn::memory::desc& desc) {
-  mkldnn::memory::desc src_desc = mem->get_desc();
+static inline dnnl::memory* GetDNNLExact(const dnnl::memory* mem, const dnnl::memory::desc& desc) {
+  dnnl::memory::desc src_desc = mem->get_desc();
   if (desc == src_desc) {
-    return const_cast<mkldnn::memory*>(mem);
+    return const_cast<dnnl::memory*>(mem);
   } else {
-    std::shared_ptr<mkldnn::memory> ret(
-        new mkldnn::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
-    MKLDNNStream::Get()->RegisterMem(ret);
+    std::shared_ptr<dnnl::memory> ret(
+        new dnnl::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+    DNNLStream::Get()->RegisterMem(ret);
     return ret.get();
   }
 }
 
 /*
- * These two functions try to create MKLDNN memory in an NDArray based on `req'.
- * The difference is that the first function can create MKLDNN memory with
- * special layouts in an NDArray, while the second one can only create MKLDNN
+ * These two functions try to create DNNL memory in an NDArray based on `req'.
+ * The difference is that the first function can create DNNL memory with
+ * special layouts in an NDArray, while the second one can only create DNNL
  * memory with default layouts.
  * Also an optional in_arr parameter can be passed in the first function with
- * the kWriteInPlace req to validate if mkldnn can support write in place;
+ * the kWriteInPlace req to validate if dnnl can support write in place;
  * otherwise new memory will be written to and copied back onto out_arr.
  * If these two functions are used, we have to call CommitOutput to write
  * the output back to the output NDArray.
  */
-mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
-                                const mkldnn::memory::desc& desc,
-                                OpReqType req,
-                                const NDArray* in_arr = nullptr);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
-                                       const mkldnn::memory::desc& desc,
-                                       OpReqType req);
+dnnl_output_t CreateDNNLMem(const NDArray& out_arr,
+                            const dnnl::memory::desc& desc,
+                            OpReqType req,
+                            const NDArray* in_arr = nullptr);
+dnnl_output_t CreateDNNLWeightGrad(const NDArray& out_arr,
+                                   const dnnl::memory::desc& desc,
+                                   OpReqType req);
 /* This function has to be used with one of the functions above. */
-void CommitOutput(const NDArray& arr, const mkldnn_output_t& res);
+void CommitOutput(const NDArray& arr, const dnnl_output_t& res);
 
-const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups);
+const dnnl::memory* GetWeights(const NDArray& arr, int num_groups);
 
-const mkldnn::memory* GetWeights(const NDArray& arr,
-                                 const mkldnn::memory::desc& target_md,
-                                 int num_groups);
+const dnnl::memory* GetWeights(const NDArray& arr,
+                               const dnnl::memory::desc& target_md,
+                               int num_groups);
 
-bool IsDefaultFormat(const mkldnn::memory::desc& desc);
-bool IsMKLDNN(const mkldnn::memory::desc& desc);
+bool IsDefaultFormat(const dnnl::memory::desc& desc);
+bool IsDNNL(const dnnl::memory::desc& desc);
 
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& md);
-mkldnn_format_tag_t GetDefaultFormat(int num_dims);
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& md, const mkldnn_format_tag_t& format);
+dnnl_format_tag_t GetDefaultFormat(const dnnl::memory::desc& md);
+dnnl_format_tag_t GetDefaultFormat(int num_dims);
+dnnl::memory::desc GetDesc(const dnnl::memory::desc& md, const dnnl_format_tag_t& format);
 
-inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int ndims) {
+inline bool same_shape(const mxnet::TShape& shape, const dnnl_dims_t dims, int ndims) {
   if (shape.ndim() != ndims)
     return false;
   for (int i = 0; i < ndims; i++)
@@ -547,7 +544,7 @@ inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int
   return true;
 }
 
-inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::desc& desc2) {
+inline bool same_shape(const dnnl::memory::desc& desc1, const dnnl::memory::desc& desc2) {
   if (desc1.data.ndims != desc2.data.ndims)
     return false;
   for (int i = 0; i < desc1.data.ndims; i++)
@@ -556,28 +553,28 @@ inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::
   return true;
 }
 
-inline bool same_shape(const mxnet::TShape& shape, int dtype, const mkldnn::memory::desc& desc) {
+inline bool same_shape(const mxnet::TShape& shape, int dtype, const dnnl::memory::desc& desc) {
   return same_shape(shape, desc.data.dims, desc.data.ndims) &&
-         get_mkldnn_type(dtype) == desc.data.data_type;
+         get_dnnl_type(dtype) == desc.data.data_type;
 }
 
 /*
- * There is a large overhead of getting mkldnn::memory::desc from
- * mkldnn::memory. This class is created to cache the metadata of mkldnn memory
+ * There is a large overhead of getting dnnl::memory::desc from
+ * dnnl::memory. This class is created to cache the metadata of dnnl memory
  * to provide a much more lightweight method to access them.
  */
-class MKLDNNMemory {
-  std::shared_ptr<mkldnn::memory> mem;
-  mkldnn::memory::desc desc;
+class DNNLMemory {
+  std::shared_ptr<dnnl::memory> mem;
+  dnnl::memory::desc desc;
   size_t size;  // The number of bytes.
 
  public:
-  MKLDNNMemory(mkldnn::memory::desc md, void* addr) : desc(md) {
-    mem.reset(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), addr));
+  DNNLMemory(dnnl::memory::desc md, void* addr) : desc(md) {
+    mem.reset(new dnnl::memory(md, CpuEngine::Get()->get_engine(), addr));
     size = desc.get_size();
   }
 
-  explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem) : desc(mem->get_desc()) {
+  explicit DNNLMemory(std::shared_ptr<dnnl::memory> mem) : desc(mem->get_desc()) {
     this->mem = mem;
     size      = desc.get_size();
   }
@@ -590,11 +587,11 @@ class MKLDNNMemory {
     return mem->get_data_handle();
   }
 
-  std::shared_ptr<mkldnn::memory> GetMem() const {
+  std::shared_ptr<dnnl::memory> GetMem() const {
     return mem;
   }
 
-  mkldnn::memory* GetRaw() const {
+  dnnl::memory* GetRaw() const {
     return mem.get();
   }
 
@@ -602,31 +599,31 @@ class MKLDNNMemory {
     return size;
   }
 
-  mkldnn::memory::desc GetDesc() const {
+  dnnl::memory::desc GetDesc() const {
     return mem->get_desc();
   }
 
-  mkldnn::memory::desc GetDesc(
-      mkldnn_format_tag_t format,
-      mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
-    mkldnn::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims);
-    mkldnn::memory::data_type cpp_type =
-        (data_type == mkldnn::memory::data_type::undef)
-            ? static_cast<mkldnn::memory::data_type>(desc.data.data_type)
+  dnnl::memory::desc GetDesc(
+      dnnl_format_tag_t format,
+      dnnl::memory::data_type data_type = dnnl::memory::data_type::undef) const {
+    dnnl::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims);
+    dnnl::memory::data_type cpp_type =
+        (data_type == dnnl::memory::data_type::undef)
+            ? static_cast<dnnl::memory::data_type>(desc.data.data_type)
             : data_type;
-    mkldnn::memory::desc data_md(dims, cpp_type, static_cast<mkldnn::memory::format_tag>(format));
+    dnnl::memory::desc data_md(dims, cpp_type, static_cast<dnnl::memory::format_tag>(format));
     return data_md;
   }
 
-  mkldnn_format_tag_t GetDefaultFormat() const {
+  dnnl_format_tag_t GetDefaultFormat() const {
     return mxnet::GetDefaultFormat(desc);
   }
 
-  bool IsMKLDNN() const {
-    return mxnet::IsMKLDNN(desc);
+  bool IsDNNL() const {
+    return mxnet::IsDNNL(desc);
   }
 
-  bool SameFormat(mkldnn::memory::desc md) const {
+  bool SameFormat(dnnl::memory::desc md) const {
     return mem->get_desc() == md;
   }
 
@@ -634,14 +631,14 @@ class MKLDNNMemory {
     return same_shape(shape, dtype, desc);
   }
 
-  void ReorderTo(mkldnn::memory* other) const {
-    mkldnn::stream s(CpuEngine::Get()->get_engine());
-    mkldnn::reorder(*mem, *other).execute(s, *mem, *other);
+  void ReorderTo(dnnl::memory* other) const {
+    dnnl::stream s(CpuEngine::Get()->get_engine());
+    dnnl::reorder(*mem, *other).execute(s, *mem, *other);
   }
 };
 
-// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst);
+// reorder dnnl src to dst format dtype
+void ReorderTo(const dnnl::memory* src, const dnnl::memory* dst);
 
 template <typename Compute, typename AttrState>
 void FallBackCompute(Compute fn,
@@ -652,7 +649,7 @@ void FallBackCompute(Compute fn,
                      const std::vector<NDArray>& outputs);
 
 /*
- * This class is used to check the correctness of MKLDNN operators.
+ * This class is used to check the correctness of DNNL operators.
  */
 class OpCheck {
   std::vector<mxnet::NDArray> inputs;
@@ -679,39 +676,39 @@ class OpCheck {
   void CopyResult(const std::vector<mxnet::NDArray>& outputs_, const std::vector<size_t>& indice);
 };
 
-bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
-                       const int dev_mask,
-                       bool support_mkldnn,
-                       DispatchMode* dispatch_mode,
-                       std::vector<int>* in_attrs,
-                       std::vector<int>* out_attrs);
-
-#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
-  static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false);   \
-  OpCheck check(backward, num_checks);                             \
-  if (debug)                                                       \
-    check.Init(inputs, outputs);
+bool DNNLStorageType(const nnvm::NodeAttrs& attrs,
+                     const int dev_mask,
+                     bool support_dnnl,
+                     DispatchMode* dispatch_mode,
+                     std::vector<int>* in_attrs,
+                     std::vector<int>* out_attrs);
 
-#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
+#define DNNL_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
+  static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false); \
+  OpCheck check(backward, num_checks);                           \
   if (debug)                                                     \
+    check.Init(inputs, outputs);
+
+#define DNNL_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
+  if (debug)                                                   \
     check.Run(fn, attrs, ctx, inputs, req, outputs);
-#define MKLDNN_OPCHECK_COPY_RESULT(outputs, indice) \
-  if (debug)                                        \
+#define DNNL_OPCHECK_COPY_RESULT(outputs, indice) \
+  if (debug)                                      \
     check.CopyResult(outputs, indice);
 
-struct MKLDNNPostEltwiseParam {
-  mkldnn::algorithm alg = mkldnn::algorithm::undef;
-  float scale           = 1.f;
-  float alpha           = 0.f;
-  float beta            = 1.f;
+struct DNNLPostEltwiseParam {
+  dnnl::algorithm alg = dnnl::algorithm::undef;
+  float scale         = 1.f;
+  float alpha         = 0.f;
+  float beta          = 1.f;
 };
 
-void MKLDNNRun(mxnet::FComputeEx fn,
-               const nnvm::NodeAttrs& attrs,
-               const mxnet::OpContext& ctx,
-               const std::vector<mxnet::NDArray>& inputs_,
-               const std::vector<mxnet::OpReqType>& req,
-               const std::vector<mxnet::NDArray>& outputs_);
+void DNNLRun(mxnet::FComputeEx fn,
+             const nnvm::NodeAttrs& attrs,
+             const mxnet::OpContext& ctx,
+             const std::vector<mxnet::NDArray>& inputs_,
+             const std::vector<mxnet::OpReqType>& req,
+             const std::vector<mxnet::NDArray>& outputs_);
 
 using FComputeExUnary = std::function<void(const nnvm::NodeAttrs& attrs,
                                            const OpContext& ctx,
@@ -719,13 +716,13 @@ using FComputeExUnary = std::function<void(const nnvm::NodeAttrs& attrs,
                                            const OpReqType& req,
                                            const NDArray& output)>;
 
-void MKLDNNRun(FComputeExUnary fn,
-               const nnvm::NodeAttrs& attrs,
-               const mxnet::OpContext& ctx,
-               const mxnet::NDArray& inputs_,
-               const mxnet::OpReqType& req,
-               const mxnet::NDArray& outputs_);
+void DNNLRun(FComputeExUnary fn,
+             const nnvm::NodeAttrs& attrs,
+             const mxnet::OpContext& ctx,
+             const mxnet::NDArray& inputs_,
+             const mxnet::OpReqType& req,
+             const mxnet::NDArray& outputs_);
 
 }  // namespace mxnet
 #endif
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_BASE_INL_H_
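
For orientation, the renamed DNNL_OPCHECK_* macros and the DNNLRun wrappers declared in this header are meant to be used together in an operator's CPU FComputeEx entry point. A minimal sketch, assuming a hypothetical operator Foo whose oneDNN implementation is DNNLFooForward and whose reference CPU kernel is FooCompute<cpu> (neither name is part of this commit):

    #if MXNET_USE_ONEDNN == 1
    static void FooComputeExCPU(const nnvm::NodeAttrs& attrs,
                                const OpContext& ctx,
                                const std::vector<NDArray>& inputs,
                                const std::vector<OpReqType>& req,
                                const std::vector<NDArray>& outputs) {
      // Set up the optional correctness check (active only with MXNET_ONEDNN_DEBUG=1).
      DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
      // DNNLRun reorders view inputs to the default layout before dispatching.
      DNNLRun(DNNLFooForward, attrs, ctx, inputs, req, outputs);
      // When debugging is enabled, re-run the reference kernel and compare the outputs.
      DNNL_OPCHECK_RUN(FooCompute<cpu>, attrs, ctx, inputs, req, outputs);
    }
    #endif
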
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/dnnl/dnnl_base.cc
similarity index 57%
rename from src/operator/nn/mkldnn/mkldnn_base.cc
rename to src/operator/nn/dnnl/dnnl_base.cc
index 5415e9e..d1e8918 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/dnnl/dnnl_base.cc
@@ -21,19 +21,18 @@
 
 #include <atomic>
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
-
 #include "../../../common/exec_utils.h"
 #include "../../operator_common.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 
-MKLDNNStream* MKLDNNStream::Get() {
+DNNLStream* DNNLStream::Get() {
 #if DMLC_CXX11_THREAD_LOCAL
-  static thread_local MKLDNNStream stream;
+  static thread_local DNNLStream stream;
 #else
-  static MX_THREAD_LOCAL MKLDNNStream stream;
+  static MX_THREAD_LOCAL DNNLStream stream;
 #endif
   return &stream;
 }
@@ -56,15 +55,15 @@ void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space) {
   return reinterpret_cast<void*>(addr);
 }
 
-mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) {
+dnnl::memory* TmpMemMgr::Alloc(const dnnl::memory::desc& md) {
   // We need to include the size of the memory used for alignment.
   this->est_size += md.get_size() + alignment;
   void* mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
   if (mem) {
     // The memory is allocated from the temporary memory space in the
     // operator. It'll only become invalid after we exit from the operator.
-    mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), mem));
-    MKLDNNStream::Get()->RegisterMem(ret);
+    dnnl_mem_ptr ret(new dnnl::memory(md, CpuEngine::Get()->get_engine(), mem));
+    DNNLStream::Get()->RegisterMem(ret);
     CHECK_EQ(mem, mem);
     this->curr_size -= md.get_size();
     this->curr_mem = static_cast<char*>(mem) + md.get_size();
@@ -73,170 +72,163 @@ mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) {
     // If curr_mem has been initialized and we still reach here, it means the current
     // allocated memory isn't enough. But it doesn't matter for multiple invokes of a
     // operator, as the TmpMemMgr could estimate the space at the first iteration and
-    // then re-requests abundant space from MXNet resource. MKL-DNN could allocate
+    // then re-request sufficient space from the MXNet resource. DNNL can allocate
     // the space by itself. Thus, we just let it continue for estimating the maximum
     // required space size. It will be allocated at next call.
     if (this->curr_mem && dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false)) {
-      LOG(WARNING) << "mkl-dnn debug message: The rest of the temporary space is not "
-                   << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
+      LOG(WARNING) << "DNNL debug message: The rest of the temporary space is not "
+                   << "adequate for allocating " << md.get_size() << " bytes. Thus, DNNL will "
                    << "allocate the space by itself.";
     }
-    mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine()));
-    MKLDNNStream::Get()->RegisterMem(ret);
+    dnnl_mem_ptr ret(new dnnl::memory(md, CpuEngine::Get()->get_engine()));
+    DNNLStream::Get()->RegisterMem(ret);
     return ret.get();
   }
 }
 
-void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem) {
-  MKLDNNStream* stream                = MKLDNNStream::Get();
-  mkldnn::memory::desc from_desc      = mem.get_desc();
-  mkldnn::memory::desc this_desc      = this_mem->get_desc();
-  mkldnn_format_tag_t from_def_format = GetDefaultFormat(from_desc);
-  mkldnn_format_tag_t this_def_format = GetDefaultFormat(this_desc);
+void DNNLMemoryCopy(const dnnl::memory& mem, const dnnl::memory* this_mem) {
+  DNNLStream* stream                = DNNLStream::Get();
+  dnnl::memory::desc from_desc      = mem.get_desc();
+  dnnl::memory::desc this_desc      = this_mem->get_desc();
+  dnnl_format_tag_t from_def_format = GetDefaultFormat(from_desc);
+  dnnl_format_tag_t this_def_format = GetDefaultFormat(this_desc);
 
   if (!same_shape(this_desc, from_desc) && IsDefaultFormat(from_desc)) {
-    // In this case, we can simply create a new MKLDNN memory for the required
+    // In this case, we can simply create a new DNNL memory for the required
     // shape.
-    mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
-    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
-    mkldnn::memory::desc data_md(
-        dims, this_dtype, static_cast<mkldnn::memory::format_tag>(this_def_format));
+    dnnl::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<dnnl::memory::data_type>(this_desc.data.data_type);
+    dnnl::memory::desc data_md(
+        dims, this_dtype, static_cast<dnnl::memory::format_tag>(this_def_format));
 
-    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(data_md, mem.get_engine(), mem.get_data_handle()));
+    dnnl_mem_ptr tmp_mem(new dnnl::memory(data_md, mem.get_engine(), mem.get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    std::unordered_map<int, mkldnn::memory> args(
-        {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
-    stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
+    std::unordered_map<int, dnnl::memory> args(
+        {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}});
+    stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args);
   } else if (!same_shape(this_desc, from_desc)) {
     // In this case, the source memory stores data in a customized layout. We
     // need to reorganize the data in memory before we can reshape.
-    mkldnn::memory::desc def_desc = GetDesc(from_desc, from_def_format);
-    mkldnn::memory* def_mem       = TmpMemMgr::Get()->Alloc(def_desc);
-    std::unordered_map<int, mkldnn::memory> args(
-        {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *def_mem}});
-    stream->RegisterPrimArgs(mkldnn::reorder(mem, *def_mem), args);
+    dnnl::memory::desc def_desc = GetDesc(from_desc, from_def_format);
+    dnnl::memory* def_mem       = TmpMemMgr::Get()->Alloc(def_desc);
+    std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *def_mem}});
+    stream->RegisterPrimArgs(dnnl::reorder(mem, *def_mem), args);
 
     // Now we can reshape it
-    mkldnn_mem_ptr tmp_mem(
-        new mkldnn::memory(this_desc, mem.get_engine(), def_mem->get_data_handle()));
+    dnnl_mem_ptr tmp_mem(new dnnl::memory(this_desc, mem.get_engine(), def_mem->get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    args = {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}};
-    stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
+    args = {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}};
+    stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args);
   } else if (this_desc == from_desc) {
-    std::unordered_map<int, mkldnn::memory> args(
-        {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
+    std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *this_mem}});
     // If the layout is the same, we can just copy data.
-    stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
+    stream->RegisterPrimArgs(dnnl::reorder(mem, *this_mem), args);
   } else {
     // If both are not using the default layouts. There isn't much we can do,
     // other than reorder data layout directly.
     if (!IsDefaultFormat(this_desc) && !IsDefaultFormat(from_desc)) {
-      std::unordered_map<int, mkldnn::memory> args(
-          {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
-      stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
+      std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *this_mem}});
+      stream->RegisterPrimArgs(dnnl::reorder(mem, *this_mem), args);
     } else if (IsDefaultFormat(this_desc)) {
       // If the dest mem uses the default memory layout, we can simply use
       // the default format of the source memory to improve perf of reorder.
-      mkldnn::memory::desc desc = GetDesc(from_desc, from_def_format);
-      mkldnn_mem_ptr tmp_mem(
-          new mkldnn::memory(desc, mem.get_engine(), this_mem->get_data_handle()));
+      dnnl::memory::desc desc = GetDesc(from_desc, from_def_format);
+      dnnl_mem_ptr tmp_mem(new dnnl::memory(desc, mem.get_engine(), this_mem->get_data_handle()));
       stream->RegisterMem(tmp_mem);
-      std::unordered_map<int, mkldnn::memory> args(
-          {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *tmp_mem}});
-      stream->RegisterPrimArgs(mkldnn::reorder(mem, *tmp_mem), args);
+      std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, mem}, {DNNL_ARG_TO, *tmp_mem}});
+      stream->RegisterPrimArgs(dnnl::reorder(mem, *tmp_mem), args);
     } else {
       // If the src mem uses the default memory layout, we can use
       // the default format of the source memory to improve perf.
-      mkldnn::memory::desc desc = GetDesc(this_desc, this_def_format);
-      mkldnn_mem_ptr tmp_mem(
-          new mkldnn::memory(desc, this_mem->get_engine(), mem.get_data_handle()));
+      dnnl::memory::desc desc = GetDesc(this_desc, this_def_format);
+      dnnl_mem_ptr tmp_mem(new dnnl::memory(desc, this_mem->get_engine(), mem.get_data_handle()));
       stream->RegisterMem(tmp_mem);
-      std::unordered_map<int, mkldnn::memory> args(
-          {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
-      stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
+      std::unordered_map<int, dnnl::memory> args(
+          {{DNNL_ARG_FROM, *tmp_mem}, {DNNL_ARG_TO, *this_mem}});
+      stream->RegisterPrimArgs(dnnl::reorder(*tmp_mem, *this_mem), args);
     }
   }
 }
 
-bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const mkldnn::memory::desc& desc) {
-  auto in_mem     = in_arr.GetMKLDNNData();
-  bool add_same   = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
-  bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc && in_mem->get_desc() == desc;
+bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const dnnl::memory::desc& desc) {
+  auto in_mem     = in_arr.GetDNNLData();
+  bool add_same   = in_mem->get_data_handle() == out_arr.GetDNNLData()->get_data_handle();
+  bool pdesc_same = out_arr.GetDNNLData()->get_desc() == desc && in_mem->get_desc() == desc;
   return add_same && pdesc_same;
 }
 
-mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
-                                const mkldnn::memory::desc& desc,
-                                OpReqType req,
-                                const NDArray* in_arr) {
+dnnl_output_t CreateDNNLMem(const NDArray& out_arr,
+                            const dnnl::memory::desc& desc,
+                            OpReqType req,
+                            const NDArray* in_arr) {
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+    return dnnl_output_t(OutDataOp::AddBack, tmp);
   } else if (kWriteInplace == req && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
-    mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
-    // mem is nullptr if out_arr is view and desc is MKLDNN format.
-    // need to Reorder2Default before calling CreateMKLDNNMem
+    dnnl::memory* mem = const_cast<NDArray&>(out_arr).CreateDNNLData(desc);
+    // mem is nullptr if out_arr is a view and desc is a DNNL format;
+    // Reorder2Default needs to be called before CreateDNNLMem in that case.
     CHECK(mem != nullptr);
-    return mkldnn_output_t(OutDataOp::Noop, mem);
+    return dnnl_output_t(OutDataOp::Noop, mem);
   } else if (kWriteInplace == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    return dnnl_output_t(OutDataOp::CopyBack, tmp);
   } else if (kWriteTo == req) {
-    mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
+    dnnl::memory* mem = const_cast<NDArray&>(out_arr).CreateDNNLData(desc);
     if (nullptr == mem) {
       auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+      return dnnl_output_t(OutDataOp::CopyBack, tmp);
     }
-    return mkldnn_output_t(OutDataOp::Noop, mem);
+    return dnnl_output_t(OutDataOp::Noop, mem);
   }
   auto tmp = TmpMemMgr::Get()->Alloc(desc);
-  return mkldnn_output_t(OutDataOp::Noop, tmp);
+  return dnnl_output_t(OutDataOp::Noop, tmp);
 }
 
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
-                                       const mkldnn::memory::desc& desc,
-                                       OpReqType req) {
+dnnl_output_t CreateDNNLWeightGrad(const NDArray& out_arr,
+                                   const dnnl::memory::desc& desc,
+                                   OpReqType req) {
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+    return dnnl_output_t(OutDataOp::AddBack, tmp);
   } else if (kWriteInplace == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    return dnnl_output_t(OutDataOp::CopyBack, tmp);
   } else {
-    mkldnn::memory* mem = nullptr;
+    dnnl::memory* mem = nullptr;
     if (IsDefaultFormat(desc)) {
-      mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
+      mem = const_cast<NDArray&>(out_arr).CreateDNNLData(desc);
     }
     if (mem == nullptr) {
       auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+      return dnnl_output_t(OutDataOp::CopyBack, tmp);
     } else {
-      return mkldnn_output_t(OutDataOp::Noop, mem);
+      return dnnl_output_t(OutDataOp::Noop, mem);
     }
   }
 }
 
-void CommitOutput(const NDArray& arr, const mkldnn_output_t& res) {
+void CommitOutput(const NDArray& arr, const dnnl_output_t& res) {
   if (res.first == CopyBack) {
     const_cast<NDArray&>(arr).CopyFrom(*res.second);
   } else if (res.first == AddBack) {
     auto res_memory = res.second;
-    auto target_pd  = arr.GetMKLDNNData()->get_desc();
-    auto mem        = arr.GetMKLDNNData(res.second->get_desc());
+    auto target_pd  = arr.GetDNNLData()->get_desc();
+    auto mem        = arr.GetDNNLData(res.second->get_desc());
     if (mem == nullptr) {
       auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd);
-      MKLDNNMemoryCopy(*res_memory, tmp_memory);
+      DNNLMemoryCopy(*res_memory, tmp_memory);
       res_memory = tmp_memory;
-      mem        = arr.GetMKLDNNData();
+      mem        = arr.GetDNNLData();
     }
-    op::MKLDNNSum(*mem, *res_memory, *mem);
+    op::DNNLSum(*mem, *res_memory, *mem);
   }
 }
 
-const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) {
-  const auto type = get_mkldnn_type(arr.dtype());
-  auto tz         = mkldnn::memory::dims{0};
-  auto format_tag = mkldnn::memory::format_tag::undef;
+const dnnl::memory* GetWeights(const NDArray& arr, int num_groups) {
+  const auto type = get_dnnl_type(arr.dtype());
+  auto tz         = dnnl::memory::dims{0};
+  auto format_tag = dnnl::memory::format_tag::undef;
   auto engine     = CpuEngine::Get()->get_engine();
   const int ndim  = arr.shape().ndim();
   int O = 0, I = 1, H = 2, W = 3;
@@ -247,69 +239,67 @@ const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) {
     W = 4;
   }
   if (ndim == 2) {
-    tz         = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
-    format_tag = mkldnn::memory::format_tag::oi;
+    tz         = dnnl::memory::dims{arr.shape()[O], arr.shape()[I]};
+    format_tag = dnnl::memory::format_tag::oi;
   } else if (ndim == 3) {
-    tz = num_groups > 1 ? mkldnn::memory::dims{num_groups,
-                                               arr.shape()[O] / num_groups,
-                                               arr.shape()[I],
-                                               arr.shape()[H]}
-                        : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]};
-    format_tag =
-        num_groups > 1 ? mkldnn::memory::format_tag::goiw : mkldnn::memory::format_tag::oiw;
+    tz = num_groups > 1 ? dnnl::memory::dims{num_groups,
+                                             arr.shape()[O] / num_groups,
+                                             arr.shape()[I],
+                                             arr.shape()[H]}
+                        : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]};
+    format_tag = num_groups > 1 ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw;
   } else if (ndim == 4) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    arr.shape()[O] / num_groups,
-                                    arr.shape()[I],
-                                    arr.shape()[H],
-                                    arr.shape()[W]}
-             : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
-    format_tag =
-        num_groups > 1 ? mkldnn::memory::format_tag::goihw : mkldnn::memory::format_tag::oihw;
+             ? dnnl::memory::dims{num_groups,
+                                  arr.shape()[O] / num_groups,
+                                  arr.shape()[I],
+                                  arr.shape()[H],
+                                  arr.shape()[W]}
+             : dnnl::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
+    format_tag = num_groups > 1 ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw;
   } else if (ndim == 5) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    arr.shape()[O] / num_groups,
-                                    arr.shape()[I],
-                                    arr.shape()[D],
-                                    arr.shape()[H],
-                                    arr.shape()[W]}
-             : mkldnn::memory::dims{
+             ? dnnl::memory::dims{num_groups,
+                                  arr.shape()[O] / num_groups,
+                                  arr.shape()[I],
+                                  arr.shape()[D],
+                                  arr.shape()[H],
+                                  arr.shape()[W]}
+             : dnnl::memory::dims{
                    arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
     format_tag =
-        num_groups > 1 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::oidhw;
+        num_groups > 1 ? dnnl::memory::format_tag::goidhw : dnnl::memory::format_tag::oidhw;
   } else {
     LOG(FATAL) << "The weight array has an unsupported number of dimensions";
   }
-  const auto md = mkldnn::memory::desc{tz, type, format_tag};
-  return arr.GetMKLDNNData(md);
+  const auto md = dnnl::memory::desc{tz, type, format_tag};
+  return arr.GetDNNLData(md);
 }
 
-const mkldnn::memory* GetWeights(const NDArray& arr,
-                                 const mkldnn::memory::desc& target_desc,
-                                 int num_groups) {
-  const mkldnn::memory* mem = arr.GetMKLDNNData(target_desc);
+const dnnl::memory* GetWeights(const NDArray& arr,
+                               const dnnl::memory::desc& target_desc,
+                               int num_groups) {
+  const dnnl::memory* mem = arr.GetDNNLData(target_desc);
   // If the weight array already uses the target layout, simply return it directly.
   if (mem)
     return mem;
   mem = GetWeights(arr, num_groups);
   if (mem == nullptr)
-    mem = arr.GetMKLDNNDataReorder(target_desc);
+    mem = arr.GetDNNLDataReorder(target_desc);
   if (mem->get_desc() == target_desc)
     return mem;
 
   auto ret = TmpMemMgr::Get()->Alloc(target_desc);
-  std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
-  MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
+  std::unordered_map<int, dnnl::memory> args({{DNNL_ARG_FROM, *mem}, {DNNL_ARG_TO, *ret}});
+  DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*mem, *ret), args);
   return ret;
 }
 
 // default: block and dims' stride increase monotonically
-// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase monotonically
-bool IsMKLDNN(const mkldnn::memory::desc& desc) {
+// dnnl: 1. winograd 2. rnn packed 3. block and dims' stride does not increase monotonically
+bool IsDNNL(const dnnl::memory::desc& desc) {
   bool rslt = true;
-  if (desc.data.format_kind == mkldnn_blocked) {
+  if (desc.data.format_kind == dnnl_blocked) {
     if (desc.data.format_desc.blocking.inner_nblks == 0) {
       int i = 0;
       for (i = 0; i < desc.data.ndims - 1; i++) {
@@ -326,33 +316,33 @@ bool IsMKLDNN(const mkldnn::memory::desc& desc) {
   return rslt;
 }
 
-mkldnn_format_tag_t GetDefaultFormat(int num_dims) {
+dnnl_format_tag_t GetDefaultFormat(int num_dims) {
   switch (num_dims) {
     case 1:
-      return mkldnn_a;
+      return dnnl_a;
     case 2:
-      return mkldnn_ab;
+      return dnnl_ab;
     case 3:
-      return mkldnn_abc;
+      return dnnl_abc;
     case 4:
-      return mkldnn_abcd;
+      return dnnl_abcd;
     case 5:
-      return mkldnn_abcde;
+      return dnnl_abcde;
     case 6:
-      return mkldnn_abcdef;
+      return dnnl_abcdef;
     default:
-      LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN";
-      return mkldnn_format_tag_undef;
+      LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for DNNL";
+      return dnnl_format_tag_undef;
   }
 }
 
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& desc) {
+dnnl_format_tag_t GetDefaultFormat(const dnnl::memory::desc& desc) {
   return GetDefaultFormat(desc.data.ndims);
 }
 
-bool IsDefaultFormat(const mkldnn::memory::desc& desc) {
+bool IsDefaultFormat(const dnnl::memory::desc& desc) {
   bool rslt = false;
-  if (desc.data.format_kind == mkldnn_blocked) {
+  if (desc.data.format_kind == dnnl_blocked) {
     if (desc.data.format_desc.blocking.inner_nblks == 0) {
       int i = 0;
       for (i = 0; i < desc.data.ndims - 1; i++) {
@@ -369,22 +359,22 @@ bool IsDefaultFormat(const mkldnn::memory::desc& desc) {
   return rslt;
 }
 
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& desc, const mkldnn_format_tag_t& format) {
-  mkldnn::memory::dims dims(desc.data.ndims);
+dnnl::memory::desc GetDesc(const dnnl::memory::desc& desc, const dnnl_format_tag_t& format) {
+  dnnl::memory::dims dims(desc.data.ndims);
   for (size_t i = 0; i < dims.size(); i++)
     dims[i] = desc.data.dims[i];
-  mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(format);
-  mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(desc.data.data_type);
-  mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-  return mkldnn::memory::desc(dims, cpp_type, cpp_format);
+  dnnl::memory::format_tag cpp_format = static_cast<dnnl::memory::format_tag>(format);
+  dnnl::memory::data_type cpp_type    = static_cast<dnnl::memory::data_type>(desc.data.data_type);
+  dnnl::memory::desc data_md(dims, cpp_type, cpp_format);
+  return dnnl::memory::desc(dims, cpp_type, cpp_format);
 }
 
-// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst) {
-  mkldnn::stream s(CpuEngine::Get()->get_engine());
+// reorder dnnl src to dst format dtype
+void ReorderTo(const dnnl::memory* src, const dnnl::memory* dst) {
+  dnnl::stream s(CpuEngine::Get()->get_engine());
   auto new_src = *src;
   auto new_dst = *dst;
-  mkldnn::reorder(new_src, new_dst).execute(s, new_src, new_dst);
+  dnnl::reorder(new_src, new_dst).execute(s, new_src, new_dst);
 }
 
 template <typename Compute, typename AttrState>
@@ -415,7 +405,7 @@ void FallBackCompute(Compute fn,
       in_blobs[i] = in_bufs.back().data();
     }
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 
   std::vector<TBlob> out_blobs(outputs.size());
   std::vector<NDArray> temp_src, temp_dst;
@@ -432,14 +422,14 @@ void FallBackCompute(Compute fn,
         new_req[i] = kWriteTo;
       }
     } else {
-      // ensure output does not use mkldnn mem.
+      // ensure output does not use dnnl mem.
       // for inplace, we already converted & copied input above.
       if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) {
-        const_cast<NDArray&>(output).InvalidateMKLDNNData();
+        const_cast<NDArray&>(output).InvalidateDNNLData();
         if (req[i] == kWriteInplace) {
           new_req[i] = kWriteTo;
         }
-      } else if (req[i] == kAddTo && output.IsMKLDNNData()) {
+      } else if (req[i] == kAddTo && output.IsDNNLData()) {
         NDArray temp = outputs[i].Reorder2Default();
         temp_src.emplace_back(temp);
         temp_dst.emplace_back(outputs[i]);
@@ -452,11 +442,11 @@ void FallBackCompute(Compute fn,
   fn(attrs_states, ctx, in_blobs, new_req, out_blobs);
   for (size_t i = 0, bf16_pos = 0; i < out_blobs.size(); i++) {
     if (outputs[i].dtype() == mshadow::kBfloat16) {
-      auto src_mem = temp_bf16_src[bf16_pos].GetMKLDNNData();
-      auto dst_mem = temp_bf16_dst[bf16_pos].GetMKLDNNData();
+      auto src_mem = temp_bf16_src[bf16_pos].GetDNNLData();
+      auto dst_mem = temp_bf16_dst[bf16_pos].GetDNNLData();
       bf16_pos++;
       ReorderTo(src_mem, dst_mem);
-    } else if (req[i] == kAddTo && outputs[i].IsMKLDNNData()) {
+    } else if (req[i] == kAddTo && outputs[i].IsDNNLData()) {
       mxnet::common::CastNonDefaultStorage(temp_src, temp_dst, ctx, false);
     }
   }
@@ -479,28 +469,28 @@ static bool SimilarArray(const mxnet::NDArray& arr1,
   if (arr1.shape().Size() != arr2.shape().Size())
     return false;
 
-  // This function should be used outside an MKLDNN operator.
+  // This function should be used outside a DNNL operator.
   // There shouldn't be any operators in the stream.
-  CHECK(!MKLDNNStream::Get()->HasOps());
+  CHECK(!DNNLStream::Get()->HasOps());
   // We need to reorder data in the arrays to the default layout.
   // But we shouldn't reorder data in the original array.
   NDArray buf1, buf2;
-  if (arr1.IsMKLDNNData()) {
+  if (arr1.IsDNNLData()) {
     buf1     = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
-    auto mem = arr1.GetMKLDNNData();
+    auto mem = arr1.GetDNNLData();
     buf1.CopyFrom(*mem);
   }
-  if (arr2.IsMKLDNNData()) {
+  if (arr2.IsDNNLData()) {
     buf2     = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
-    auto mem = arr2.GetMKLDNNData();
+    auto mem = arr2.GetDNNLData();
     buf2.CopyFrom(*mem);
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 
   DType* data1 =
-      reinterpret_cast<DType*>(arr1.IsMKLDNNData() ? buf1.data().dptr_ : arr1.data().dptr_);
+      reinterpret_cast<DType*>(arr1.IsDNNLData() ? buf1.data().dptr_ : arr1.data().dptr_);
   DType* data2 =
-      reinterpret_cast<DType*>(arr2.IsMKLDNNData() ? buf2.data().dptr_ : arr2.data().dptr_);
+      reinterpret_cast<DType*>(arr2.IsDNNLData() ? buf2.data().dptr_ : arr2.data().dptr_);
   std::atomic<bool> success(true);
 #pragma omp parallel for
 #ifdef _MSC_VER
@@ -543,23 +533,23 @@ template void FallBackCompute(void (*)(OpStatePtr const&,
 void OpCheck::Init(const std::vector<mxnet::NDArray>& inputs_,
                    const std::vector<mxnet::NDArray>& outputs_) {
   auto ctx = inputs_[0].ctx();
-  CHECK(!MKLDNNStream::Get()->HasOps());
+  CHECK(!DNNLStream::Get()->HasOps());
   for (size_t i = 0; i < inputs_.size(); i++) {
     NDArray data = inputs_[i];
     inputs.emplace_back(data.shape(), ctx, false, data.dtype());
-    if (data.IsMKLDNNData() && data.IsView())
+    if (data.IsDNNLData() && data.IsView())
       data = data.Reorder2Default();
-    auto mem = data.GetMKLDNNData();
+    auto mem = data.GetDNNLData();
     inputs[i].CopyFrom(*mem);
   }
   for (size_t i = 0; i < outputs_.size(); i++) {
     outputs.emplace_back(outputs_[i].shape(), ctx, false, outputs_[i].dtype());
     if (backward) {
-      auto mem = outputs_[i].GetMKLDNNData();
+      auto mem = outputs_[i].GetDNNLData();
       outputs[i].CopyFrom(*mem);
     }
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
 
 void OpCheck::Run(mxnet::FCompute fn,
@@ -568,9 +558,9 @@ void OpCheck::Run(mxnet::FCompute fn,
                   const std::vector<mxnet::NDArray>& inputs_,
                   const std::vector<mxnet::OpReqType>& req,
                   const std::vector<mxnet::NDArray>& outputs_) {
-  static auto& is_excluded = Op::GetAttr<bool>("TExcludeMKLDNNDebug");
+  static auto& is_excluded = Op::GetAttr<bool>("TExcludeDNNLDebug");
   if (is_excluded.get(attrs.op, false)) {
-    LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present";
+    LOG(WARNING) << attrs.op->name << " not checked. TExcludeDNNLDebug flag present";
     return;
   }
   std::vector<mxnet::TBlob> in_blobs(inputs.size());
@@ -601,30 +591,30 @@ void OpCheck::Run(mxnet::FCompute fn,
 
 void OpCheck::CopyResult(const std::vector<mxnet::NDArray>& outputs_,
                          const std::vector<size_t>& indice) {
-  CHECK(!MKLDNNStream::Get()->HasOps());
+  CHECK(!DNNLStream::Get()->HasOps());
   auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray>&>(outputs_);
   for (auto i = indice.begin(); i != indice.end(); ++i) {
-    auto mem = outputs[*i].GetMKLDNNData();
+    auto mem = outputs[*i].GetDNNLData();
     non_const_outputs_[*i].CopyFrom(*mem);
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
 
-bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
-                       const int dev_mask,
-                       bool support_mkldnn,
-                       DispatchMode* dispatch_mode,
-                       std::vector<int>* in_attrs,
-                       std::vector<int>* out_attrs) {
+bool DNNLStorageType(const nnvm::NodeAttrs& attrs,
+                     const int dev_mask,
+                     bool support_dnnl,
+                     DispatchMode* dispatch_mode,
+                     std::vector<int>* in_attrs,
+                     std::vector<int>* out_attrs) {
   for (int& v : *in_attrs)
     if (v == -1)
       v = kDefaultStorage;
 
   DispatchMode wanted_mode;
 #if MXNET_USE_ONEDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
+  if (dev_mask == mshadow::cpu::kDevMask && !DNNLEnvSet())
     wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn)
+  else if (dev_mask == mshadow::cpu::kDevMask && support_dnnl)
     wanted_mode = DispatchMode::kFComputeEx;
   else
 #endif
@@ -641,11 +631,11 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
   return dispatched;
 }
 
-inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray>& inputs) {
+inline static const std::vector<NDArray> GetDNNLInputArray(const std::vector<NDArray>& inputs) {
   std::vector<NDArray> ret;
   ret.reserve(inputs.size());
   for (const auto& in : inputs) {
-    if (in.IsView() && in.IsMKLDNNData()) {
+    if (in.IsView() && in.IsDNNLData()) {
       ret.push_back(in.Reorder2Default());
     } else {
       ret.push_back(in);
@@ -654,30 +644,30 @@ inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<N
   return ret;
 }
 
-void MKLDNNRun(mxnet::FComputeEx fn,
-               const nnvm::NodeAttrs& attrs,
-               const mxnet::OpContext& ctx,
-               const std::vector<mxnet::NDArray>& inputs,
-               const std::vector<mxnet::OpReqType>& req,
-               const std::vector<mxnet::NDArray>& outputs) {
-  if (CheckMKLDNNInputArrayIsView(inputs)) {
-    const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
-    fn(attrs, ctx, mkldnn_inputs, req, outputs);
+void DNNLRun(mxnet::FComputeEx fn,
+             const nnvm::NodeAttrs& attrs,
+             const mxnet::OpContext& ctx,
+             const std::vector<mxnet::NDArray>& inputs,
+             const std::vector<mxnet::OpReqType>& req,
+             const std::vector<mxnet::NDArray>& outputs) {
+  if (CheckDNNLInputArrayIsView(inputs)) {
+    const auto dnnl_inputs = GetDNNLInputArray(inputs);
+    fn(attrs, ctx, dnnl_inputs, req, outputs);
   } else {
     fn(attrs, ctx, inputs, req, outputs);
   }
 }
 
-void MKLDNNRun(FComputeExUnary fn,
-               const nnvm::NodeAttrs& attrs,
-               const mxnet::OpContext& ctx,
-               const mxnet::NDArray& input,
-               const mxnet::OpReqType& req,
-               const mxnet::NDArray& output) {
-  auto mkldnn_input = input;
-  if (input.IsView() && input.IsMKLDNNData()) {
-    mkldnn_input = input.Reorder2Default();
-    fn(attrs, ctx, mkldnn_input, req, output);
+void DNNLRun(FComputeExUnary fn,
+             const nnvm::NodeAttrs& attrs,
+             const mxnet::OpContext& ctx,
+             const mxnet::NDArray& input,
+             const mxnet::OpReqType& req,
+             const mxnet::NDArray& output) {
+  auto dnnl_input = input;
+  if (input.IsView() && input.IsDNNLData()) {
+    dnnl_input = input.Reorder2Default();
+    fn(attrs, ctx, dnnl_input, req, output);
   } else {
     fn(attrs, ctx, input, req, output);
   }
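
The DNNLStorageType helper defined above is what individual operators typically wire into storage-type inference so that the oneDNN path is chosen on CPU. A rough sketch, assuming a hypothetical operator _foo and wrapper FooStorageType (neither exists in this commit):

    static bool FooStorageType(const nnvm::NodeAttrs& attrs,
                               const int dev_mask,
                               DispatchMode* dispatch_mode,
                               std::vector<int>* in_attrs,
                               std::vector<int>* out_attrs) {
      // support_dnnl = true requests DispatchMode::kFComputeEx on CPU;
      // DNNLStorageType itself falls back when DNNLEnvSet() reports oneDNN is disabled.
      return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
    }

    NNVM_REGISTER_OP(_foo).set_attr<FInferStorageType>("FInferStorageType", FooStorageType);
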
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h b/src/operator/nn/dnnl/dnnl_batch_dot-inl.h
similarity index 64%
rename from src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h
rename to src/operator/nn/dnnl/dnnl_batch_dot-inl.h
index 2459ea1..2c07a32 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h
+++ b/src/operator/nn/dnnl/dnnl_batch_dot-inl.h
@@ -18,12 +18,12 @@
  */
 
 /*!
- * \file mkldnn_batch_dot-inl.h
+ * \file dnnl_batch_dot-inl.h
  * \author: Bartosz Kuncer, bartosz.kuncer@intel.com
  */
 
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 
@@ -31,28 +31,27 @@
 #include <utility>
 #include <vector>
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
-
 #include "../../tensor/dot-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-using batch_dot_fwd_t    = mkldnn::matmul;
-using batch_dot_fwd_pd_t = mkldnn::matmul::primitive_desc;
+using batch_dot_fwd_t    = dnnl::matmul;
+using batch_dot_fwd_pd_t = dnnl::matmul::primitive_desc;
 
 typedef ParamOpSign<DotParam> BatchDotSignature;
 
-class MKLDNNBatchDotFwd {
+class DNNLBatchDotFwd {
  public:
-  static MKLDNNBatchDotFwd& GetCached(const DotParam& param,
-                                      const std::vector<NDArray>& inputs,
-                                      const std::vector<NDArray>& outputs);
+  static DNNLBatchDotFwd& GetCached(const DotParam& param,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<NDArray>& outputs);
 
-  MKLDNNBatchDotFwd(const DotParam& param,
-                    const std::vector<NDArray>& inputs,
-                    const std::vector<NDArray>& outputs);
+  DNNLBatchDotFwd(const DotParam& param,
+                  const std::vector<NDArray>& inputs,
+                  const std::vector<NDArray>& outputs);
 
   void Execute(const std::vector<NDArray>& inputs,
                const std::vector<OpReqType>& req,
@@ -66,4 +65,4 @@ class MKLDNNBatchDotFwd {
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_DOT_INL_H__
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_DOT_INL_H_
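
batch_dot maps onto dnnl::matmul with 3-D (batch, M, K) x (batch, K, N) operands; transposed inputs are expressed through the memory descriptor instead of a physical transpose (see GetMemoryDesc in the new dnnl_batch_dot.cc below). An illustration with made-up shapes:

    // rhs NDArray stored as (batch=8, N=64, K=32) and used with transpose_b = true.
    // The matmul needs logical dims (8, K=32, N=64); format_tag::acb describes the
    // existing (8, 64, 32) buffer with swapped inner strides, so no copy is required.
    dnnl::memory::desc rhs_md({8, 32, 64},                     // logical (batch, K, N)
                              dnnl::memory::data_type::f32,
                              dnnl::memory::format_tag::acb);  // strides of the stored layout
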
diff --git a/src/operator/nn/dnnl/dnnl_batch_dot.cc b/src/operator/nn/dnnl/dnnl_batch_dot.cc
new file mode 100644
index 0000000..bb9f911
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_batch_dot.cc
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_batch_dot.cc
+ * \author: Bartosz Kuncer, bartosz.kuncer@intel.com
+ */
+
+#if MXNET_USE_ONEDNN == 1
+
+#include "./dnnl_batch_dot-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool SupportDNNLBatchDot(const std::vector<NDArray>& inputs, const NDArray& output) {
+  return inputs[0].shape().Size() != 0 && inputs[1].shape().Size() != 0 &&
+         output.shape().Size() != 0 &&
+         (inputs[0].dtype() == mshadow::kFloat32 || inputs[0].dtype() == mshadow::kBfloat16);
+}
+
+void DNNLBatchDotForward(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<NDArray>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<NDArray>& outputs) {
+  const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
+  DNNLBatchDotFwd& fwd  = DNNLBatchDotFwd::GetCached(param, inputs, outputs);
+  fwd.Execute(inputs, req, outputs);
+}
+
+DNNLBatchDotFwd& DNNLBatchDotFwd::GetCached(const DotParam& param,
+                                            const std::vector<NDArray>& inputs,
+                                            const std::vector<NDArray>& outputs) {
+  using batch_dot_fwd_map = std::unordered_map<BatchDotSignature, DNNLBatchDotFwd, OpHash>;
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local batch_dot_fwd_map fwds;
+#else
+  static MX_THREAD_LOCAL batch_dot_fwd_map fwds;
+#endif
+
+  BatchDotSignature key(param);
+  key.AddSign(inputs[0]);
+  key.AddSign(inputs[1]);
+  key.AddSign(outputs[0]);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    const DNNLBatchDotFwd fwd(param, inputs, outputs);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+DNNLBatchDotFwd::DNNLBatchDotFwd(const DotParam& param,
+                                 const std::vector<NDArray>& inputs,
+                                 const std::vector<NDArray>& outputs) {
+  auto shape  = inputs[0].shape();
+  auto ndim   = shape.ndim();
+  auto bigDim = shape[0];
+  for (size_t i = 1; i < ndim - 2; ++i) {
+    bigDim *= shape[i];
+  }
+
+  auto GetMemoryDesc = [&ndim, &bigDim](const NDArray& tensor, const bool transpose) {
+    auto shape = tensor.shape();
+    if (transpose) {
+      return dnnl::memory::desc(dnnl::memory::dims{bigDim, shape[ndim - 1], shape[ndim - 2]},
+                                get_dnnl_type(tensor.dtype()),
+                                dnnl::memory::format_tag::acb);
+    } else {
+      return dnnl::memory::desc(dnnl::memory::dims{bigDim, shape[ndim - 2], shape[ndim - 1]},
+                                get_dnnl_type(tensor.dtype()),
+                                dnnl::memory::format_tag::any);
+    }
+  };
+
+  dnnl::memory::desc data_md    = GetMemoryDesc(inputs[0], param.transpose_a);
+  dnnl::memory::desc weights_md = GetMemoryDesc(inputs[1], param.transpose_b);
+  dnnl::memory::desc out_md({bigDim, data_md.dims()[1], weights_md.dims()[2]},
+                            get_dnnl_type(outputs[0].dtype()),
+                            dnnl::memory::format_tag::any);
+  dnnl::matmul::desc fwd_desc(data_md, weights_md, out_md);
+  fwd_pd = std::make_shared<batch_dot_fwd_pd_t>(fwd_desc, mxnet::CpuEngine::Get()->get_engine());
+  fwd    = std::make_shared<batch_dot_fwd_t>(*fwd_pd);
+}
+
+void DNNLBatchDotFwd::Execute(const std::vector<NDArray>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& outputs) {
+  auto engine = mxnet::CpuEngine::Get()->get_engine();
+  auto data =
+      dnnl::memory(fwd_pd->src_desc(), engine, reinterpret_cast<void*>(inputs[0].data().dptr_));
+  auto weights =
+      dnnl::memory(fwd_pd->weights_desc(), engine, reinterpret_cast<void*>(inputs[1].data().dptr_));
+  dnnl_output_t out_mem = CreateDNNLMem(outputs[0], fwd_pd->dst_desc(), req[0], &inputs[0]);
+
+  dnnl_args_map_t args = {
+      {DNNL_ARG_SRC, data},
+      {DNNL_ARG_WEIGHTS, weights},
+      {DNNL_ARG_DST, *out_mem.second},
+  };
+
+  DNNLStream::Get()->RegisterPrimArgs(*fwd, args);
+  CommitOutput(outputs[0], out_mem);
+  DNNLStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_ONEDNN == 1
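
The new file does not register the operator itself; the usual pattern is a thin FComputeEx that guards on SupportDNNLBatchDot and otherwise falls back to the reference kernel. A rough sketch (the reference kernel name BatchDotForward_<cpu> and the registration site are assumptions, not part of this commit):

    #if MXNET_USE_ONEDNN == 1
    static void BatchDotComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const OpContext& ctx,
                                     const std::vector<NDArray>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
      if (SupportDNNLBatchDot(inputs, outputs[0])) {
        DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
        DNNLRun(DNNLBatchDotForward, attrs, ctx, inputs, req, outputs);
        DNNL_OPCHECK_RUN(BatchDotForward_<cpu>, attrs, ctx, inputs, req, outputs);
        return;
      }
      FallBackCompute(BatchDotForward_<cpu>, attrs, ctx, inputs, req, outputs);
    }
    #endif
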
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/dnnl/dnnl_batch_norm-inl.h
similarity index 59%
rename from src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
rename to src/operator/nn/dnnl/dnnl_batch_norm-inl.h
index 2a4b2bf..f7dc97b 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/dnnl/dnnl_batch_norm-inl.h
@@ -18,98 +18,96 @@
  */
 
 /*!
- * \file mkldnn_batch_norm.cc
+ * \file dnnl_batch_norm-inl.h
  * \brief
  * \author Tao Lv
  */
 
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
-#include <mkldnn.hpp>
-
+#include <dnnl.hpp>
 #include <utility>
 #include <vector>
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
-
 #include "../batch_norm-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
-typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
-typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
-typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
+typedef dnnl::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
+typedef dnnl::batch_normalization_forward::desc t_bn_f_desc;
+typedef dnnl::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
+typedef dnnl::batch_normalization_backward::desc t_bn_b_desc;
 
-inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray>& in_data,
-                                                    const std::vector<NDArray>& aux_states,
-                                                    bool is_train_and_not_global_stats,
-                                                    bool fuse_relu) {
-  mkldnn::normalization_flags flags = static_cast<mkldnn::normalization_flags>(0U);
+inline static dnnl::normalization_flags _GetFlags(const std::vector<NDArray>& in_data,
+                                                  const std::vector<NDArray>& aux_states,
+                                                  bool is_train_and_not_global_stats,
+                                                  bool fuse_relu) {
+  dnnl::normalization_flags flags = static_cast<dnnl::normalization_flags>(0U);
   if (in_data.size() == 3U) {
-    flags |= mkldnn::normalization_flags::use_scale_shift;
+    flags |= dnnl::normalization_flags::use_scale_shift;
   }
 
   // aux_states[0]: inMean
   // aux_states[1]: inVariance
   if (aux_states.size() == 2U && !is_train_and_not_global_stats) {
-    flags |= mkldnn::normalization_flags::use_global_stats;
+    flags |= dnnl::normalization_flags::use_global_stats;
   }
 
   if (fuse_relu) {
-    flags |= mkldnn::normalization_flags::fuse_norm_relu;
+    flags |= dnnl::normalization_flags::fuse_norm_relu;
   }
   return flags;
 }
 
-inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory& data_mem,
+inline static t_bn_f_pdesc _GetFwd(const dnnl::memory& data_mem,
                                    bool is_train,
                                    float eps,
-                                   mkldnn::normalization_flags flags) {
+                                   dnnl::normalization_flags flags) {
   auto data_md = data_mem.get_desc();
   auto engine  = CpuEngine::Get()->get_engine();
 
   if (is_train) {
-    t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_training, data_md, eps, flags);
+    t_bn_f_desc bnFwd_desc(dnnl::prop_kind::forward_training, data_md, eps, flags);
     return t_bn_f_pdesc(bnFwd_desc, engine);
   } else {
-    t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_inference, data_md, eps, flags);
+    t_bn_f_desc bnFwd_desc(dnnl::prop_kind::forward_inference, data_md, eps, flags);
     return t_bn_f_pdesc(bnFwd_desc, engine);
   }
 }
 
-inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory& data_mem,
-                                   const mkldnn::memory& diff_mem,
+inline static t_bn_b_pdesc _GetBwd(const dnnl::memory& data_mem,
+                                   const dnnl::memory& diff_mem,
                                    float eps,
-                                   mkldnn::normalization_flags flags) {
+                                   dnnl::normalization_flags flags) {
   auto data_md = data_mem.get_desc();
   auto diff_md = diff_mem.get_desc();
   auto engine  = CpuEngine::Get()->get_engine();
 
-  t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+  t_bn_b_desc bnBwd_desc(dnnl::prop_kind::backward, diff_md, data_md, eps, flags);
   return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
 }
 
-typedef ParamOpSign<BatchNormParam> MKLDNNBNSignature;
+typedef ParamOpSign<BatchNormParam> DNNLBNSignature;
 
-class MKLDNNBNForward {
-  std::shared_ptr<const mkldnn::memory> weight_m;
-  std::shared_ptr<mkldnn::batch_normalization_forward> fwd;
+class DNNLBNForward {
+  std::shared_ptr<const dnnl::memory> weight_m;
+  std::shared_ptr<dnnl::batch_normalization_forward> fwd;
   bool is_train_and_not_global_stats;
   t_bn_f_pdesc pd;
 
  public:
-  MKLDNNBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) {
-    weight_m.reset(new mkldnn::memory(pd.weights_desc(), CpuEngine::Get()->get_engine()));
-    fwd.reset(new mkldnn::batch_normalization_forward(pd));
+  DNNLBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) {
+    weight_m.reset(new dnnl::memory(pd.weights_desc(), CpuEngine::Get()->get_engine()));
+    fwd.reset(new dnnl::batch_normalization_forward(pd));
     this->is_train_and_not_global_stats = is_train_and_not_global_stats;
   }
 
-  const mkldnn::memory& GetWeight() const {
+  const dnnl::memory& GetWeight() const {
     return *weight_m;
   }
 
@@ -117,22 +115,22 @@ class MKLDNNBNForward {
     return pd;
   }
 
-  const mkldnn::batch_normalization_forward& GetFwd() const {
+  const dnnl::batch_normalization_forward& GetFwd() const {
     return *fwd;
   }
 };
 
 template <typename DType>
-static MKLDNNBNForward& GetBNForward(const BatchNormParam& param,
-                                     const OpContext& ctx,
-                                     const mkldnn::memory* data_mem,
-                                     mkldnn::normalization_flags flags) {
+static DNNLBNForward& GetBNForward(const BatchNormParam& param,
+                                   const OpContext& ctx,
+                                   const dnnl::memory* data_mem,
+                                   dnnl::normalization_flags flags) {
 #if DMLC_CXX11_THREAD_LOCAL
-  static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
+  static thread_local std::unordered_map<DNNLBNSignature, DNNLBNForward, OpHash> fwds;
 #else
-  static MX_THREAD_LOCAL std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
+  static MX_THREAD_LOCAL std::unordered_map<DNNLBNSignature, DNNLBNForward, OpHash> fwds;
 #endif
-  MKLDNNBNSignature key(param);
+  DNNLBNSignature key(param);
   key.AddSign(ctx.is_train);
   key.AddSign(*data_mem);
   key.AddSign(static_cast<int>(flags));
@@ -140,19 +138,19 @@ static MKLDNNBNForward& GetBNForward(const BatchNormParam& param,
   auto it = fwds.find(key);
   if (it == fwds.end()) {
     auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, param.eps, flags);
-    MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
+    DNNLBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
     it = AddToCache(&fwds, key, fwd);
   }
   return it->second;
 }
 
 template <typename DType>
-void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
-                            const OpContext& ctx,
-                            const std::vector<NDArray>& inputs,
-                            const std::vector<OpReqType>& req,
-                            const std::vector<NDArray>& outputs,
-                            bool fuse_relu) {
+void DNNLBatchNormForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs,
+                          bool fuse_relu) {
   const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
   std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
 
@@ -173,27 +171,27 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
 
   const std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
   TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
-  mkldnn::normalization_flags flags =
+  dnnl::normalization_flags flags =
       _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
   NDArray& data = in_data[batchnorm::kData];
-  if (data.IsMKLDNNData() && data.IsView())
+  if (data.IsDNNLData() && data.IsView())
     data = data.Reorder2Default();
-  auto data_mem = data.GetMKLDNNData();
+  auto data_mem = data.GetDNNLData();
   auto& fwd     = GetBNForward<DType>(param, ctx, data_mem, flags);
 
   // for output memory
-  auto out_mem = const_cast<NDArray&>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
+  auto out_mem = const_cast<NDArray&>(out).CreateDNNLData(fwd.GetPd().dst_desc());
 
   // mxnet will always use scale shift.
   // But if fix_gamma is true, then all scale elements will be set to 1.0f
-  if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
+  if (static_cast<int>(flags) & static_cast<int>(dnnl::normalization_flags::use_scale_shift)) {
     const NDArray& gamma = in_data[batchnorm::kGamma];
     const NDArray& beta  = in_data[batchnorm::kBeta];
     CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
     CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
 
-    const mkldnn::memory& weight_mem = fwd.GetWeight();
-    float* weight_buf                = reinterpret_cast<float*>(weight_mem.get_data_handle());
+    const dnnl::memory& weight_mem = fwd.GetWeight();
+    float* weight_buf              = reinterpret_cast<float*>(weight_mem.get_data_handle());
 
     index_t channels_ = data.shape()[1];
     CHECK(weight_mem.get_desc().get_size() == channels_ * sizeof(float) * 2);
@@ -216,20 +214,20 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
       }
     }
 
-    mkldnn_args_map_t net_args;
-    net_args[MKLDNN_ARG_SRC]         = *data_mem;
-    net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem;
-    net_args[MKLDNN_ARG_DST]         = *out_mem;
+    dnnl_args_map_t net_args;
+    net_args[DNNL_ARG_SRC]         = *data_mem;
+    net_args[DNNL_ARG_SCALE_SHIFT] = weight_mem;
+    net_args[DNNL_ARG_DST]         = *out_mem;
     if (fuse_relu) {
       const NDArray* workspace = nullptr;
       workspace                = &outputs[3];
       auto engine              = CpuEngine::Get()->get_engine();
       if (workspace == nullptr) {
-        LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
+        LOG(FATAL) << "DNNL BatchNorm: incorrect workspace input";
       }
-      auto ws = std::make_shared<mkldnn::memory>(
-          fwd.GetPd().workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle());
-      net_args[MKLDNN_ARG_WORKSPACE] = *ws;
+      auto ws = std::make_shared<dnnl::memory>(
+          fwd.GetPd().workspace_desc(), engine, workspace->GetDNNLData()->get_data_handle());
+      net_args[DNNL_ARG_WORKSPACE] = *ws;
     }
     if (!ctx.is_train || param.use_global_stats) {
       float* omean  = outputs[batchnorm::kMean].data().dptr<float>();
@@ -241,17 +239,17 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
         omean[i] = inmean[i];
         ovar[i]  = VARIANCE_TO_INVSTD(invar[i], param.eps);
       }
-      net_args[MKLDNN_ARG_MEAN]     = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
-      net_args[MKLDNN_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetMKLDNNData());
-      MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
-      MKLDNNStream::Get()->Submit();
+      net_args[DNNL_ARG_MEAN]     = *(aux_states[batchnorm::kMovingMean].GetDNNLData());
+      net_args[DNNL_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetDNNLData());
+      DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
+      DNNLStream::Get()->Submit();
     } else {  // training
-      const NDArray& outMean        = outputs[batchnorm::kMean];
-      const NDArray& outVar         = outputs[batchnorm::kVar];
-      net_args[MKLDNN_ARG_MEAN]     = *(outMean.GetMKLDNNData());
-      net_args[MKLDNN_ARG_VARIANCE] = *(outVar.GetMKLDNNData());
-      MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
-      MKLDNNStream::Get()->Submit();
+      const NDArray& outMean      = outputs[batchnorm::kMean];
+      const NDArray& outVar       = outputs[batchnorm::kVar];
+      net_args[DNNL_ARG_MEAN]     = *(outMean.GetDNNLData());
+      net_args[DNNL_ARG_VARIANCE] = *(outVar.GetDNNLData());
+      DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
+      DNNLStream::Get()->Submit();
 
       float* ovar = outVar.data().dptr<float>();
       for (index_t i = 0; i < channels_; i++) {
@@ -259,52 +257,52 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
       }
     }
   } else {  // no input gamma and beta
-    LOG(FATAL) << "MKLDNN batch normalization: should not reach here ...";
+    LOG(FATAL) << "DNNL batch normalization: should not reach here ...";
   }
 }
 
-class MKLDNNBNBackward {
-  std::shared_ptr<mkldnn::batch_normalization_backward> bwd;
-  const std::shared_ptr<mkldnn::memory> weight_m;
-  const std::shared_ptr<mkldnn::memory> gradw_m;
+class DNNLBNBackward {
+  std::shared_ptr<dnnl::batch_normalization_backward> bwd;
+  const std::shared_ptr<dnnl::memory> weight_m;
+  const std::shared_ptr<dnnl::memory> gradw_m;
 
  public:
   const t_bn_b_pdesc pd;
 
-  explicit MKLDNNBNBackward(const t_bn_b_pdesc& _pd)
-      : weight_m(new mkldnn::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())),
-        gradw_m(new mkldnn::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())),
+  explicit DNNLBNBackward(const t_bn_b_pdesc& _pd)
+      : weight_m(new dnnl::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())),
+        gradw_m(new dnnl::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())),
         pd(_pd) {
-    bwd.reset(new mkldnn::batch_normalization_backward(pd));
+    bwd.reset(new dnnl::batch_normalization_backward(pd));
   }
 
-  const mkldnn::memory& GetWeight() const {
+  const dnnl::memory& GetWeight() const {
     return *weight_m;
   }
 
-  const mkldnn::memory& GetGradw() const {
+  const dnnl::memory& GetGradw() const {
     return *gradw_m;
   }
 
-  const mkldnn::batch_normalization_backward& GetBwd() const {
+  const dnnl::batch_normalization_backward& GetBwd() const {
     return *bwd;
   }
 };
 
 template <typename DType>
-static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param,
-                                       const OpContext& ctx,
-                                       const NDArray& in_data,
-                                       const mkldnn::memory& in_mem,
-                                       const NDArray& diff_data,
-                                       const mkldnn::memory& diff_mem,
-                                       mkldnn::normalization_flags flags) {
+static DNNLBNBackward& GetBNBackward(const BatchNormParam& param,
+                                     const OpContext& ctx,
+                                     const NDArray& in_data,
+                                     const dnnl::memory& in_mem,
+                                     const NDArray& diff_data,
+                                     const dnnl::memory& diff_mem,
+                                     dnnl::normalization_flags flags) {
 #if DMLC_CXX11_THREAD_LOCAL
-  static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNBackward, OpHash> bwds;
+  static thread_local std::unordered_map<DNNLBNSignature, DNNLBNBackward, OpHash> bwds;
 #else
-  static MX_THREAD_LOCAL std::unordered_map<MKLDNNBNSignature, MKLDNNBNBackward, OpHash> bwds;
+  static MX_THREAD_LOCAL std::unordered_map<DNNLBNSignature, DNNLBNBackward, OpHash> bwds;
 #endif
-  MKLDNNBNSignature key(param);
+  DNNLBNSignature key(param);
   key.AddSign(in_data);
   key.AddSign(diff_data);
   key.AddSign(static_cast<int>(flags));
@@ -312,19 +310,19 @@ static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param,
   auto it = bwds.find(key);
   if (it == bwds.end()) {
     auto bwd_pd = _GetBwd(in_mem, diff_mem, param.eps, flags);
-    MKLDNNBNBackward bwd(bwd_pd);
+    DNNLBNBackward bwd(bwd_pd);
     it = AddToCache(&bwds, key, bwd);
   }
   return it->second;
 }
 
 template <typename DType>
-void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
-                             const OpContext& ctx,
-                             const std::vector<NDArray>& inputs,
-                             const std::vector<OpReqType>& req,
-                             const std::vector<NDArray>& outputs,
-                             bool fuse_relu) {
+void DNNLBatchNormBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs,
+                           bool fuse_relu) {
   if (fuse_relu) {
     CHECK_EQ(inputs.size(), 9U);
   } else {
@@ -345,7 +343,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
   aux_states[batchnorm::kMovingVar]   = inputs[7];
   const std::vector<NDArray>& in_grad = outputs;
   TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
-  mkldnn::normalization_flags flags =
+  dnnl::normalization_flags flags =
       _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
 
   NDArray data               = in_data[batchnorm::kData];
@@ -376,19 +374,19 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
     gradIn = gradIn.Reshape(new_shape);
   }
 
-  auto data_mem = data.GetMKLDNNData();
-  auto diff_mem = diff.GetMKLDNNData();
-  // MKLDNN batchnorm should run on special layouts. If one of them isn't, we
+  auto data_mem = data.GetDNNLData();
+  auto diff_mem = diff.GetDNNLData();
+  // DNNL batchnorm should run on special layouts. If one of them isn't, we
   // should reorder them.
   if (data.IsDefaultData())
-    data_mem = data.GetMKLDNNDataReorder(diff_mem->get_desc());
+    data_mem = data.GetDNNLDataReorder(diff_mem->get_desc());
   else if (diff.IsDefaultData())
-    diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc());
+    diff_mem = diff.GetDNNLDataReorder(data_mem->get_desc());
   auto& bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
   auto gradi_mem =
-      CreateMKLDNNMem(const_cast<NDArray&>(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]);
+      CreateDNNLMem(const_cast<NDArray&>(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]);
 
-  if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
+  if (static_cast<int>(flags) & static_cast<int>(dnnl::normalization_flags::use_scale_shift)) {
     const NDArray& gamma   = in_data[batchnorm::kGamma];
     const NDArray& beta    = in_data[batchnorm::kBeta];
     DType* weight_buf      = reinterpret_cast<DType*>(bwd.GetWeight().get_data_handle());
@@ -405,18 +403,18 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
       }
       memcpy(&weight_buf[channels_], bias_ptr, copy_size);
     }
-    mkldnn_args_map_t net_args;
-    net_args[MKLDNN_ARG_SRC]              = *data_mem;
-    net_args[MKLDNN_ARG_DIFF_SRC]         = *gradi_mem.second;
-    net_args[MKLDNN_ARG_SCALE_SHIFT]      = bwd.GetWeight();
-    net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
-    net_args[MKLDNN_ARG_DIFF_DST]         = *diff_mem;
+    dnnl_args_map_t net_args;
+    net_args[DNNL_ARG_SRC]              = *data_mem;
+    net_args[DNNL_ARG_DIFF_SRC]         = *gradi_mem.second;
+    net_args[DNNL_ARG_SCALE_SHIFT]      = bwd.GetWeight();
+    net_args[DNNL_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
+    net_args[DNNL_ARG_DIFF_DST]         = *diff_mem;
 
     if (fuse_relu) {
       const NDArray* workspace = nullptr;
       workspace                = &inputs[8];
       if (workspace != nullptr) {
-        net_args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData());
+        net_args[DNNL_ARG_WORKSPACE] = *(workspace->GetDNNLData());
       }
     }
 
@@ -426,7 +424,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
       DType* moving_var_ptr  = moving_var.data().dptr<DType>();
       DType* out_mean_ptr    = out_mean.data().dptr<DType>();
       DType* out_var_ptr     = out_var.data().dptr<DType>();
-      mkldnn::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine());
+      dnnl::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine());
       DType* tmp_var_ptr = reinterpret_cast<DType*>(var_mem.get_data_handle());
 
       DType minus_mom = (1.0f - param.momentum);
@@ -436,15 +434,15 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
         tmp_var_ptr[i]     = variance;
         moving_var_ptr[i]  = moving_var_ptr[i] * param.momentum + variance * minus_mom;
       }
-      net_args[MKLDNN_ARG_MEAN]     = *(out_mean.GetMKLDNNData());
-      net_args[MKLDNN_ARG_VARIANCE] = var_mem;
+      net_args[DNNL_ARG_MEAN]     = *(out_mean.GetDNNLData());
+      net_args[DNNL_ARG_VARIANCE] = var_mem;
     } else {
-      net_args[MKLDNN_ARG_MEAN]     = *(moving_mean.GetMKLDNNData());
-      net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData());
+      net_args[DNNL_ARG_MEAN]     = *(moving_mean.GetDNNLData());
+      net_args[DNNL_ARG_VARIANCE] = *(moving_var.GetDNNLData());
     }
-    MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
+    DNNLStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
     CommitOutput(gradIn, gradi_mem);
-    MKLDNNStream::Get()->Submit();
+    DNNLStream::Get()->Submit();
 
     // copy data from gradw_mem to in_grad[1] and in_grad[2]
     DType* gw_buf   = reinterpret_cast<DType*>(bwd.GetGradw().get_data_handle());
@@ -480,10 +478,10 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
       }
     }
   } else {
-    LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ...";
+    LOG(FATAL) << "DNNL batch normalization backward: should not reach here ...";
   }
 }
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_ONEDNN
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_BATCH_NORM_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/dnnl/dnnl_concat-inl.h
similarity index 58%
rename from src/operator/nn/mkldnn/mkldnn_concat-inl.h
rename to src/operator/nn/dnnl/dnnl_concat-inl.h
index a78b5a6..4646137 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/dnnl/dnnl_concat-inl.h
@@ -18,46 +18,45 @@
  */
 
 /*!
- * \file mkldnn_concat-inl.h
+ * \file dnnl_concat-inl.h
  * \brief
  * \author
  */
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 #include <utility>
 #include <vector>
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
-
 #include "../concat-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-class MKLDNNConcatFwd {
+class DNNLConcatFwd {
  public:
-  mkldnn::concat::primitive_desc fwd_pd;
+  dnnl::concat::primitive_desc fwd_pd;
 
-  MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md);
+  DNNLConcatFwd(int concat_dim, const std::vector<dnnl::memory::desc>& data_md);
 
-  const mkldnn::concat& GetFwd() const {
+  const dnnl::concat& GetFwd() const {
     return *fwd_;
   }
 
  private:
-  std::shared_ptr<mkldnn::concat> fwd_;
+  std::shared_ptr<dnnl::concat> fwd_;
 };
 
-static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
-                                         const std::vector<NDArray>& in_data,
-                                         const std::vector<mkldnn::memory::desc>& data_md) {
+static DNNLConcatFwd& GetConcatForward(int concat_dim,
+                                       const std::vector<NDArray>& in_data,
+                                       const std::vector<dnnl::memory::desc>& data_md) {
 #if DMLC_CXX11_THREAD_LOCAL
-  static thread_local std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
+  static thread_local std::unordered_map<OpSignature, DNNLConcatFwd, OpHash> fwds;
 #else
-  static MX_THREAD_LOCAL std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
+  static MX_THREAD_LOCAL std::unordered_map<OpSignature, DNNLConcatFwd, OpHash> fwds;
 #endif
   OpSignature key;
   key.AddSign(concat_dim);
@@ -65,7 +64,7 @@ static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
 
   auto it = fwds.find(key);
   if (it == fwds.end()) {
-    MKLDNNConcatFwd fwd(concat_dim, data_md);
+    DNNLConcatFwd fwd(concat_dim, data_md);
     it = AddToCache(&fwds, key, fwd);
   }
   return it->second;
@@ -75,4 +74,4 @@ static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
 }  // namespace mxnet
 
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_CONCAT_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_concat.cc b/src/operator/nn/dnnl/dnnl_concat.cc
new file mode 100644
index 0000000..1214a31
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_concat.cc
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_concat.cc
+ * \brief
+ * \author
+ */
+
+#if MXNET_USE_ONEDNN == 1
+#include "dnnl_concat-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline bool IsUsingPadding(const dnnl::memory::desc& dst_md) {
+  // make sure a blocked format is used (at least one dimension is blocked)
+  bool is_blocked_format =
+      dst_md.data.format_kind == dnnl_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0;
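+  // padding is in use when the logical dims differ from the padded dims stored in the descriptor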
+  return is_blocked_format &&
+         !std::equal(
+             dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims);
+}
+
+DNNLConcatFwd::DNNLConcatFwd(int concat_dim, const std::vector<dnnl::memory::desc>& data_md)
+    : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
+  // DNNL introduced padded formats in version 0.15, which require more memory
+  // compared to the actual size of the tensor. Currently, DNNL operators
+  // still reuse memory from memory planning, so here we need to select a
+  // format that has the expected memory size requirements (a plain format)
+
+  // When fwd_pd uses padding, impose a plain format
+  const auto& dst_md = fwd_pd.dst_desc();
+  if (IsUsingPadding(dst_md)) {
+    auto plain_dst_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(dst_md.data.ndims));
+    auto plain_dst_md  = dnnl::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
+    fwd_pd             = dnnl::concat::primitive_desc(
+        plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine());
+  }
+  fwd_ = std::make_shared<dnnl::concat>(fwd_pd);
+}
+
+void DNNLConcatForward(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const std::vector<NDArray>& in_data,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<NDArray>& out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  const int num_in_data    = param.num_args;
+  const int concat_dim     = param.dim;
+  std::vector<dnnl::memory::desc> data_md;
+  std::vector<const dnnl::memory*> data_mem;
+  data_md.reserve(num_in_data);
+  data_mem.reserve(num_in_data);
+  for (int i = 0; i < num_in_data; i++) {
+    const dnnl::memory* tmp_mem = in_data[i].GetDNNLData();
+    dnnl::memory::desc tmp_md   = tmp_mem->get_desc();
+    data_md.push_back(tmp_md);
+    data_mem.push_back(tmp_mem);
+  }
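+  // Look up (or create and cache) a concat primitive matching this concat_dim
+  // and these input memory descriptors.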
+  DNNLConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md);
+  mxnet::dnnl_output_t out_mem =
+      CreateDNNLMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]);
+  std::unordered_map<int, dnnl::memory> net_args;
+  net_args.insert({DNNL_ARG_DST, *out_mem.second});
+  for (int i = 0; i < num_in_data; i++) {
+    net_args.insert({DNNL_ARG_MULTIPLE_SRC + i, *data_mem[i]});
+  }
+  DNNLStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
+  CommitOutput(out_data[concat_enum::kOut], out_mem);
+  DNNLStream::Get()->Submit();
+}
+
+void DNNLConcatBackward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<NDArray>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  const int num_in_data    = param.num_args;
+  const int axis           = param.dim;
+  const auto gradz_mem     = inputs[0].GetDNNLData();
+  /* init the offset */
+  dnnl::memory::dims offsets(outputs[0].shape().ndim());
+  for (auto& v : offsets) {
+    v = 0;
+  }
+
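+  // The gradient of concat is a slice: for every input, take the sub-memory of the output
+  // gradient at the current offset along the concat axis and reorder it into that input's gradient.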
+  for (int i = 0; i < num_in_data; i++) {
+    dnnl::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end());
+    auto diff_src_md = outputs[i].GetDNNLData()->get_desc();
+    auto gradi_mem   = CreateDNNLMem(outputs[i], diff_src_md, req[i]);
+
+    auto from_md = gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets);
+    auto from_mem =
+        new dnnl::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle());
+    offsets[axis] += diff_src_tz[axis];
+
+    std::unordered_map<int, dnnl::memory> net_args(
+        {{DNNL_ARG_FROM, *gradz_mem}, {DNNL_ARG_TO, *gradi_mem.second}});
+    DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(*from_mem, *gradi_mem.second), net_args);
+    CommitOutput(outputs[i], gradi_mem);
+  }
+
+  DNNLStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_ONEDNN == 1
diff --git a/src/operator/nn/dnnl/dnnl_convolution-inl.h b/src/operator/nn/dnnl/dnnl_convolution-inl.h
new file mode 100644
index 0000000..529b6c3
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_convolution-inl.h
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_convolution-inl.h
+ * \brief
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+
+#include <utility>
+#include <vector>
+
+#include "../convolution-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+struct DNNLConvParam : public dmlc::Parameter<DNNLConvParam> {
+  bool with_bn;
+  bool with_act;
+  bool with_sum;
+  bool with_postsum_act;
+  bool quantized;
+  bool dedup_sum;
+
+  dmlc::optional<float> min_calib_range;  // min float value calculated from calibration dataset
+  dmlc::optional<float> max_calib_range;  // max float value calculated from calibration dataset
+
+  DMLC_DECLARE_PARAMETER(DNNLConvParam) {
+    DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm.");
+    DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation");
+    DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum");
+    DMLC_DECLARE_FIELD(with_postsum_act)
+        .set_default(false)
+        .describe("Add post activation after sum");
+    DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization");
+    DMLC_DECLARE_FIELD(dedup_sum).set_default(false).describe("deduplicated sum input");
+    DMLC_DECLARE_FIELD(min_calib_range)
+        .set_default(dmlc::optional<float>())
+        .describe(
+            "The minimum scalar value in the form of float32 obtained "
+            "through calibration. If present, it will be used to by "
+            "quantized convolution op to calculate primitive scale");
+    DMLC_DECLARE_FIELD(max_calib_range)
+        .set_default(dmlc::optional<float>())
+        .describe(
+            "The maximum scalar value in the form of float32 obtained "
+            "through calibration. If present, it will be used to by "
+            "quantized convolution op to calculate primitive scale");
+  }
+};
+
+struct DNNLConvFullParam {
+  ConvolutionParam conv_param;
+  DNNLConvParam dnnl_param;
+  float sum_scale = 1.f;
+  std::vector<float> requantize_scales;
+  DNNLPostEltwiseParam act_param;
+  DNNLPostEltwiseParam postsum_act_param;
+};
+
+std::shared_ptr<dnnl::convolution_forward::primitive_desc> GetConvFwdImpl(
+    const ConvolutionParam& param,
+    const bool is_train,
+    const NDArray& data,
+    const NDArray& weight,
+    const NDArray* bias,
+    const NDArray& output);
+
+class DNNLConvForward {
+ public:
+  DNNLConvForward(const DNNLConvFullParam& param,
+                  const bool is_train,
+                  const NDArray& data,
+                  const NDArray& weight,
+                  const NDArray* bias,
+                  const NDArray& output);
+
+  const dnnl::convolution_forward& GetFwd() const {
+    return *fwd_;
+  }
+
+  const dnnl::convolution_forward::primitive_desc& GetPd() const {
+    return *pd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::convolution_forward> fwd_;
+  std::shared_ptr<dnnl::convolution_forward::primitive_desc> pd_;
+};
+
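+// Op signature used as the key of the thread-local convolution primitive caches.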
+typedef ParamOpSign<ConvolutionParam> DNNLConvSignature;
+
+DNNLConvForward& GetConvFwd(const DNNLConvFullParam& param,
+                            const bool is_train,
+                            const NDArray& data,
+                            const NDArray& weight,
+                            const NDArray* bias,
+                            const NDArray& output);
+
+void DNNLConvolutionForwardFullFeature(const DNNLConvFullParam& param,
+                                       const OpContext& ctx,
+                                       DNNLConvForward* fwd,
+                                       const std::vector<NDArray>& in_data,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<NDArray>& out_data);
+
+void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& out_data);
+
+class DNNLConvBackward {
+ public:
+  DNNLConvBackward(const DNNLConvFullParam& param,
+                   const NDArray& data,
+                   const NDArray& weight,
+                   const NDArray* bias,
+                   const NDArray& output);
+
+  const dnnl::convolution_backward_data& GetBwdData() const {
+    return *bwd_data_;
+  }
+
+  const dnnl::convolution_backward_weights& GetBwdWeights() const {
+    return *bwd_weight_;
+  }
+
+  const dnnl::convolution_backward_data::primitive_desc& GetDataPd() const {
+    return *bwd_data_pd_;
+  }
+
+  const dnnl::convolution_backward_weights::primitive_desc& GetWeightsPd() const {
+    return *bwd_weight_pd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::convolution_backward_data::primitive_desc> bwd_data_pd_;
+  std::shared_ptr<dnnl::convolution_backward_weights::primitive_desc> bwd_weight_pd_;
+  std::shared_ptr<dnnl::convolution_backward_data> bwd_data_;
+  std::shared_ptr<dnnl::convolution_backward_weights> bwd_weight_;
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_ONEDNN == 1
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_CONVOLUTION_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/dnnl/dnnl_convolution.cc
similarity index 54%
rename from src/operator/nn/mkldnn/mkldnn_convolution.cc
rename to src/operator/nn/dnnl/dnnl_convolution.cc
index ef2c57e..9754f7f 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/dnnl/dnnl_convolution.cc
@@ -18,50 +18,47 @@
  */
 
 /*!
- * \file mkldnn_convolution.cc
+ * \file dnnl_convolution.cc
  * \brief
  * \author Da Zheng
  */
 
 #if MXNET_USE_ONEDNN == 1
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
-
 #include "../convolution-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_convolution-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
+DMLC_REGISTER_PARAMETER(DNNLConvParam);
 
-bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input) {
+bool SupportDNNLConv(const ConvolutionParam& params, const NDArray& input) {
   if (params.kernel.ndim() > 3 || params.kernel.ndim() == 0)
     return false;
-  return IsMKLDNNType(input.dtype()) &&
-         input.shape().ndim() >= 3 && input.shape().ndim() <= 5;
+  return IsDNNLType(input.dtype()) && input.shape().ndim() >= 3 && input.shape().ndim() <= 5;
 }
 
-std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
-    const MKLDNNConvFullParam& param,
+std::shared_ptr<dnnl::convolution_forward::primitive_desc> GetConvFwdImpl(
+    const DNNLConvFullParam& param,
     const bool is_train,
     const NDArray& data,
     const NDArray& weights,
     const NDArray* bias,
     const NDArray& output) {
-  auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
+  auto prop      = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring;
   auto data_md   = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized);
+  auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.dnnl_param.quantized);
   auto out_md    = GetMemDesc(output);
   auto bias_md =
-      bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias))
-           : mkldnn::memory::desc{
-                 {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
+      bias ? (param.dnnl_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias))
+           : dnnl::memory::desc{{}, dnnl::memory::data_type::undef, dnnl::memory::format_tag::any};
   auto bias_md_ptr = bias ? &bias_md : nullptr;
 
-  mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
-  mkldnn::memory::dims padding(param.conv_param.kernel.ndim());
+  dnnl::memory::dims strides(param.conv_param.kernel.ndim());
+  dnnl::memory::dims padding(param.conv_param.kernel.ndim());
   if (param.conv_param.kernel.ndim() == 1) {
     CHECK_GE(param.conv_param.stride.ndim(), 1);
     CHECK_GE(param.conv_param.pad.ndim(), 1);
@@ -87,48 +84,48 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
     padding[1] = param.conv_param.pad[1];
     padding[2] = param.conv_param.pad[2];
   } else {
-    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.conv_param.kernel.ndim()
+    LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.conv_param.kernel.ndim()
                << ", supporting only 1 or 2 or 3.";
   }
-  mkldnn::primitive_attr attr;
-  mkldnn::post_ops ops;
-  if (param.mkldnn_param.with_act) {
+  dnnl::primitive_attr attr;
+  dnnl::post_ops ops;
+  if (param.dnnl_param.with_act) {
     const auto& act_param = param.act_param;
     ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
   }
-  if (param.mkldnn_param.with_sum) {
+  if (param.dnnl_param.with_sum) {
     ops.append_sum(param.sum_scale);
   }
-  if (param.mkldnn_param.with_postsum_act) {
+  if (param.dnnl_param.with_postsum_act) {
     const auto& act_param = param.postsum_act_param;
     ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
   }
   attr.set_post_ops(ops);
 
-  if (param.mkldnn_param.quantized && param.requantize_scales.size()) {
+  if (param.dnnl_param.quantized && param.requantize_scales.size()) {
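+    // mask == 0 applies a single common output scale, mask == 2 per-output-channel scales.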
     int mask = (param.requantize_scales.size() > 1) ? 2 : 0;
     attr.set_output_scales(mask, param.requantize_scales);
   }
   auto GetConvFwdPd =
-      [&param, &data, &weights, &output, &attr](const mkldnn::convolution_forward::desc& desc) {
+      [&param, &data, &weights, &output, &attr](const dnnl::convolution_forward::desc& desc) {
         auto engine = CpuEngine::Get()->get_engine();
         try {
-          // MKL-DNN introduced padded formats since 0.15 which require more memory
-          // compared to the actual size of the tensor. Currently, MKL-DNN operators
+          // DNNL introduced padded formats in version 0.15, which require more memory
+          // compared to the actual size of the tensor. Currently, DNNL operators
           // still reuse memory from memory planning, so here we need to select a
           // suboptimal kernel for computation that has the expected memory size requirements
           auto conv_pd =
-              std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
+              std::make_shared<dnnl::convolution_forward::primitive_desc>(desc, attr, engine);
           while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
                  conv_pd->src_desc().get_size() != GetArraySize(data) ||
-                 (!param.mkldnn_param.quantized &&
+                 (!param.dnnl_param.quantized &&
                   conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
             // next_impl() will visit desc and engine, please make sure they are still alive here.
             CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
           }
           return conv_pd;
-        } catch (mkldnn::error& e) {
-          if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
+        } catch (dnnl::error& e) {
+          if (e.status == dnnl_unimplemented && param.dnnl_param.quantized) {
             LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
                           "required for int8 convolution";
           } else {
@@ -139,28 +136,28 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
       };
 
   if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) {
-    mkldnn::convolution_forward::desc desc(prop,
-                                           mkldnn::algorithm::convolution_direct,
-                                           data_md,
-                                           weight_md,
-                                           out_md,
-                                           strides,
-                                           padding,
-                                           padding);
+    dnnl::convolution_forward::desc desc(prop,
+                                         dnnl::algorithm::convolution_direct,
+                                         data_md,
+                                         weight_md,
+                                         out_md,
+                                         strides,
+                                         padding,
+                                         padding);
     return GetConvFwdPd(desc);
   } else if (param.conv_param.dilate.ndim() == 0) {
-    mkldnn::convolution_forward::desc desc(prop,
-                                           mkldnn::algorithm::convolution_direct,
-                                           data_md,
-                                           weight_md,
-                                           *bias_md_ptr,
-                                           out_md,
-                                           strides,
-                                           padding,
-                                           padding);
+    dnnl::convolution_forward::desc desc(prop,
+                                         dnnl::algorithm::convolution_direct,
+                                         data_md,
+                                         weight_md,
+                                         *bias_md_ptr,
+                                         out_md,
+                                         strides,
+                                         padding,
+                                         padding);
     return GetConvFwdPd(desc);
   } else {
-    mkldnn::memory::dims dilates(param.conv_param.kernel.ndim());
+    dnnl::memory::dims dilates(param.conv_param.kernel.ndim());
     if (param.conv_param.dilate.ndim() == 1) {
       dilates[0] = param.conv_param.dilate[0] - 1;
     } else if (param.conv_param.dilate.ndim() == 2) {
@@ -171,48 +168,48 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
       dilates[1] = param.conv_param.dilate[1] - 1;
       dilates[2] = param.conv_param.dilate[2] - 1;
     } else {
-      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.conv_param.dilate.ndim()
+      LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.conv_param.dilate.ndim()
                  << ", supporting only 1 or 2 or 3.";
     }
     if (bias_md_ptr == nullptr) {
-      mkldnn::convolution_forward::desc desc(prop,
-                                             mkldnn::algorithm::convolution_direct,
-                                             data_md,
-                                             weight_md,
-                                             out_md,
-                                             strides,
-                                             dilates,
-                                             padding,
-                                             padding);
+      dnnl::convolution_forward::desc desc(prop,
+                                           dnnl::algorithm::convolution_direct,
+                                           data_md,
+                                           weight_md,
+                                           out_md,
+                                           strides,
+                                           dilates,
+                                           padding,
+                                           padding);
       return GetConvFwdPd(desc);
     } else {
-      mkldnn::convolution_forward::desc desc(prop,
-                                             mkldnn::algorithm::convolution_direct,
-                                             data_md,
-                                             weight_md,
-                                             *bias_md_ptr,
-                                             out_md,
-                                             strides,
-                                             dilates,
-                                             padding,
-                                             padding);
+      dnnl::convolution_forward::desc desc(prop,
+                                           dnnl::algorithm::convolution_direct,
+                                           data_md,
+                                           weight_md,
+                                           *bias_md_ptr,
+                                           out_md,
+                                           strides,
+                                           dilates,
+                                           padding,
+                                           padding);
       return GetConvFwdPd(desc);
     }
   }
 }
 
-static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetConvBwdData(
+static std::shared_ptr<dnnl::convolution_backward_data::primitive_desc> GetConvBwdData(
     const ConvolutionParam& param,
     const NDArray& data,
     const NDArray& weight,
     const NDArray& output,
-    const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+    const dnnl::convolution_forward::primitive_desc& fwd_pd) {
   auto data_md   = GetMemDesc(data);
   auto weight_md = GetWeightDesc(weight, param.num_group);
   auto out_md    = GetMemDesc(output);
   auto engine    = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides(param.kernel.ndim());
-  mkldnn::memory::dims padding(param.kernel.ndim());
+  dnnl::memory::dims strides(param.kernel.ndim());
+  dnnl::memory::dims padding(param.kernel.ndim());
   if (param.kernel.ndim() == 1) {
     CHECK_GE(param.stride.ndim(), 1);
     CHECK_GE(param.pad.ndim(), 1);
@@ -238,20 +235,20 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
     padding[1] = param.pad[1];
     padding[2] = param.pad[2];
   } else {
-    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+    LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.kernel.ndim()
                << ", supporting only 1 or 2 or 3.";
   }
 
   auto GetConvBwdDataPd = [&data, &weight, &output, &fwd_pd](
-                              const mkldnn::convolution_backward_data::desc& desc) {
+                              const dnnl::convolution_backward_data::desc& desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
-      // MKL-DNN introduced padded formats since 0.15 which require more memory
-      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // DNNL introduced padded formats in version 0.15, which require more memory
+      // compared to the actual size of the tensor. Currently, DNNL operators
       // still reuse memory from memory planning, so here we need to select a
       // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd =
-          std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
+          std::make_shared<dnnl::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
              conv_pd->diff_src_desc().get_size() != GetArraySize(data) ||
              conv_pd->weights_desc().get_size() != GetArraySize(weight)) {
@@ -259,23 +256,18 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
         CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
       }
       return conv_pd;
-    } catch (mkldnn::error& e) {
+    } catch (dnnl::error& e) {
       LOG(ERROR) << e.message;
       throw;
     }
   };
 
   if (param.dilate.ndim() == 0) {
-    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
-                                                 data_md,
-                                                 weight_md,
-                                                 out_md,
-                                                 strides,
-                                                 padding,
-                                                 padding);
+    dnnl::convolution_backward_data::desc desc(
+        dnnl::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding);
     return GetConvBwdDataPd(desc);
   } else {
-    mkldnn::memory::dims dilates(param.kernel.ndim());
+    dnnl::memory::dims dilates(param.kernel.ndim());
     if (param.dilate.ndim() == 1) {
       dilates[0] = param.dilate[0] - 1;
     } else if (param.dilate.ndim() == 2) {
@@ -286,34 +278,34 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
       dilates[1] = param.dilate[1] - 1;
       dilates[2] = param.dilate[2] - 1;
     } else {
-      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+      LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.dilate.ndim()
                  << ", supporting only 1 or 2 or 3.";
     }
-    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
-                                                 data_md,
-                                                 weight_md,
-                                                 out_md,
-                                                 strides,
-                                                 dilates,
-                                                 padding,
-                                                 padding);
+    dnnl::convolution_backward_data::desc desc(dnnl::algorithm::convolution_direct,
+                                               data_md,
+                                               weight_md,
+                                               out_md,
+                                               strides,
+                                               dilates,
+                                               padding,
+                                               padding);
     return GetConvBwdDataPd(desc);
   }
 }
 
-static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> GetConvBwdWeights(
+static std::shared_ptr<dnnl::convolution_backward_weights::primitive_desc> GetConvBwdWeights(
     const ConvolutionParam& param,
     const NDArray& data,
     const NDArray& weight,
     const NDArray* bias,
     const NDArray& output,
-    const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+    const dnnl::convolution_forward::primitive_desc& fwd_pd) {
   auto data_md   = GetMemDesc(data);
   auto weight_md = GetWeightDesc(weight, param.num_group);
   auto out_md    = GetMemDesc(output);
   auto engine    = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides(param.kernel.ndim());
-  mkldnn::memory::dims padding(param.kernel.ndim());
+  dnnl::memory::dims strides(param.kernel.ndim());
+  dnnl::memory::dims padding(param.kernel.ndim());
   if (param.kernel.ndim() == 1) {
     CHECK_GE(param.stride.ndim(), 1);
     CHECK_GE(param.pad.ndim(), 1);
@@ -339,19 +331,19 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
     padding[1] = param.pad[1];
     padding[2] = param.pad[2];
   } else {
-    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+    LOG(FATAL) << "Unexpected DNNL Conv kernel size " << param.kernel.ndim()
                << ", supporting only 1 or 2 or 3.";
   }
 
   auto GetConvBwdWeightsPd = [&data, &weight, &output, &fwd_pd](
-                                 const mkldnn::convolution_backward_weights::desc& desc) {
+                                 const dnnl::convolution_backward_weights::desc& desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
-      // MKL-DNN introduced padded formats since 0.15 which require more memory
-      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // DNNL introduced padded formats in version 0.15, which require more memory
+      // compared to the actual size of the tensor. Currently, DNNL operators
       // still reuse memory from memory planning, so here we need to select a
       // suboptimal kernel for computation that has the expected memory size requirements
-      auto conv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
+      auto conv_pd = std::make_shared<dnnl::convolution_backward_weights::primitive_desc>(
           desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
              conv_pd->src_desc().get_size() != GetArraySize(data) ||
@@ -360,34 +352,29 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
         CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
       }
       return conv_pd;
-    } catch (mkldnn::error& e) {
+    } catch (dnnl::error& e) {
       LOG(ERROR) << e.message;
       throw;
     }
   };
 
   if (param.dilate.ndim() == 0 && bias == nullptr) {
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-                                                    data_md,
-                                                    weight_md,
-                                                    out_md,
-                                                    strides,
-                                                    padding,
-                                                    padding);
+    dnnl::convolution_backward_weights::desc desc(
+        dnnl::algorithm::convolution_direct, data_md, weight_md, out_md, strides, padding, padding);
     return GetConvBwdWeightsPd(desc);
   } else if (param.dilate.ndim() == 0) {
     auto bias_md = GetMemDesc(*bias);
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-                                                    data_md,
-                                                    weight_md,
-                                                    bias_md,
-                                                    out_md,
-                                                    strides,
-                                                    padding,
-                                                    padding);
+    dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct,
+                                                  data_md,
+                                                  weight_md,
+                                                  bias_md,
+                                                  out_md,
+                                                  strides,
+                                                  padding,
+                                                  padding);
     return GetConvBwdWeightsPd(desc);
   } else {
-    mkldnn::memory::dims dilates(param.kernel.ndim());
+    dnnl::memory::dims dilates(param.kernel.ndim());
     if (param.dilate.ndim() == 1) {
       dilates[0] = param.dilate[0] - 1;
     } else if (param.dilate.ndim() == 2) {
@@ -398,52 +385,52 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
       dilates[1] = param.dilate[1] - 1;
       dilates[2] = param.dilate[2] - 1;
     } else {
-      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+      LOG(FATAL) << "Unexpected DNNL Conv dilate size " << param.dilate.ndim()
                  << ", supporting only 1 or 2 or 3.";
     }
     if (bias == nullptr) {
-      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-                                                      data_md,
-                                                      weight_md,
-                                                      out_md,
-                                                      strides,
-                                                      dilates,
-                                                      padding,
-                                                      padding);
+      dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct,
+                                                    data_md,
+                                                    weight_md,
+                                                    out_md,
+                                                    strides,
+                                                    dilates,
+                                                    padding,
+                                                    padding);
       return GetConvBwdWeightsPd(desc);
     } else {
       auto bias_md = GetMemDesc(*bias);
-      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-                                                      data_md,
-                                                      weight_md,
-                                                      bias_md,
-                                                      out_md,
-                                                      strides,
-                                                      dilates,
-                                                      padding,
-                                                      padding);
+      dnnl::convolution_backward_weights::desc desc(dnnl::algorithm::convolution_direct,
+                                                    data_md,
+                                                    weight_md,
+                                                    bias_md,
+                                                    out_md,
+                                                    strides,
+                                                    dilates,
+                                                    padding,
+                                                    padding);
       return GetConvBwdWeightsPd(desc);
     }
   }
 }
 
-MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam& param,
-                                     const bool is_train,
-                                     const NDArray& data,
-                                     const NDArray& weight,
-                                     const NDArray* bias,
-                                     const NDArray& output)
+DNNLConvForward::DNNLConvForward(const DNNLConvFullParam& param,
+                                 const bool is_train,
+                                 const NDArray& data,
+                                 const NDArray& weight,
+                                 const NDArray* bias,
+                                 const NDArray& output)
     : pd_(GetConvFwdImpl(param, is_train, data, weight, bias, output)) {
-  fwd_ = std::make_shared<mkldnn::convolution_forward>(GetPd());
+  fwd_ = std::make_shared<dnnl::convolution_forward>(GetPd());
 }
 
-MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
-                              const bool is_train,
-                              const NDArray& data,
-                              const NDArray& weight,
-                              const NDArray* bias,
-                              const NDArray& output) {
-  using conv_fwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash>;
+DNNLConvForward& GetConvFwd(const DNNLConvFullParam& param,
+                            const bool is_train,
+                            const NDArray& data,
+                            const NDArray& weight,
+                            const NDArray* bias,
+                            const NDArray& output) {
+  using conv_fwd_map = std::unordered_map<DNNLConvSignature, DNNLConvForward, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local conv_fwd_map fwds;
 #else
@@ -451,7 +438,7 @@ MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
 #endif
   // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
   // fused conv
-  MKLDNNConvSignature key(param.conv_param);
+  DNNLConvSignature key(param.conv_param);
   key.AddSign(is_train);
   // Here we can sign the conv op with NDArray because conv primitive will decide the right layout
   // for them, so we only need to get the shape and the data type of the arrays.
@@ -463,30 +450,30 @@ MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
 
   auto it = fwds.find(key);
   if (it == fwds.end()) {
-    auto fwd = MKLDNNConvForward(param, is_train, data, weight, bias, output);
+    auto fwd = DNNLConvForward(param, is_train, data, weight, bias, output);
     it       = AddToCache(&fwds, key, fwd);
   }
   return it->second;
 }
 
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
-                                         const OpContext& ctx,
-                                         MKLDNNConvForward* fwd,
-                                         const std::vector<NDArray>& in_data,
-                                         const std::vector<OpReqType>& req,
-                                         const std::vector<NDArray>& out_data) {
+void DNNLConvolutionForwardFullFeature(const DNNLConvFullParam& param,
+                                       const OpContext& ctx,
+                                       DNNLConvForward* fwd,
+                                       const std::vector<NDArray>& in_data,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<NDArray>& out_data) {
   TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
 
   auto& data   = in_data[conv::kData];
   auto& weight = in_data[conv::kWeight];
-  bool no_bias = param.conv_param.no_bias && !param.mkldnn_param.with_bn;
+  bool no_bias = param.conv_param.no_bias && !param.dnnl_param.with_bn;
 
-  auto data_mem = data.GetMKLDNNDataReorder(fwd->GetPd().src_desc());
-  const mkldnn::memory* weight_mem;
+  auto data_mem = data.GetDNNLDataReorder(fwd->GetPd().src_desc());
+  const dnnl::memory* weight_mem;
   if (ctx.is_train) {
-    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it to the default format
+    // TODO(zhengda) kvstore doesn't handle DNNL correctly. Let's reorder it to the default format
     // for now.
-    if (weight.IsMKLDNNData())
+    if (weight.IsDNNLData())
       // This asks the engine to change the layout of the weight array after it's used.
       weight.Reorder2DefaultAsync();
     weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
@@ -496,77 +483,77 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
     if (weight.IsDefaultData()) {
       // We also need to modify the layout on the original weight array. The data conversion happens
       // after the weight array is used.
-      weight.MKLDNNDataReorderAsync(fwd->GetPd().weights_desc());
+      weight.DNNLDataReorderAsync(fwd->GetPd().weights_desc());
       weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
     } else {
-      weight_mem = weight.GetMKLDNNDataReorder(fwd->GetPd().weights_desc());
+      weight_mem = weight.GetDNNLDataReorder(fwd->GetPd().weights_desc());
     }
   }
-  mkldnn_output_t out_mem;
-  if (param.mkldnn_param.with_sum) {
-    out_mem = mkldnn_output_t(OutDataOp::Noop,
-                              const_cast<mkldnn::memory*>(out_data[conv::kOut].GetMKLDNNData()));
+  dnnl_output_t out_mem;
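+  // With a fused sum post-op the convolution accumulates directly into the existing output
+  // memory (Noop); otherwise a destination buffer is created according to the request type.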
+  if (param.dnnl_param.with_sum) {
+    out_mem = dnnl_output_t(OutDataOp::Noop,
+                            const_cast<dnnl::memory*>(out_data[conv::kOut].GetDNNLData()));
   } else {
-    out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]);
+    out_mem = CreateDNNLMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]);
   }
 
-  mkldnn_args_map_t net_args;
+  dnnl_args_map_t net_args;
   if (!no_bias) {
-    const mkldnn::memory* bias_mem = in_data[conv::kBias].GetMKLDNNData();
-    net_args.insert({MKLDNN_ARG_BIAS, *bias_mem});
+    const dnnl::memory* bias_mem = in_data[conv::kBias].GetDNNLData();
+    net_args.insert({DNNL_ARG_BIAS, *bias_mem});
   }
 
-  net_args.insert({MKLDNN_ARG_SRC, *data_mem});
-  net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem});
-  net_args.insert({MKLDNN_ARG_DST, *out_mem.second});
-  MKLDNNStream::Get()->RegisterPrimArgs(fwd->GetFwd(), net_args);
+  net_args.insert({DNNL_ARG_SRC, *data_mem});
+  net_args.insert({DNNL_ARG_WEIGHTS, *weight_mem});
+  net_args.insert({DNNL_ARG_DST, *out_mem.second});
+  DNNLStream::Get()->RegisterPrimArgs(fwd->GetFwd(), net_args);
   CommitOutput(out_data[conv::kOut], out_mem);
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
 
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
-                              const OpContext& ctx,
-                              const std::vector<NDArray>& in_data,
-                              const std::vector<OpReqType>& req,
-                              const std::vector<NDArray>& out_data) {
-  MKLDNNConvFullParam param;
+void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& out_data) {
+  DNNLConvFullParam param;
   param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
+  param.dnnl_param.Init(std::unordered_map<std::string, std::string>());
   auto& fwd = GetConvFwd(param,
                          ctx.is_train,
                          in_data[conv::kData],
                          in_data[conv::kWeight],
                          param.conv_param.no_bias ? nullptr : &in_data[conv::kBias],
                          out_data[conv::kOut]);
-  MKLDNNConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data);
+  DNNLConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data);
 }
 
-MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam& param,
-                                       const NDArray& data,
-                                       const NDArray& weight,
-                                       const NDArray* bias,
-                                       const NDArray& output) {
+DNNLConvBackward::DNNLConvBackward(const DNNLConvFullParam& param,
+                                   const NDArray& data,
+                                   const NDArray& weight,
+                                   const NDArray* bias,
+                                   const NDArray& output) {
   const auto fwd_pd = GetConvFwdImpl(param, true, data, weight, bias, output);
   bwd_data_pd_      = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
   bwd_weight_pd_    = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
-  bwd_data_         = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
-  bwd_weight_       = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
+  bwd_data_         = std::make_shared<dnnl::convolution_backward_data>(GetDataPd());
+  bwd_weight_       = std::make_shared<dnnl::convolution_backward_weights>(GetWeightsPd());
 }
 
-static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param,
-                                             const NDArray& data,
-                                             const NDArray& weight,
-                                             const NDArray* bias,
-                                             const NDArray& output) {
-  using mkldnn_conv_bwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvBackward, OpHash>;
+static inline DNNLConvBackward& GetConvBwd(const DNNLConvFullParam& param,
+                                           const NDArray& data,
+                                           const NDArray& weight,
+                                           const NDArray* bias,
+                                           const NDArray& output) {
+  using dnnl_conv_bwd_map = std::unordered_map<DNNLConvSignature, DNNLConvBackward, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
-  static thread_local mkldnn_conv_bwd_map bwds;
+  static thread_local dnnl_conv_bwd_map bwds;
 #else
-  static MX_THREAD_LOCAL mkldnn_conv_bwd_map bwds;
+  static MX_THREAD_LOCAL dnnl_conv_bwd_map bwds;
 #endif
   // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
   // fused conv
-  MKLDNNConvSignature key(param.conv_param);
+  DNNLConvSignature key(param.conv_param);
   // Here we can sign the conv op with NDArray because the conv primitive will decide the right
   // layout for them, so we only need the shape and the data type of the arrays.
   key.AddSign(data);
@@ -577,22 +564,22 @@ static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param,
 
   auto it = bwds.find(key);
   if (it == bwds.end()) {
-    auto bwd = MKLDNNConvBackward(param, data, weight, bias, output);
+    auto bwd = DNNLConvBackward(param, data, weight, bias, output);
     it       = AddToCache(&bwds, key, bwd);
   }
   return it->second;
 }
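
    The lookup above is the standard pattern in this codebase for caching DNNL primitives: a
    thread-local map keyed by an op signature, filled lazily on first use. A minimal standalone
    sketch of the idea (CachedBackward and GetCachedBackward are illustrative names, not part of
    this patch):

    #include <string>
    #include <unordered_map>

    // Stand-in for an expensive-to-build object such as DNNLConvBackward.
    struct CachedBackward {
      explicit CachedBackward(const std::string& signature) : sig(signature) {}
      std::string sig;
    };

    // Builds the object only the first time a given signature is seen on this thread,
    // mirroring GetConvBwd above.
    inline CachedBackward& GetCachedBackward(const std::string& signature) {
      static thread_local std::unordered_map<std::string, CachedBackward> cache;
      auto it = cache.find(signature);
      if (it == cache.end()) {
        it = cache.emplace(signature, CachedBackward(signature)).first;
      }
      return it->second;
    }
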
 
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
-                               const OpContext& ctx,
-                               const std::vector<NDArray>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<NDArray>& outputs) {
+void DNNLConvolutionBackward(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<NDArray>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<NDArray>& outputs) {
   TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
   const std::vector<NDArray>& in_grad = outputs;
-  MKLDNNConvFullParam full_param;
+  DNNLConvFullParam full_param;
   full_param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  full_param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
+  full_param.dnnl_param.Init(std::unordered_map<std::string, std::string>());
 
   auto& data       = inputs[conv::kData + 1];
   auto& weight     = inputs[conv::kWeight + 1];
@@ -602,16 +589,16 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
   const ConvolutionParam& param = full_param.conv_param;
 
   CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
-  MKLDNNConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
-  auto out_grad_mem           = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
+  DNNLConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
+  auto out_grad_mem         = out_grad.GetDNNLDataReorder(convBwd.GetDataPd().diff_dst_desc());
   if (req[conv::kData]) {
-    auto weight_mem  = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
-    auto in_grad_mem = CreateMKLDNNMem(
-        in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]);
-    MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(),
-                                          {{MKLDNN_ARG_DIFF_DST, *out_grad_mem},
-                                           {MKLDNN_ARG_WEIGHTS, *weight_mem},
-                                           {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}});
+    auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
+    auto in_grad_mem =
+        CreateDNNLMem(in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]);
+    DNNLStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(),
+                                        {{DNNL_ARG_DIFF_DST, *out_grad_mem},
+                                         {DNNL_ARG_WEIGHTS, *weight_mem},
+                                         {DNNL_ARG_DIFF_SRC, *in_grad_mem.second}});
     CommitOutput(in_grad[conv::kData], in_grad_mem);
   }
 
@@ -619,28 +606,28 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
   auto req_bias   = req.size() > conv::kBias ? req.at(conv::kBias) : kNullOp;
   if (req_weight || req_bias) {
     if (convBwd.GetDataPd().diff_dst_desc() != convBwd.GetWeightsPd().diff_dst_desc())
-      out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetWeightsPd().diff_dst_desc());
-    auto data_mem       = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
-    auto in_grad_weight = CreateMKLDNNWeightGrad(
+      out_grad_mem = out_grad.GetDNNLDataReorder(convBwd.GetWeightsPd().diff_dst_desc());
+    auto data_mem       = data.GetDNNLDataReorder(convBwd.GetWeightsPd().src_desc());
+    auto in_grad_weight = CreateDNNLWeightGrad(
         in_grad[conv::kWeight], convBwd.GetWeightsPd().diff_weights_desc(), req[conv::kWeight]);
 
-    mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem},
-                                  {MKLDNN_ARG_SRC, *data_mem},
-                                  {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
-    mkldnn_output_t in_grad_bias;
+    dnnl_args_map_t net_args = {{DNNL_ARG_DIFF_DST, *out_grad_mem},
+                                {DNNL_ARG_SRC, *data_mem},
+                                {DNNL_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
+    dnnl_output_t in_grad_bias;
     if (!param.no_bias) {
-      in_grad_bias = CreateMKLDNNMem(
+      in_grad_bias = CreateDNNLMem(
           in_grad[conv::kBias], convBwd.GetWeightsPd().diff_bias_desc(), req[conv::kBias]);
-      net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second});
+      net_args.insert({DNNL_ARG_DIFF_BIAS, *in_grad_bias.second});
     }
-    MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args);
+    DNNLStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args);
     CommitOutput(in_grad[conv::kWeight], in_grad_weight);
     // CommitOutput Should run after RegisterPrimArgs for memory dependency
     if (!param.no_bias) {
       CommitOutput(in_grad[conv::kBias], in_grad_bias);
     }
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
 
 }  // namespace op
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/dnnl/dnnl_copy.cc
similarity index 69%
rename from src/operator/nn/mkldnn/mkldnn_copy.cc
rename to src/operator/nn/dnnl/dnnl_copy.cc
index 813016d..2b78103 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/dnnl/dnnl_copy.cc
@@ -18,40 +18,40 @@
  */
 
 /*!
- * \file mkldnn_copy.cc
+ * \file dnnl_copy.cc
  * \brief
  * \author
  */
 
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 #if MXNET_USE_ONEDNN == 1
 namespace mxnet {
 namespace op {
 
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs,
-                const OpContext& ctx,
-                const NDArray& in_data,
-                const OpReqType& req,
-                const NDArray& out_data) {
+void DNNLCopy(const nnvm::NodeAttrs& attrs,
+              const OpContext& ctx,
+              const NDArray& in_data,
+              const OpReqType& req,
+              const NDArray& out_data) {
   if (req == kNullOp || req == kWriteInplace)
     return;
   TmpMemMgr::Get()->Init(ctx.requested[0]);
-  auto in_mem = in_data.GetMKLDNNData();
+  auto in_mem = in_data.GetDNNLData();
   if (req == kAddTo) {
     TmpMemMgr::Get()->Init(ctx.requested[0]);
     // We should try to force the input memory to have the same format
     // as the output. If not, we'll have to reorder the memory.
-    auto out_mem = out_data.GetMKLDNNData();
-    in_mem       = in_data.GetMKLDNNData(out_mem->get_desc());
+    auto out_mem = out_data.GetDNNLData();
+    in_mem       = in_data.GetDNNLData(out_mem->get_desc());
     if (in_mem == nullptr)
-      in_mem = in_data.GetMKLDNNDataReorder(out_mem->get_desc());
-    MKLDNNSum(*out_mem, *in_mem, *out_mem);
+      in_mem = in_data.GetDNNLDataReorder(out_mem->get_desc());
+    DNNLSum(*out_mem, *in_mem, *out_mem);
   } else {
     const_cast<NDArray&>(out_data).CopyFrom(*in_mem);
   }
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
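
    For reference, GetDNNLData(desc) and GetDNNLDataReorder in the kAddTo branch above ultimately
    come down to a plain oneDNN reorder between two memory descriptors. A minimal standalone sketch
    of such a reorder (shape and layout tags chosen arbitrarily for illustration):

    #include <vector>
    #include "dnnl.hpp"

    int main() {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream strm(eng);

      const dnnl::memory::dims shape = {1, 8, 4, 4};
      std::vector<float> src_buf(1 * 8 * 4 * 4, 1.0f);
      std::vector<float> dst_buf(src_buf.size(), 0.0f);

      // Source in plain nchw, destination in a blocked layout.
      dnnl::memory::desc src_md(shape, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nchw);
      dnnl::memory::desc dst_md(shape, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nChw8c);
      dnnl::memory src(src_md, eng, src_buf.data());
      dnnl::memory dst(dst_md, eng, dst_buf.data());

      // Reorder converts between the two layouts, which is what the fallback path does internally.
      dnnl::reorder(src, dst).execute(strm, src, dst);
      strm.wait();
      return 0;
    }
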
 
 }  // namespace op
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h
similarity index 56%
rename from src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
rename to src/operator/nn/dnnl/dnnl_deconvolution-inl.h
index a66d3a8..3015379 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
+++ b/src/operator/nn/dnnl/dnnl_deconvolution-inl.h
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file mkldnn_deconvolution-inl.h
+ * \file dnnl_deconvolution-inl.h
  * Naming convention:
  *                 ________
  *  (src) data --->|Deconv|
@@ -31,10 +31,10 @@
  *                                 |______|<--- bias
  *
  * "out" in this (and .cc) file will always refer to the output of Deconv FWD and
- * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses.
+ * "out_grad" to its gradient. The corresponding DNNL names are in parentheses.
  */
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 #include <numeric>
@@ -42,25 +42,25 @@
 #include <vector>
 
 #include "../deconvolution-inl.h"
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-using deconv_fwd_t    = mkldnn::deconvolution_forward;
-using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc;
+using deconv_fwd_t    = dnnl::deconvolution_forward;
+using deconv_fwd_pd_t = dnnl::deconvolution_forward::primitive_desc;
 
-using deconv_bwd_data_t    = mkldnn::deconvolution_backward_data;
-using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc;
+using deconv_bwd_data_t    = dnnl::deconvolution_backward_data;
+using deconv_bwd_data_pd_t = dnnl::deconvolution_backward_data::primitive_desc;
 
-using deconv_bwd_weights_t    = mkldnn::deconvolution_backward_weights;
-using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc;
+using deconv_bwd_weights_t    = dnnl::deconvolution_backward_weights;
+using deconv_bwd_weights_pd_t = dnnl::deconvolution_backward_weights::primitive_desc;
 
 // Swaps the logical order of dimensions that in plain format would correspond to input and output
 // channels (for example: oihw => iohw, iohw => oihw, goihw => giohw).
-inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc,
-                                              const uint32_t num_group) {
+inline dnnl::memory::desc IOLogicalSwapDesc(const dnnl::memory::desc& desc,
+                                            const uint32_t num_group) {
   std::vector<int> order(desc.data.ndims);
   std::iota(std::begin(order), std::end(order), 0);
   const int offset = static_cast<int>(num_group > 1);
@@ -68,29 +68,29 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc,
   return desc.permute_axes(order);
 }
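
    To make the swap concrete: the order vector passed to permute_axes is an identity permutation
    with the two channel axes exchanged, shifted by one position when a group dimension is present.
    A standalone sketch of just that order construction (MakeSwappedOrder is an illustrative name,
    not part of this patch):

    #include <cstdint>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Identity axis order with the input/output channel axes swapped; with groups
    // the swap is shifted by one so the group axis stays in place.
    std::vector<int> MakeSwappedOrder(int ndims, uint32_t num_group) {
      std::vector<int> order(ndims);
      std::iota(order.begin(), order.end(), 0);
      const int offset = static_cast<int>(num_group > 1);
      std::swap(order[offset], order[offset + 1]);
      return order;  // ndims = 4, num_group = 1 -> {1, 0, 2, 3}, i.e. oihw -> iohw
    }
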
 
-// Applies IOLogicalSwapDesc to MKLDNN memory of arr
-inline void IOLogicalSwapMKLDNNMem(const NDArray& arr, const uint32_t num_group) {
-  mkldnn::memory::desc desc;
-  if (arr.IsMKLDNNData()) {
-    desc = arr.GetMKLDNNData()->get_desc();
+// Applies IOLogicalSwapDesc to DNNL memory of arr
+inline void IOLogicalSwapDNNLMem(const NDArray& arr, const uint32_t num_group) {
+  dnnl::memory::desc desc;
+  if (arr.IsDNNLData()) {
+    desc = arr.GetDNNLData()->get_desc();
   } else {
-    // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use
+    // GetDNNLData won't take groups into account when creating dnnl::memory; we need to use the
     // descriptor from GetWeightDesc but with the default format
     const auto& temp = GetWeightDesc(arr, num_group);
-    desc             = mkldnn::memory::desc(
+    desc             = dnnl::memory::desc(
         temp.dims(),
         temp.data_type(),
-        static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(temp.data.ndims)));
+        static_cast<dnnl::memory::format_tag>(GetDefaultFormat(temp.data.ndims)));
   }
-  const_cast<NDArray&>(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group));
+  const_cast<NDArray&>(arr).UpdateDNNLMemDesc(IOLogicalSwapDesc(desc, num_group));
 }
 
 // Version of GetWeightsDesc for deconvolution (with swap)
-inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) {
+inline dnnl::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) {
   return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group);
 }
 
-class MKLDNNDeconvFwd {
+class DNNLDeconvFwd {
  public:
   struct Tensors {
     Tensors(const NDArray& data,
@@ -107,65 +107,65 @@ class MKLDNNDeconvFwd {
     const NDArray& out;
   };
 
-  static MKLDNNDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors);
+  static DNNLDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors);
   static std::shared_ptr<deconv_fwd_pd_t> CreatePrimitiveDesc(const DeconvolutionParam& param,
                                                               const Tensors& tensors);
 
-  MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors);
+  DNNLDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors);
   void ControlWeightsFormat(const uint32_t num_group,
                             const bool is_train,
                             const NDArray& weights) const;
   void Execute(const uint32_t num_group, const OpReqType req, const Tensors& tensors) const;
 
  private:
-  const mkldnn::memory* DataMem(const NDArray& data) const;
-  const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
-  const mkldnn::memory* BiasMem(const NDArray& bias) const;
+  const dnnl::memory* DataMem(const NDArray& data) const;
+  const dnnl::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
+  const dnnl::memory* BiasMem(const NDArray& bias) const;
 
-  mkldnn_output_t OutMem(const OpReqType req, const NDArray& out) const;
+  dnnl_output_t OutMem(const OpReqType req, const NDArray& out) const;
 
  private:
   std::shared_ptr<deconv_fwd_t> fwd;
   std::shared_ptr<deconv_fwd_pd_t> fwd_pd;
 };
 
-MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias,
-                                  const std::vector<NDArray>& inputs,
-                                  const std::vector<NDArray>& outputs)
+DNNLDeconvFwd::Tensors::Tensors(const bool no_bias,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<NDArray>& outputs)
     : data(inputs[deconv::kData]),
       weights(inputs[deconv::kWeight]),
       bias(no_bias ? nullptr : &inputs[deconv::kBias]),
       out(outputs[deconv::kOut]) {}
 
-MKLDNNDeconvFwd::Tensors::Tensors(const NDArray& data,
-                                  const NDArray& weights,
-                                  const NDArray* const bias,
-                                  const NDArray& out)
+DNNLDeconvFwd::Tensors::Tensors(const NDArray& data,
+                                const NDArray& weights,
+                                const NDArray* const bias,
+                                const NDArray& out)
     : data(data), weights(weights), bias(bias), out(out) {}
 
-MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors)
+DNNLDeconvFwd::DNNLDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors)
     : fwd_pd(CreatePrimitiveDesc(param, tensors)) {
   fwd = std::make_shared<deconv_fwd_t>(*fwd_pd);
 }
 
-inline const mkldnn::memory* MKLDNNDeconvFwd::DataMem(const NDArray& data) const {
-  return data.GetMKLDNNDataReorder(fwd_pd->src_desc());
+inline const dnnl::memory* DNNLDeconvFwd::DataMem(const NDArray& data) const {
+  return data.GetDNNLDataReorder(fwd_pd->src_desc());
 }
 
-inline const mkldnn::memory* MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group,
-                                                         const NDArray& weights) const {
+inline const dnnl::memory* DNNLDeconvFwd::WeightsMem(const uint32_t num_group,
+                                                     const NDArray& weights) const {
   return GetWeights(weights, fwd_pd->weights_desc(), num_group);
 }
 
-inline const mkldnn::memory* MKLDNNDeconvFwd::BiasMem(const NDArray& bias) const {
-  return bias.GetMKLDNNData();
+inline const dnnl::memory* DNNLDeconvFwd::BiasMem(const NDArray& bias) const {
+  return bias.GetDNNLData();
 }
 
-inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const {
-  return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req);
+inline dnnl_output_t DNNLDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const {
+  return CreateDNNLMem(out, fwd_pd->dst_desc(), req);
 }
 
-class MKLDNNDeconvBwd {
+class DNNLDeconvBwd {
  public:
   struct ReadTensors {
     ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs);
@@ -181,8 +181,7 @@ class MKLDNNDeconvBwd {
     const NDArray* const bias_grad;
   };
 
-  static MKLDNNDeconvBwd& GetCached(const DeconvolutionParam& param,
-                                    const ReadTensors& read_tensors);
+  static DNNLDeconvBwd& GetCached(const DeconvolutionParam& param, const ReadTensors& read_tensors);
 
   static std::shared_ptr<deconv_bwd_data_pd_t> CreateDataPrimitiveDesc(
       const DeconvolutionParam& param,
@@ -194,7 +193,7 @@ class MKLDNNDeconvBwd {
       const ReadTensors& read_tensors,
       const deconv_fwd_pd_t& fwd_pd);
 
-  MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors);
+  DNNLDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors);
 
   void Execute(const uint32_t num_group,
                const std::vector<OpReqType>& req,
@@ -209,31 +208,31 @@ class MKLDNNDeconvBwd {
 
   // returns the output gradient memory used to calculate the data (input) gradient,
   // which might be reused when calculating the gradient of weights
-  const mkldnn::memory* ScheduleBwdData(const uint32_t num_group,
-                                        const OpReqType req,
-                                        const ReadTensors& read_tensors,
-                                        const WriteTensors& write_tensors) const;
+  const dnnl::memory* ScheduleBwdData(const uint32_t num_group,
+                                      const OpReqType req,
+                                      const ReadTensors& read_tensors,
+                                      const WriteTensors& write_tensors) const;
 
   void ScheduleBwdWeights(const uint32_t num_group,
                           const std::vector<OpReqType>& req,
                           const ReadTensors& read_tensors,
                           const WriteTensors& write_tensors,
-                          const mkldnn::memory* const out_grad_mem) const;
+                          const dnnl::memory* const out_grad_mem) const;
 
-  const mkldnn::memory* DataMem(const NDArray& data) const;
-  const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
+  const dnnl::memory* DataMem(const NDArray& data) const;
+  const dnnl::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
 
   // for calculating the gradient of data (input)
-  const mkldnn::memory* OutGradMem(const NDArray& out_grad) const;
+  const dnnl::memory* OutGradMem(const NDArray& out_grad) const;
   // for calculating the gradient of weights
-  const mkldnn::memory* OutGradMem(const NDArray& out_grad,
-                                   const mkldnn::memory* const out_grad_mem) const;
+  const dnnl::memory* OutGradMem(const NDArray& out_grad,
+                                 const dnnl::memory* const out_grad_mem) const;
 
-  mkldnn_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const;
-  mkldnn_output_t WeightsGradMem(const uint32_t num_group,
-                                 const OpReqType req,
-                                 const NDArray& weights_grad) const;
-  mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const;
+  dnnl_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const;
+  dnnl_output_t WeightsGradMem(const uint32_t num_group,
+                               const OpReqType req,
+                               const NDArray& weights_grad) const;
+  dnnl_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const;
 
   std::shared_ptr<deconv_bwd_data_pd_t> bwd_data_pd;
   std::shared_ptr<deconv_bwd_weights_pd_t> bwd_weights_pd;
@@ -241,21 +240,21 @@ class MKLDNNDeconvBwd {
   std::shared_ptr<deconv_bwd_weights_t> bwd_weights;
 };
 
-MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs)
+DNNLDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs)
     : data(inputs[deconv::kData + 1]),
       weights(inputs[deconv::kWeight + 1]),
       bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]),
       out_grad(inputs[deconv::kOut]) {}
 
-MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs)
+DNNLDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs)
     : data_grad(outputs[deconv::kData]),
       weights_grad(outputs[deconv::kWeight]),
       bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {}
 
-MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) {
-  const auto& fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc(
+DNNLDeconvBwd::DNNLDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) {
+  const auto& fwd_pd = DNNLDeconvFwd::CreatePrimitiveDesc(
       param,
-      MKLDNNDeconvFwd::Tensors(
+      DNNLDeconvFwd::Tensors(
           read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad));
   bwd_data_pd    = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd);
   bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd);
@@ -263,62 +262,61 @@ MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTens
   bwd_weights    = std::make_shared<deconv_bwd_weights_t>(*bwd_weights_pd);
 }
 
-inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
-                                                  const std::vector<OpReqType>& req,
-                                                  const NDArray& weights,
-                                                  const NDArray& weights_grad) const {
+inline void DNNLDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
+                                                const std::vector<OpReqType>& req,
+                                                const NDArray& weights,
+                                                const NDArray& weights_grad) const {
   if (req[deconv::kData]) {
-    IOLogicalSwapMKLDNNMem(weights, num_group);
+    IOLogicalSwapDNNLMem(weights, num_group);
   }
   if (req[deconv::kWeight] || (req.size() > deconv::kBias && req[deconv::kBias])) {
-    IOLogicalSwapMKLDNNMem(weights_grad, num_group);
+    IOLogicalSwapDNNLMem(weights_grad, num_group);
   }
 }
 
-inline const mkldnn::memory* MKLDNNDeconvBwd::DataMem(const NDArray& data) const {
-  return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc());
+inline const dnnl::memory* DNNLDeconvBwd::DataMem(const NDArray& data) const {
+  return data.GetDNNLDataReorder(bwd_weights_pd->src_desc());
 }
 
-inline const mkldnn::memory* MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group,
-                                                         const NDArray& weights) const {
+inline const dnnl::memory* DNNLDeconvBwd::WeightsMem(const uint32_t num_group,
+                                                     const NDArray& weights) const {
   return GetWeights(weights, bwd_data_pd->weights_desc(), num_group);
 }
 
-inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(const NDArray& out_grad) const {
-  return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc());
+inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad) const {
+  return out_grad.GetDNNLDataReorder(bwd_data_pd->diff_dst_desc());
 }
 
-inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(
-    const NDArray& out_grad,
-    const mkldnn::memory* const out_grad_mem) const {
+inline const dnnl::memory* DNNLDeconvBwd::OutGradMem(const NDArray& out_grad,
+                                                     const dnnl::memory* const out_grad_mem) const {
   return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc())
              ? out_grad_mem
-             : out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc());
+             : out_grad.GetDNNLDataReorder(bwd_weights_pd->diff_dst_desc());
 }
 
-inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req,
-                                                    const NDArray& data_grad) const {
-  return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req);
+inline dnnl_output_t DNNLDeconvBwd::DataGradMem(const OpReqType req,
+                                                const NDArray& data_grad) const {
+  return CreateDNNLMem(data_grad, bwd_data_pd->diff_src_desc(), req);
 }
 
-inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group,
-                                                       const OpReqType req,
-                                                       const NDArray& weights_grad) const {
-  // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because
-  // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad
+inline dnnl_output_t DNNLDeconvBwd::WeightsGradMem(const uint32_t num_group,
+                                                   const OpReqType req,
+                                                   const NDArray& weights_grad) const {
+  // CreateDNNLWeightGrad always creates a new tensor as IsDefaultFormat always fails (because
+  // of the logical swap - explained in DNNLDeconvFwd::Execute). We try to reuse weights_grad
   // memory (which, when not swapped, is always in default format), so here we check if after a
   // swap, weights_md will have a default format
   const auto& weights_md = bwd_weights_pd->diff_weights_desc();
   if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) {
-    return {OutDataOp::Noop, const_cast<NDArray&>(weights_grad).CreateMKLDNNData(weights_md)};
+    return {OutDataOp::Noop, const_cast<NDArray&>(weights_grad).CreateDNNLData(weights_md)};
   }
-  return CreateMKLDNNWeightGrad(weights_grad, weights_md, req);
+  return CreateDNNLWeightGrad(weights_grad, weights_md, req);
 }
 
-inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req,
-                                                    const NDArray* const bias) const {
-  return bias ? CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req)
-              : mkldnn_output_t(OutDataOp::Noop, nullptr);
+inline dnnl_output_t DNNLDeconvBwd::BiasGradMem(const OpReqType req,
+                                                const NDArray* const bias) const {
+  return bias ? CreateDNNLMem(*bias, bwd_weights_pd->diff_bias_desc(), req)
+              : dnnl_output_t(OutDataOp::Noop, nullptr);
 }
 
 // Utility class for creating operation descriptors of deconvolution primitives
@@ -349,21 +347,21 @@ class DeconvDescCreator {
   deconv_bwd_weights_t::desc CreateBwdWeightsDesc() const;
 
  private:
-  mkldnn::memory::desc data_md;
-  mkldnn::memory::desc weights_md;
-  mkldnn::memory::desc bias_md;
-  mkldnn::memory::desc out_md;
-
-  mkldnn::memory::dims strides;
-  mkldnn::memory::dims padding;
-  mkldnn::memory::dims dilates;
+  dnnl::memory::desc data_md;
+  dnnl::memory::desc weights_md;
+  dnnl::memory::desc bias_md;
+  dnnl::memory::desc out_md;
+
+  dnnl::memory::dims strides;
+  dnnl::memory::dims padding;
+  dnnl::memory::dims dilates;
 };
 
 inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size,
                                                 const size_t weights_size,
                                                 const size_t out_size) const {
-  // MKLDNN introduced padded formats since 0.15 which require more memory
-  // compared to the actual size of the tensor. Currently, MKLDNN operators
+  // DNNL has used padded formats since version 0.15, which require more memory
+  // compared to the actual size of the tensor. Currently, DNNL operators
   // still reuse memory from memory planning, so here we need to accept only a
   // kernel that has the expected memory size requirements (which is suboptimal)
   return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) &&
@@ -371,8 +369,8 @@ inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size,
 }
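
    The size check above exists because a blocked (padded) descriptor can report a larger
    allocation than the dense tensor that memory planning reserved. A standalone sketch showing how
    the mismatch appears (shape and blocking tag are illustrative):

    #include <iostream>
    #include "dnnl.hpp"

    int main() {
      // 10 channels do not divide evenly into 16-channel blocks, so the blocked
      // descriptor is padded and reports a larger byte size than the dense one.
      dnnl::memory::dims shape = {1, 10, 5, 5};
      dnnl::memory::desc dense(shape, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::nchw);
      dnnl::memory::desc blocked(shape, dnnl::memory::data_type::f32,
                                 dnnl::memory::format_tag::nChw16c);

      std::cout << "dense bytes:   " << dense.get_size() << "\n"    // 1*10*5*5*4 = 1000
                << "blocked bytes: " << blocked.get_size() << "\n";  // padded to 16 channels = 1600
      return 0;
    }
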
 
 inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const {
-  return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training,
-                            mkldnn::algorithm::deconvolution_direct,
+  return deconv_fwd_t::desc(dnnl::prop_kind::forward_training,
+                            dnnl::algorithm::deconvolution_direct,
                             data_md,
                             weights_md,
                             bias_md,
@@ -384,7 +382,7 @@ inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const {
 }
 
 inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const {
-  return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct,
+  return deconv_bwd_data_t::desc(dnnl::algorithm::deconvolution_direct,
                                  data_md,
                                  weights_md,
                                  out_md,
@@ -395,7 +393,7 @@ inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const {
 }
 
 inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const {
-  return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct,
+  return deconv_bwd_weights_t::desc(dnnl::algorithm::deconvolution_direct,
                                     data_md,
                                     weights_md,
                                     bias_md,
@@ -409,4 +407,4 @@ inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() cons
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H__
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_DECONVOLUTION_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/dnnl/dnnl_deconvolution.cc
similarity index 69%
rename from src/operator/nn/mkldnn/mkldnn_deconvolution.cc
rename to src/operator/nn/dnnl/dnnl_deconvolution.cc
index 7621a51..f4766a1 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/dnnl/dnnl_deconvolution.cc
@@ -18,40 +18,39 @@
  */
 
 /*!
- * \file mkldnn_deconvolution.cc
+ * \file dnnl_deconvolution.cc
  */
 
 #if MXNET_USE_ONEDNN == 1
 
 #include "../deconvolution-inl.h"
-#include "./mkldnn_deconvolution-inl.h"
+#include "./dnnl_deconvolution-inl.h"
 
 namespace mxnet {
 namespace op {
 
-bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input) {
+bool SupportDNNLDeconv(const DeconvolutionParam& params, const NDArray& input) {
   return params.kernel.ndim() >= 1 && params.kernel.ndim() <= 3 &&
          input.shape().ndim() == (params.kernel.ndim() + 2) &&
          (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16);
 }
 
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs,
-                                const OpContext& ctx,
-                                const std::vector<NDArray>& inputs,
-                                const std::vector<OpReqType>& req,
-                                const std::vector<NDArray>& outputs) {
+void DNNLDeconvolutionForward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& outputs) {
   TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
   const auto& param  = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs);
-  const auto& fwd    = MKLDNNDeconvFwd::GetCached(param, tensors);
+  const auto tensors = DNNLDeconvFwd::Tensors(param.no_bias, inputs, outputs);
+  const auto& fwd    = DNNLDeconvFwd::GetCached(param, tensors);
 
   fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights);
   fwd.Execute(param.num_group, req[deconv::kOut], tensors);
 }
 
-MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param,
-                                            const Tensors& tensors) {
-  using deconv_fwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvFwd, OpHash>;
+DNNLDeconvFwd& DNNLDeconvFwd::GetCached(const DeconvolutionParam& param, const Tensors& tensors) {
+  using deconv_fwd_map = std::unordered_map<DeconvSignature, DNNLDeconvFwd, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local deconv_fwd_map fwds;
 #else
@@ -67,15 +66,14 @@ MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param,
 
   auto it = fwds.find(key);
   if (it == fwds.end()) {
-    const MKLDNNDeconvFwd fwd(param, tensors);
+    const DNNLDeconvFwd fwd(param, tensors);
     it = AddToCache(&fwds, key, fwd);
   }
   return it->second;
 }
 
-std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
-    const DeconvolutionParam& param,
-    const Tensors& tensors) {
+std::shared_ptr<deconv_fwd_pd_t> DNNLDeconvFwd::CreatePrimitiveDesc(const DeconvolutionParam& param,
+                                                                    const Tensors& tensors) {
   DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out);
   const auto& engine          = CpuEngine::Get()->get_engine();
   const auto pd               = std::make_shared<deconv_fwd_pd_t>(ddc.CreateFwdDesc(), engine);
@@ -93,13 +91,13 @@ std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
   return pd;
 }
 
-void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group,
-                                           const bool is_train,
-                                           const NDArray& weights) const {
+void DNNLDeconvFwd::ControlWeightsFormat(const uint32_t num_group,
+                                         const bool is_train,
+                                         const NDArray& weights) const {
   if (is_train) {
-    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+    // TODO(zhengda) kvstore doesn't handle DNNL correctly. Let's reorder it
     // to the default format for now.
-    if (weights.IsMKLDNNData()) {
+    if (weights.IsDNNLData()) {
       // This asks the engine to change the layout of the weights array after it's used.
       weights.Reorder2DefaultAsync();
     }
@@ -109,17 +107,17 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group,
     if (weights.IsDefaultData()) {
       // We also need to modify the layout on the original weights array.
       // The data conversion happens after the weights array is used.
-      weights.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group));
+      weights.DNNLDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group));
     } else {
-      CHECK(weights.GetMKLDNNData()->get_desc() ==
+      CHECK(weights.GetDNNLData()->get_desc() ==
             IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group));
     }
   }
 }
 
-void MKLDNNDeconvFwd::Execute(const uint32_t num_group,
-                              const OpReqType req,
-                              const Tensors& tensors) const {
+void DNNLDeconvFwd::Execute(const uint32_t num_group,
+                            const OpReqType req,
+                            const Tensors& tensors) const {
   // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives.
   // For that, we would pass input tensor in place of output and output tensor in place of input
   // (for appropriate convolution primitives: deconvolution forward = convolution backward data,
@@ -129,56 +127,56 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group,
   // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels,
   // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such a tensor.
   //
-  // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of
+  // DNNL deconvolution primitive also (as convolution) expects weights tensor with the shape of
   // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and
   // output tensors, so:
   // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels,
   // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels).
-  // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the
+  // However, underneath deconvolution DNNL also uses convolution, so even though it expects the
   // weights tensor with the logical order of oihw, it wants its physical representation to
   // match the order of iohw, which is the same as current weights tensor.
   //
   // So here we swap logical order of input and output dimensions for weights tensor just for
-  // MKLDNN operations.
-  IOLogicalSwapMKLDNNMem(tensors.weights, num_group);
+  // DNNL operations.
+  IOLogicalSwapDNNLMem(tensors.weights, num_group);
   {
-    mkldnn_args_map_t net_args;
+    dnnl_args_map_t net_args;
     const auto& out_mem = OutMem(req, tensors.out);
 
-    net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)});
-    net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)});
-    net_args.insert({MKLDNN_ARG_DST, *out_mem.second});
+    net_args.insert({DNNL_ARG_SRC, *DataMem(tensors.data)});
+    net_args.insert({DNNL_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)});
+    net_args.insert({DNNL_ARG_DST, *out_mem.second});
     if (tensors.bias) {
-      net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)});
+      net_args.insert({DNNL_ARG_BIAS, *BiasMem(*tensors.bias)});
     }
 
     // CommitOutput should run after RegisterPrimArgs for memory dependency
-    MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args);
+    DNNLStream::Get()->RegisterPrimArgs(*fwd, net_args);
     CommitOutput(tensors.out, out_mem);
-    MKLDNNStream::Get()->Submit();
+    DNNLStream::Get()->Submit();
   }
-  IOLogicalSwapMKLDNNMem(tensors.weights, num_group);  // swap back from oihw to iohw
+  IOLogicalSwapDNNLMem(tensors.weights, num_group);  // swap back from oihw to iohw
 }
 
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
-                                 const OpContext& ctx,
-                                 const std::vector<NDArray>& inputs,
-                                 const std::vector<OpReqType>& req,
-                                 const std::vector<NDArray>& outputs) {
+void DNNLDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs) {
   CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace";
 
   TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
   const auto& param        = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  const auto read_tensors  = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs);
-  const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs);
-  MKLDNNDeconvBwd& bwd     = MKLDNNDeconvBwd::GetCached(param, read_tensors);
+  const auto read_tensors  = DNNLDeconvBwd::ReadTensors(param.no_bias, inputs);
+  const auto write_tensors = DNNLDeconvBwd::WriteTensors(param.no_bias, outputs);
+  DNNLDeconvBwd& bwd       = DNNLDeconvBwd::GetCached(param, read_tensors);
 
   bwd.Execute(param.num_group, req, read_tensors, write_tensors);
 }
 
-MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param,
-                                            const ReadTensors& read_tensors) {
-  using deconv_bwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvBwd, OpHash>;
+DNNLDeconvBwd& DNNLDeconvBwd::GetCached(const DeconvolutionParam& param,
+                                        const ReadTensors& read_tensors) {
+  using deconv_bwd_map = std::unordered_map<DeconvSignature, DNNLDeconvBwd, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local deconv_bwd_map bwds;
 #else
@@ -194,13 +192,13 @@ MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param,
 
   auto it = bwds.find(key);
   if (it == bwds.end()) {
-    const MKLDNNDeconvBwd bwd(param, read_tensors);
+    const DNNLDeconvBwd bwd(param, read_tensors);
     it = AddToCache(&bwds, key, bwd);
   }
   return it->second;
 }
 
-std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
+std::shared_ptr<deconv_bwd_data_pd_t> DNNLDeconvBwd::CreateDataPrimitiveDesc(
     const DeconvolutionParam& param,
     const ReadTensors& read_tensors,
     const deconv_fwd_pd_t& fwd_pd) {
@@ -222,7 +220,7 @@ std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
   return pd;
 }
 
-std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc(
+std::shared_ptr<deconv_bwd_weights_pd_t> DNNLDeconvBwd::CreateWeightsPrimitiveDesc(
     const DeconvolutionParam& param,
     const ReadTensors& read_tensors,
     const deconv_fwd_pd_t& fwd_pd) {
@@ -245,64 +243,64 @@ std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitive
   return pd;
 }
 
-void MKLDNNDeconvBwd::Execute(const uint32_t num_group,
-                              const std::vector<OpReqType>& req,
-                              const ReadTensors& read_tensors,
-                              const WriteTensors& write_tensors) const {
-  // swaps are explained in MKLDNNDeconvFwd::Execute
+void DNNLDeconvBwd::Execute(const uint32_t num_group,
+                            const std::vector<OpReqType>& req,
+                            const ReadTensors& read_tensors,
+                            const WriteTensors& write_tensors) const {
+  // swaps are explained in DNNLDeconvFwd::Execute
   IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad);
   {
     auto* const out_grad_mem =
         ScheduleBwdData(num_group, req[deconv::kData], read_tensors, write_tensors);
     ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem);
-    MKLDNNStream::Get()->Submit();
+    DNNLStream::Get()->Submit();
   }
   IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad);
 }
 
-const mkldnn::memory* MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group,
-                                                       const OpReqType req,
-                                                       const ReadTensors& read_tensors,
-                                                       const WriteTensors& write_tensors) const {
+const dnnl::memory* DNNLDeconvBwd::ScheduleBwdData(const uint32_t num_group,
+                                                   const OpReqType req,
+                                                   const ReadTensors& read_tensors,
+                                                   const WriteTensors& write_tensors) const {
   if (req) {
-    mkldnn_args_map_t net_args;
+    dnnl_args_map_t net_args;
     auto* const out_grad_mem  = OutGradMem(read_tensors.out_grad);
     const auto& data_grad_mem = DataGradMem(req, write_tensors.data_grad);
 
-    net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem});
-    net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)});
-    net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second});
+    net_args.insert({DNNL_ARG_DIFF_DST, *out_grad_mem});
+    net_args.insert({DNNL_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)});
+    net_args.insert({DNNL_ARG_DIFF_SRC, *data_grad_mem.second});
 
     // CommitOutput should run after RegisterPrimArgs for memory dependency
-    MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args);
+    DNNLStream::Get()->RegisterPrimArgs(*bwd_data, net_args);
     CommitOutput(write_tensors.data_grad, data_grad_mem);
     return out_grad_mem;
   }
   return nullptr;
 }
 
-void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group,
-                                         const std::vector<OpReqType>& req,
-                                         const ReadTensors& read_tensors,
-                                         const WriteTensors& write_tensors,
-                                         const mkldnn::memory* const out_grad_mem) const {
+void DNNLDeconvBwd::ScheduleBwdWeights(const uint32_t num_group,
+                                       const std::vector<OpReqType>& req,
+                                       const ReadTensors& read_tensors,
+                                       const WriteTensors& write_tensors,
+                                       const dnnl::memory* const out_grad_mem) const {
   OpReqType weight_req = req[deconv::kWeight];
   OpReqType bias_req   = req.size() > deconv::kBias ? req[deconv::kBias] : OpReqType::kNullOp;
   if (weight_req || bias_req) {
-    mkldnn_args_map_t net_args;
+    dnnl_args_map_t net_args;
     const auto& weights_grad_mem =
         WeightsGradMem(num_group, weight_req, write_tensors.weights_grad);
     const auto& bias_grad_mem = BiasGradMem(bias_req, write_tensors.bias_grad);
 
-    net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)});
-    net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)});
-    net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weights_grad_mem.second});
+    net_args.insert({DNNL_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)});
+    net_args.insert({DNNL_ARG_SRC, *DataMem(read_tensors.data)});
+    net_args.insert({DNNL_ARG_DIFF_WEIGHTS, *weights_grad_mem.second});
     if (bias_grad_mem.second) {
-      net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second});
+      net_args.insert({DNNL_ARG_DIFF_BIAS, *bias_grad_mem.second});
     }
 
     // CommitOutput should run after RegisterPrimArgs for memory dependency
-    MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weights, net_args);
+    DNNLStream::Get()->RegisterPrimArgs(*bwd_weights, net_args);
     CommitOutput(write_tensors.weights_grad, weights_grad_mem);
     if (bias_grad_mem.second) {
       CommitOutput(*write_tensors.bias_grad, bias_grad_mem);
@@ -317,7 +315,7 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam& param,
                                      const NDArray& out)
     : data_md(GetMemDesc(data)),
       weights_md(GetDeconvWeightsDesc(weights, param.num_group)),
-      bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()),
+      bias_md(bias ? GetMemDesc(*bias) : dnnl::memory::desc()),
       out_md(GetMemDesc(out)),
       strides(param.stride.ndim()),
       padding(param.pad.ndim()),
diff --git a/src/operator/nn/dnnl/dnnl_fully_connected-inl.h b/src/operator/nn/dnnl/dnnl_fully_connected-inl.h
new file mode 100644
index 0000000..980b931
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_fully_connected-inl.h
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_fully_connected-inl.h
+ * \brief Common functions used by DNNL (Quantized) FullyConnected operator
+ * \author Ciyong Chen
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+
+#include <string>
+#include <vector>
+
+#include "../fully_connected-inl.h"
+#include "./dnnl_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+struct DNNLFCParam : public dmlc::Parameter<DNNLFCParam> {
+  bool quantized;
+  bool enable_float_output;
+  bool with_eltwise;
+  dmlc::optional<float> min_calib_range;  // min float value calculated from calibration dataset
+  dmlc::optional<float> max_calib_range;  // max float value calculated from calibration dataset
+  dmlc::optional<bool> channel_wise_quantize;
+
+  DMLC_DECLARE_PARAMETER(DNNLFCParam) {
+    DMLC_DECLARE_FIELD(quantized).set_default(false).describe(
+        "Whether it's a quantized FullyConnected operator");
+    DMLC_DECLARE_FIELD(enable_float_output)
+        .set_default(false)
+        .describe("Whether to enable float32 output");
+    DMLC_DECLARE_FIELD(with_eltwise)
+        .set_default(false)
+        .describe("Whether there's a post with_eltwise after FullyConnected operator");
+    DMLC_DECLARE_FIELD(min_calib_range)
+        .set_default(dmlc::optional<float>())
+        .describe(
+            "The minimum scalar value in the form of float32 obtained "
+            "through calibration. If present, it will be used by the "
+            "quantized fullyconnected op to calculate the primitive scale");
+    DMLC_DECLARE_FIELD(max_calib_range)
+        .set_default(dmlc::optional<float>())
+        .describe(
+            "The maximum scalar value in the form of float32 obtained "
+            "through calibration. If present, it will be used by the "
+            "quantized fullyconnected op to calculate the primitive scale");
+    DMLC_DECLARE_FIELD(channel_wise_quantize)
+        .set_default(dmlc::optional<bool>())
+        .describe("Whether support channel-wise-quantize for weight.");
+  }
+};
+
+struct DNNLFCFullParam {
+  FullyConnectedParam default_param;
+  DNNLFCParam dnnl_param;
+  DNNLPostEltwiseParam eltwise_param;
+  std::vector<float> output_scales = {0.0f};
+};
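
    min_calib_range and max_calib_range feed the output_scales member above. As a rough, hedged
    sketch of how a calibrated float range is commonly mapped to an int8 scale (the exact formula
    used by the quantized FullyConnected op lives in the quantization code, not in this header, and
    CalibRangeToScale is an illustrative name):

    #include <algorithm>
    #include <cmath>

    // Illustrative only: map a calibrated float range onto the int8 range. The real
    // op additionally folds in the input and weight scales before calling
    // set_output_scales on the primitive attributes.
    inline float CalibRangeToScale(float min_calib_range, float max_calib_range) {
      const float quantized_range = 127.0f;  // int8
      const float real_range = std::max(std::abs(min_calib_range), std::abs(max_calib_range));
      return quantized_range / real_range;
    }
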
+
+dnnl::inner_product_forward::primitive_desc GetFCFwdImpl(const DNNLFCFullParam& full_param,
+                                                         const bool is_train,
+                                                         const NDArray& data,
+                                                         const NDArray& weight,
+                                                         const NDArray* bias,
+                                                         const dnnl::memory::desc& out_md);
+
+class DNNLFullyConnectedForward {
+ public:
+  dnnl::inner_product_forward::primitive_desc fwd_pd;
+
+  DNNLFullyConnectedForward(const DNNLFCFullParam& full_param,
+                            const bool is_train,
+                            const NDArray& data,
+                            const NDArray& weight,
+                            const NDArray* bias,
+                            const dnnl::memory::desc& out_md)
+      : fwd_pd(GetFCFwdImpl(full_param, is_train, data, weight, bias, out_md)) {
+    fwd_ = std::make_shared<dnnl::inner_product_forward>(fwd_pd);
+  }
+
+  const dnnl::inner_product_forward& GetFwd() const {
+    return *fwd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::inner_product_forward> fwd_;
+};
+
+typedef ParamOpSign<FullyConnectedParam> DNNLFullyconSignature;
+
+DNNLFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param,
+                                    const bool is_train,
+                                    const NDArray& data,
+                                    const NDArray& weight,
+                                    const NDArray* bias,
+                                    const dnnl::memory::desc& out_md);
+
+void DNNLFCFlattenData(const FullyConnectedParam& param,
+                       const NDArray& out_data,
+                       NDArray* in_data,
+                       dnnl::memory::desc* out_md);
+
+void DNNLFCForward(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<NDArray>& in_data,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<NDArray>& out_data);
+
+void DNNLFCForwardFullFeature(const DNNLFCFullParam& param,
+                              const OpContext& ctx,
+                              DNNLFullyConnectedForward* fwd,
+                              const std::vector<NDArray>& in_data,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& out_data);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_ONEDNN == 1
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_fully_connected.cc b/src/operator/nn/dnnl/dnnl_fully_connected.cc
new file mode 100644
index 0000000..5bb3c9d
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_fully_connected.cc
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_fully_connected.cc
+ * \brief DNNL FullyConnected operator
+ * \author Da Zheng, Ciyong Chen
+ */
+
+#if MXNET_USE_ONEDNN == 1
+#include "dnnl_fully_connected-inl.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(DNNLFCParam);
+
+dnnl::inner_product_forward::primitive_desc GetFCFwdImpl(const DNNLFCFullParam& full_param,
+                                                         const bool is_train,
+                                                         const NDArray& data,
+                                                         const NDArray& weight,
+                                                         const NDArray* bias,
+                                                         const dnnl::memory::desc& out_md) {
+  auto engine    = CpuEngine::Get()->get_engine();
+  auto data_md   = GetMemDesc(data);
+  auto weight_md = full_param.dnnl_param.quantized
+                       ? GetFCWeightDesc(weight, data.shape()[0], mshadow::kInt8)
+                       : GetFCWeightDesc(weight, data.shape()[0]);
+  auto propagation =
+      is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring;
+
+  dnnl::primitive_attr attr;
+  dnnl::post_ops ops;
+  if (full_param.dnnl_param.with_eltwise) {
+    ops.append_eltwise(full_param.eltwise_param.scale,
+                       full_param.eltwise_param.alg,
+                       full_param.eltwise_param.alpha,
+                       full_param.eltwise_param.beta);
+  }
+  attr.set_post_ops(ops);
+
+  if (full_param.dnnl_param.quantized && full_param.output_scales.size()) {
+    int mask = (full_param.output_scales.size() == 1) ? 0 : (1 << 1);
+    attr.set_output_scales(mask, full_param.output_scales);
+  }
+
+  auto GetFCFwdPd = [&full_param, &attr, &engine](const dnnl::inner_product_forward::desc& desc) {
+    try {
+      return dnnl::inner_product_forward::primitive_desc(desc, attr, engine);
+    } catch (dnnl::error& e) {
+      if (e.status == dnnl_unimplemented && full_param.dnnl_param.quantized) {
+        LOG(ERROR) << "AVX512-BW support or DNNL v0.18 is required for INT8 fully_connected.";
+      } else {
+        LOG(ERROR) << e.message;
+      }
+      throw;
+    }
+  };
+
+  if (bias) {
+    if ((*bias).shape().ndim() != 1)
+      LOG(FATAL) << "Unexpected shape for bias " << (*bias).shape();
+    auto bias_md =
+        full_param.dnnl_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias);
+    dnnl::inner_product_forward::desc desc(propagation, data_md, weight_md, bias_md, out_md);
+    return GetFCFwdPd(desc);
+  } else {
+    dnnl::inner_product_forward::desc desc(propagation, data_md, weight_md, out_md);
+    return GetFCFwdPd(desc);
+  }
+}
+
+inline static dnnl::inner_product_backward_data::primitive_desc GetFCBwdData(
+    const NDArray& data,
+    const NDArray& weight,
+    const NDArray& output,
+    dnnl::inner_product_forward::primitive_desc fwd_pd) {
+  auto data_md   = GetMemDesc(data);
+  auto weight_md = GetFCWeightDesc(weight, data.shape()[0]);
+  auto out_md    = GetMemDesc(output);
+  auto engine    = CpuEngine::Get()->get_engine();
+  dnnl::inner_product_backward_data::desc desc(data_md, weight_md, out_md);
+  return dnnl::inner_product_backward_data::primitive_desc(desc, engine, fwd_pd);
+}
+
+inline static dnnl::inner_product_backward_weights::primitive_desc GetFCBwdWeights(
+    const NDArray& data,
+    const NDArray& weight,
+    const NDArray* bias,
+    const NDArray& output,
+    dnnl::inner_product_forward::primitive_desc fwd_pd) {
+  auto data_md   = GetMemDesc(data);
+  auto weight_md = GetFCWeightDesc(weight, data.shape()[0]);
+  auto out_md    = GetMemDesc(output);
+  auto engine    = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    dnnl::inner_product_backward_weights::desc desc(data_md, weight_md, bias_md, out_md);
+    return dnnl::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    dnnl::inner_product_backward_weights::desc desc(data_md, weight_md, out_md);
+    return dnnl::inner_product_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  }
+}
+
+DNNLFullyConnectedForward& GetFCFwd(const FullyConnectedParam& param,
+                                    const bool is_train,
+                                    const NDArray& data,
+                                    const NDArray& weight,
+                                    const NDArray* bias,
+                                    const dnnl::memory::desc& out_md) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLFullyconSignature, DNNLFullyConnectedForward, OpHash>
+      fcFwds;
+#else
+  static MX_THREAD_LOCAL
+      std::unordered_map<DNNLFullyconSignature, DNNLFullyConnectedForward, OpHash>
+          fcFwds;
+#endif
+  DNNLFullyconSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(data);
+  key.AddSign(weight);
+  if (bias)
+    key.AddSign(*bias);
+
+  auto it = fcFwds.find(key);
+  if (it == fcFwds.end()) {
+    DNNLFCFullParam full_param;
+    full_param.default_param = param;
+    full_param.dnnl_param.Init(std::unordered_map<std::string, std::string>());
+    DNNLFullyConnectedForward fcFwd(full_param, is_train, data, weight, bias, out_md);
+    it = AddToCache(&fcFwds, key, fcFwd);
+  }
+  return it->second;
+}
+
+void DNNLFCFlattenData(const FullyConnectedParam& param,
+                       const NDArray& out_data,
+                       NDArray* in_data,
+                       dnnl::memory::desc* out_md) {
+  const mxnet::TShape ishape = in_data->shape();
+  const mxnet::TShape oshape = out_data.shape();
+  if (ishape.ndim() != 2) {
+    if (!param.flatten) {
+      *in_data = in_data->DNNLDataReshape(
+          Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1]));
+      dnnl::memory::dims out_dims{static_cast<int>(oshape.ProdShape(0, oshape.ndim() - 1)),
+                                  static_cast<int>(oshape[ishape.ndim() - 1])};
+      *out_md = dnnl::memory::desc(
+          out_dims, get_dnnl_type(out_data.dtype()), dnnl::memory::format_tag::any);
+    } else {
+      *in_data = in_data->DNNLDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
+      dnnl::memory::dims out_dims{static_cast<int>(oshape[0]),
+                                  static_cast<int>(oshape.ProdShape(1, oshape.ndim()))};
+      *out_md = dnnl::memory::desc(
+          out_dims, get_dnnl_type(out_data.dtype()), dnnl::memory::format_tag::any);
+    }
+  }
+}
+
+void DNNLFCForwardFullFeature(const DNNLFCFullParam& full_param,
+                              const OpContext& ctx,
+                              DNNLFullyConnectedForward* fwd,
+                              const std::vector<NDArray>& in_data,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  NDArray weight = in_data[fullc::kWeight];
+  NDArray data   = in_data[fullc::kData];
+
+  auto data_mem = data.GetDNNLDataReorder(fwd->fwd_pd.src_desc());
+  const dnnl::memory* weight_mem;
+  if (ctx.is_train) {
+    if (weight.IsDNNLData()) {
+      weight.Reorder2DefaultAsync();
+    }
+    weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1);
+  } else {
+    weight_mem = weight.GetDNNLData();
+    if (weight_mem->get_desc() != fwd->fwd_pd.weights_desc()) {
+      weight.DNNLDataReorderAsync(fwd->fwd_pd.weights_desc());
+      weight_mem = GetWeights(weight, fwd->fwd_pd.weights_desc(), 1);
+    }
+  }
+  auto out_mem =
+      CreateDNNLMem(out_data[fullc::kOut], fwd->fwd_pd.dst_desc(), req[fullc::kOut], &data);
+
+  dnnl_args_map_t args = {
+      {DNNL_ARG_SRC, *data_mem},
+      {DNNL_ARG_WEIGHTS, *weight_mem},
+      {DNNL_ARG_DST, *out_mem.second},
+  };
+  if (!full_param.default_param.no_bias) {
+    auto bias_mem       = in_data[fullc::kBias].GetDNNLDataReorder(fwd->fwd_pd.bias_desc());
+    args[DNNL_ARG_BIAS] = *bias_mem;
+  }
+  DNNLStream::Get()->RegisterPrimArgs(fwd->GetFwd(), args);
+  CommitOutput(out_data[fullc::kOut], out_mem);
+  DNNLStream::Get()->Submit();
+}
+
+void DNNLFCForward(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<NDArray>& in_data,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<NDArray>& out_data) {
+  DNNLFCFullParam full_param;
+  full_param.default_param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  full_param.dnnl_param.Init(std::unordered_map<std::string, std::string>());
+
+  NDArray data              = in_data[fullc::kData];
+  dnnl::memory::desc out_md = GetMemDesc(out_data[fullc::kOut]);
+  DNNLFCFlattenData(full_param.default_param, out_data[fullc::kOut], &data, &out_md);
+  auto& fwd = GetFCFwd(full_param.default_param,
+                       ctx.is_train,
+                       data,
+                       in_data[fullc::kWeight],
+                       full_param.default_param.no_bias ? nullptr : &in_data[fullc::kBias],
+                       out_md);
+  std::vector<NDArray> new_inputs;
+  if (full_param.default_param.no_bias)
+    new_inputs = {data, in_data[fullc::kWeight]};
+  else
+    new_inputs = {data, in_data[fullc::kWeight], in_data[fullc::kBias]};
+  DNNLFCForwardFullFeature(full_param, ctx, &fwd, new_inputs, req, out_data);
+}
+
+void DNNLFCBackward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<NDArray>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  const std::vector<NDArray>& in_grad = outputs;
+  DNNLFCFullParam full_param;
+  full_param.default_param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  full_param.dnnl_param.Init(std::unordered_map<std::string, std::string>());
+  const FullyConnectedParam& param = full_param.default_param;
+  const mxnet::TShape& ishape      = inputs[fullc::kData + 1].shape();
+  const mxnet::TShape& oshape      = inputs[fullc::kOut].shape();
+
+  NDArray weight = inputs[fullc::kWeight + 1];
+  NDArray data   = inputs[fullc::kData + 1];
+  if (data.shape().ndim() != 2 && !param.flatten)
+    data = data.DNNLDataReshape(
+        Shape2(ishape.ProdShape(0, ishape.ndim() - 1), ishape[ishape.ndim() - 1]));
+  else if (data.shape().ndim() != 2)
+    data = data.DNNLDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
+  NDArray out_grad = inputs[fullc::kOut];
+  if (out_grad.shape().ndim() != 2 && !param.flatten)
+    out_grad = out_grad.DNNLDataReshape(
+        Shape2(oshape.ProdShape(0, oshape.ndim() - 1), oshape[oshape.ndim() - 1]));
+  else if (out_grad.shape().ndim() != 2)
+    out_grad = out_grad.DNNLDataReshape(Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())));
+
+  dnnl::inner_product_forward::primitive_desc fwd_pd =
+      GetFCFwdImpl(full_param,
+                   ctx.is_train,
+                   data,
+                   weight,
+                   param.no_bias ? nullptr : &in_grad[fullc::kBias],
+                   GetMemDesc(out_grad));
+
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  if (req[fullc::kWeight]) {
+    dnnl::inner_product_backward_weights::primitive_desc ipBwdWeights_pd = GetFCBwdWeights(
+        data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias], out_grad, fwd_pd);
+    auto out_grad_mem   = out_grad.GetDNNLDataReorder(ipBwdWeights_pd.diff_dst_desc());
+    auto data_mem       = data.GetDNNLDataReorder(ipBwdWeights_pd.src_desc());
+    auto in_grad_weight = CreateDNNLWeightGrad(
+        in_grad[fullc::kWeight], ipBwdWeights_pd.diff_weights_desc(), req[fullc::kWeight]);
+    dnnl_args_map_t args = {
+        {DNNL_ARG_DIFF_DST, *out_grad_mem},
+        {DNNL_ARG_SRC, *data_mem},
+        {DNNL_ARG_DIFF_WEIGHTS, *in_grad_weight.second},
+    };
+
+    dnnl_output_t in_grad_bias;
+    if (!param.no_bias) {
+      in_grad_bias =
+          CreateDNNLMem(in_grad[fullc::kBias], ipBwdWeights_pd.diff_bias_desc(), req[fullc::kBias]);
+      args[DNNL_ARG_DIFF_BIAS] = *in_grad_bias.second;
+    }
+    DNNLStream::Get()->RegisterPrimArgs(dnnl::inner_product_backward_weights(ipBwdWeights_pd),
+                                        args);
+    CommitOutput(in_grad[fullc::kWeight], in_grad_weight);
+    if (!param.no_bias) {
+      CommitOutput(in_grad[fullc::kBias], in_grad_bias);
+    }
+  }
+  if (req[fullc::kData]) {
+    dnnl::inner_product_backward_data::primitive_desc ipBwdData_pd =
+        GetFCBwdData(data, weight, out_grad, fwd_pd);
+    auto out_grad_mem = out_grad.GetDNNLDataReorder(ipBwdData_pd.diff_dst_desc());
+    auto weight_mem   = weight.GetDNNLDataReorder(ipBwdData_pd.weights_desc());
+    auto in_grad_mem =
+        CreateDNNLMem(in_grad[fullc::kData], ipBwdData_pd.diff_src_desc(), req[fullc::kData]);
+    dnnl_args_map_t args = {{DNNL_ARG_DIFF_DST, *out_grad_mem},
+                            {DNNL_ARG_WEIGHTS, *weight_mem},
+                            {DNNL_ARG_DIFF_SRC, *in_grad_mem.second}};
+
+    DNNLStream::Get()->RegisterPrimArgs(dnnl::inner_product_backward_data(ipBwdData_pd), args);
+    CommitOutput(in_grad[fullc::kData], in_grad_mem);
+  }
+  DNNLStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_ONEDNN == 1
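
GetFCFwd() above builds each oneDNN inner-product primitive only once per thread and per configuration: the parameters and input arrays are folded into a signature key, and the resulting DNNLFullyConnectedForward is kept in a thread-local map. A minimal self-contained sketch of that caching pattern (Signature, FCForward and GetCachedFwd below are illustrative stand-ins, not MXNet APIs):

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    struct Signature {                      // flattened params + shapes, like ParamOpSign
      std::vector<int64_t> values;
      void AddSign(int64_t v) { values.push_back(v); }
      bool operator==(const Signature& o) const { return values == o.values; }
    };

    struct SignatureHash {
      size_t operator()(const Signature& s) const {
        size_t h = 0;
        for (int64_t v : s.values)          // simple hash-combine over the signature
          h ^= std::hash<int64_t>()(v) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
      }
    };

    struct FCForward {                      // stand-in for an expensive-to-build primitive
      explicit FCForward(int64_t cost) : cost(cost) {}
      int64_t cost;
    };

    FCForward& GetCachedFwd(int64_t batch, int64_t in_dim, int64_t out_dim, bool is_train) {
      static thread_local std::unordered_map<Signature, FCForward, SignatureHash> cache;
      Signature key;
      key.AddSign(batch);
      key.AddSign(in_dim);
      key.AddSign(out_dim);
      key.AddSign(is_train);
      auto it = cache.find(key);
      if (it == cache.end())                // construct only on a cache miss
        it = cache.emplace(key, FCForward(2 * batch * in_dim * out_dim)).first;
      return it->second;
    }

    int main() {
      FCForward& a = GetCachedFwd(128, 1024, 4096, false);
      FCForward& b = GetCachedFwd(128, 1024, 4096, false);  // same key -> cached object reused
      std::cout << (&a == &b) << " " << a.cost << "\n";      // prints: 1 1073741824
    }
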
diff --git a/src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h b/src/operator/nn/dnnl/dnnl_layer_norm-inl.h
similarity index 57%
rename from src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h
rename to src/operator/nn/dnnl/dnnl_layer_norm-inl.h
index a14673b..ccd3e9c 100644
--- a/src/operator/nn/mkldnn/mkldnn_layer_norm-inl.h
+++ b/src/operator/nn/dnnl/dnnl_layer_norm-inl.h
@@ -18,11 +18,11 @@
  */
 
 /*!
- * \file mkldnn_layer_norm-inl.h
+ * \file dnnl_layer_norm-inl.h
  * \author: Bartosz Kuncer, bartosz.kuncer@intel.com
  */
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 
@@ -30,31 +30,30 @@
 #include <vector>
 
 #include "../layer_norm-inl.h"
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 
-using layernorm_fwd_t    = mkldnn::layer_normalization_forward;
-using layernorm_fwd_pd_t = mkldnn::layer_normalization_forward::primitive_desc;
+using layernorm_fwd_t    = dnnl::layer_normalization_forward;
+using layernorm_fwd_pd_t = dnnl::layer_normalization_forward::primitive_desc;
 
-using layernorm_bwd_t    = mkldnn::layer_normalization_backward;
-using layernorm_bwd_pd_t = mkldnn::layer_normalization_backward::primitive_desc;
+using layernorm_bwd_t    = dnnl::layer_normalization_backward;
+using layernorm_bwd_pd_t = dnnl::layer_normalization_backward::primitive_desc;
 
 typedef ParamOpSign<LayerNormParam> LayerNormSignature;
 
-class MKLDNNLayerNormFwd {
+class DNNLLayerNormFwd {
  public:
-  static MKLDNNLayerNormFwd& GetCached(const LayerNormParam& param,
-                                       const OpContext& ctx,
-                                       const NDArray& data);
+  static DNNLLayerNormFwd& GetCached(const LayerNormParam& param,
+                                     const OpContext& ctx,
+                                     const NDArray& data);
 
-  MKLDNNLayerNormFwd(const LayerNormParam& param, const NDArray& data);
+  DNNLLayerNormFwd(const LayerNormParam& param, const NDArray& data);
 
-  static std::shared_ptr<layernorm_fwd_pd_t> CreatePrimitiveDesc(
-      const LayerNormParam& param,
-      const mkldnn::memory::desc& src_md);
+  static std::shared_ptr<layernorm_fwd_pd_t> CreatePrimitiveDesc(const LayerNormParam& param,
+                                                                 const dnnl::memory::desc& src_md);
 
   void Execute(const LayerNormParam& param,
                const OpContext& ctx,
@@ -62,34 +61,34 @@ class MKLDNNLayerNormFwd {
                const OpReqType& req,
                const std::vector<NDArray>& outputs) const;
 
-  ~MKLDNNLayerNormFwd() {}
+  ~DNNLLayerNormFwd() {}
 
  private:
   std::shared_ptr<layernorm_fwd_t> fwd;
   std::shared_ptr<layernorm_fwd_pd_t> fwd_pd;
 };
 
-class MKLDNNLayerNormBwd {
+class DNNLLayerNormBwd {
  public:
-  static MKLDNNLayerNormBwd& GetCached(const LayerNormParam& param,
-                                       const std::vector<NDArray>& inputs);
+  static DNNLLayerNormBwd& GetCached(const LayerNormParam& param,
+                                     const std::vector<NDArray>& inputs);
 
-  MKLDNNLayerNormBwd(const LayerNormParam& param,
-                     const std::vector<NDArray>& inputs,
-                     const mkldnn::memory::desc& data_md,
-                     const mkldnn::memory::desc& diff_md);
+  DNNLLayerNormBwd(const LayerNormParam& param,
+                   const std::vector<NDArray>& inputs,
+                   const dnnl::memory::desc& data_md,
+                   const dnnl::memory::desc& diff_md);
 
   static std::shared_ptr<layernorm_bwd_pd_t> CreatePrimitiveDesc(
       const LayerNormParam& param,
-      const mkldnn::memory::desc& data_md,
-      const mkldnn::memory::desc& diff_md,
+      const dnnl::memory::desc& data_md,
+      const dnnl::memory::desc& diff_md,
       const layernorm_fwd_pd_t& layernorm_fwd_pd);
 
   void Execute(const std::vector<NDArray>& inputs,
                const std::vector<NDArray>& outputs,
                const std::vector<OpReqType>& req) const;
 
-  ~MKLDNNLayerNormBwd() {}
+  ~DNNLLayerNormBwd() {}
 
  private:
   std::shared_ptr<layernorm_bwd_t> bwd;
@@ -100,4 +99,4 @@ class MKLDNNLayerNormBwd {
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LAYER_NORM_INL_H__
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_LAYER_NORM_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_layer_norm.cc b/src/operator/nn/dnnl/dnnl_layer_norm.cc
similarity index 52%
rename from src/operator/nn/mkldnn/mkldnn_layer_norm.cc
rename to src/operator/nn/dnnl/dnnl_layer_norm.cc
index 2b63319..2e720d0 100644
--- a/src/operator/nn/mkldnn/mkldnn_layer_norm.cc
+++ b/src/operator/nn/dnnl/dnnl_layer_norm.cc
@@ -18,29 +18,29 @@
  */
 
 /*!
- * \file mkldnn_layer_norm.cc
+ * \file dnnl_layer_norm.cc
  * \author: Bartosz Kuncer, bartosz.kuncer@intel.com
  */
 
 #if MXNET_USE_ONEDNN == 1
 
-#include "./mkldnn_layer_norm-inl.h"
+#include "./dnnl_layer_norm-inl.h"
 
 namespace mxnet {
 namespace op {
 
-bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs) {
+bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs) {
   const mxnet::TShape& shape = inputs[layernorm::kData].shape();
 
   // The native implementation (function LayerNormCPU) is faster than oneDNN's for small tensors.
   // The heuristic below, based on measurements on a CLX machine, decides whether a given shape is
   // better served by oneDNN or by the native implementation.
-  auto ShapeBetterForMKLDNN = [](const mxnet::TShape& shape) {
+  auto ShapeBetterForDNNL = [](const mxnet::TShape& shape) {
     constexpr size_t shapeLimit = 1024;
     return shape.Size() / shape[0] >= shapeLimit && shape[0] >= shapeLimit;
   };
 
-  return (ShapeBetterForMKLDNN(shape) &&
+  return (ShapeBetterForDNNL(shape) &&
           (GetRealAxis(param.axis, shape.ndim()) == shape.ndim() - 1) && (shape.ndim() >= 2) &&
           (shape.ndim() <= 5) &&
           (inputs[layernorm::kData].dtype() == mshadow::kFloat32 ||
@@ -49,20 +49,20 @@ bool SupportMKLDNNLayerNorm(const LayerNormParam& param, const std::vector<NDArr
           inputs[layernorm::kBeta].dtype() == mshadow::kFloat32);
 }
 
-void MKLDNNLayerNormForward(const nnvm::NodeAttrs& attrs,
-                            const OpContext& ctx,
-                            const std::vector<NDArray>& inputs,
-                            const std::vector<OpReqType>& req,
-                            const std::vector<NDArray>& outputs) {
+void DNNLLayerNormForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
   const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  const auto& fwd             = MKLDNNLayerNormFwd::GetCached(param, ctx, inputs[layernorm::kData]);
+  const auto& fwd             = DNNLLayerNormFwd::GetCached(param, ctx, inputs[layernorm::kData]);
   fwd.Execute(param, ctx, inputs, req[layernorm::kOut], outputs);
 }
 
-MKLDNNLayerNormFwd& MKLDNNLayerNormFwd::GetCached(const LayerNormParam& param,
-                                                  const OpContext& ctx,
-                                                  const NDArray& data) {
-  using layernorm_fwd_map = std::unordered_map<LayerNormSignature, MKLDNNLayerNormFwd, OpHash>;
+DNNLLayerNormFwd& DNNLLayerNormFwd::GetCached(const LayerNormParam& param,
+                                              const OpContext& ctx,
+                                              const NDArray& data) {
+  using layernorm_fwd_map = std::unordered_map<LayerNormSignature, DNNLLayerNormFwd, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local layernorm_fwd_map layer_norm_fwds;
 #else
@@ -74,52 +74,52 @@ MKLDNNLayerNormFwd& MKLDNNLayerNormFwd::GetCached(const LayerNormParam& param,
 
   auto it = layer_norm_fwds.find(key);
   if (it == layer_norm_fwds.end()) {
-    MKLDNNLayerNormFwd fwd(param, data);
+    DNNLLayerNormFwd fwd(param, data);
     it = AddToCache(&layer_norm_fwds, key, fwd);
   }
   return it->second;
 }
 
-MKLDNNLayerNormFwd::MKLDNNLayerNormFwd(const LayerNormParam& param, const NDArray& data) {
-  const mkldnn::memory::desc data_md = data.GetMKLDNNData()->get_desc();
-  fwd_pd                             = CreatePrimitiveDesc(param, data_md);
-  fwd                                = std::make_shared<layernorm_fwd_t>(*fwd_pd);
+DNNLLayerNormFwd::DNNLLayerNormFwd(const LayerNormParam& param, const NDArray& data) {
+  const dnnl::memory::desc data_md = data.GetDNNLData()->get_desc();
+  fwd_pd                           = CreatePrimitiveDesc(param, data_md);
+  fwd                              = std::make_shared<layernorm_fwd_t>(*fwd_pd);
 }
 
-std::shared_ptr<layernorm_fwd_pd_t> MKLDNNLayerNormFwd::CreatePrimitiveDesc(
+std::shared_ptr<layernorm_fwd_pd_t> DNNLLayerNormFwd::CreatePrimitiveDesc(
     const LayerNormParam& param,
-    const mkldnn::memory::desc& src_md) {
-  layernorm_fwd_t::desc fwd_desc(mkldnn::prop_kind::forward_training,
+    const dnnl::memory::desc& src_md) {
+  layernorm_fwd_t::desc fwd_desc(dnnl::prop_kind::forward_training,
                                  src_md,
                                  param.eps,
                                  dnnl::normalization_flags::use_scale_shift);
-  mkldnn::engine& engine = CpuEngine::Get()->get_engine();
+  dnnl::engine& engine = CpuEngine::Get()->get_engine();
   return std::make_shared<layernorm_fwd_pd_t>(fwd_desc, engine);
 }
 
-inline mkldnn::memory::desc GetMeanVarDesc(const mkldnn::memory::data_type& dtype,
-                                           const mxnet::TShape& _shape) {
+inline dnnl::memory::desc GetMeanVarDesc(const dnnl::memory::data_type& dtype,
+                                         const mxnet::TShape& _shape) {
   const auto ndim = _shape.ndim();
 
-  mkldnn::memory::dims shape(ndim, 1), strides(ndim, 1);
+  dnnl::memory::dims shape(ndim, 1), strides(ndim, 1);
   shape[0] = _shape[0];
   for (int i = ndim - 1; i > 0; --i) {
     shape[i]       = _shape[i];
     strides[i - 1] = strides[i] * shape[i];
   }
 
-  return mkldnn::memory::desc{shape, dtype, strides};
+  return dnnl::memory::desc{shape, dtype, strides};
 }
 
-inline mkldnn::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta) {
+inline dnnl::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta) {
   // oneDNN takes gamma and beta as one SCALE_SHIFT tensor when both scale and shift are used. In
   // MXNet, scale is called gamma and shift is called beta.
   constexpr size_t gammaAndBeta = 2;
   CHECK_EQ(gamma.shape()[0], beta.shape()[0]);
-  const mkldnn::memory::desc scale_shift_md(mkldnn::memory::dims{gammaAndBeta, gamma.shape()[0]},
-                                            get_mkldnn_type(gamma.dtype()),
-                                            mkldnn::memory::format_tag::nc);
-  auto scale_shift_mem = mkldnn::memory(scale_shift_md, CpuEngine::Get()->get_engine());
+  const dnnl::memory::desc scale_shift_md(dnnl::memory::dims{gammaAndBeta, gamma.shape()[0]},
+                                          get_dnnl_type(gamma.dtype()),
+                                          dnnl::memory::format_tag::nc);
+  auto scale_shift_mem = dnnl::memory(scale_shift_md, CpuEngine::Get()->get_engine());
   char* ptr            = reinterpret_cast<char*>(scale_shift_mem.get_data_handle());
   const size_t bytes   = scale_shift_md.get_size() / gammaAndBeta;
   memcpy(ptr, gamma.data().dptr_, bytes);
@@ -127,62 +127,60 @@ inline mkldnn::memory GetScaleShiftMem(const NDArray& gamma, const NDArray& beta
   return scale_shift_mem;
 }
 
-void MKLDNNLayerNormFwd::Execute(const LayerNormParam& param,
-                                 const OpContext& ctx,
-                                 const std::vector<NDArray>& inputs,
-                                 const OpReqType& req,
-                                 const std::vector<NDArray>& outputs) const {
-  auto mean_var_md = GetMeanVarDesc(get_mkldnn_type(outputs[layernorm::kMean].dtype()),
+void DNNLLayerNormFwd::Execute(const LayerNormParam& param,
+                               const OpContext& ctx,
+                               const std::vector<NDArray>& inputs,
+                               const OpReqType& req,
+                               const std::vector<NDArray>& outputs) const {
+  auto mean_var_md = GetMeanVarDesc(get_dnnl_type(outputs[layernorm::kMean].dtype()),
                                     outputs[layernorm::kMean].shape());
-  auto mean_mem    = mkldnn_output_t(
-      OutDataOp::Noop,
-      const_cast<NDArray&>(outputs[layernorm::kMean]).CreateMKLDNNData(mean_var_md));
-  auto variance_mem =
-      mkldnn_output_t(OutDataOp::Noop,
-                      const_cast<NDArray&>(outputs[layernorm::kStd]).CreateMKLDNNData(mean_var_md));
+  auto mean_mem    = dnnl_output_t(
+      OutDataOp::Noop, const_cast<NDArray&>(outputs[layernorm::kMean]).CreateDNNLData(mean_var_md));
+  auto variance_mem = dnnl_output_t(
+      OutDataOp::Noop, const_cast<NDArray&>(outputs[layernorm::kStd]).CreateDNNLData(mean_var_md));
 
-  auto output_mem      = CreateMKLDNNMem(outputs[layernorm::kOut], fwd_pd->dst_desc(), req);
+  auto output_mem      = CreateDNNLMem(outputs[layernorm::kOut], fwd_pd->dst_desc(), req);
   auto scale_shift_mem = GetScaleShiftMem(inputs[layernorm::kGamma], inputs[layernorm::kBeta]);
 
-  mkldnn_args_map_t args = {{MKLDNN_ARG_SRC, *inputs[layernorm::kData].GetMKLDNNData()},
-                            {MKLDNN_ARG_DST, *output_mem.second},
-                            {MKLDNN_ARG_MEAN, *mean_mem.second},
-                            {MKLDNN_ARG_VARIANCE, *variance_mem.second},
-                            {MKLDNN_ARG_SCALE_SHIFT, scale_shift_mem}};
+  dnnl_args_map_t args = {{DNNL_ARG_SRC, *inputs[layernorm::kData].GetDNNLData()},
+                          {DNNL_ARG_DST, *output_mem.second},
+                          {DNNL_ARG_MEAN, *mean_mem.second},
+                          {DNNL_ARG_VARIANCE, *variance_mem.second},
+                          {DNNL_ARG_SCALE_SHIFT, scale_shift_mem}};
 
-  MKLDNNStream::Get()->RegisterPrimArgs(*fwd, args);
+  DNNLStream::Get()->RegisterPrimArgs(*fwd, args);
   CommitOutput(outputs[layernorm::kOut], output_mem);
   CommitOutput(outputs[layernorm::kMean], mean_mem);
   CommitOutput(outputs[layernorm::kStd], variance_mem);
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
 }
 
-MKLDNNLayerNormBwd::MKLDNNLayerNormBwd(const LayerNormParam& param,
-                                       const std::vector<NDArray>& inputs,
-                                       const mkldnn::memory::desc& data_md,
-                                       const mkldnn::memory::desc& diff_md)
-    : fwd_pd(MKLDNNLayerNormFwd::CreatePrimitiveDesc(param, data_md)),
+DNNLLayerNormBwd::DNNLLayerNormBwd(const LayerNormParam& param,
+                                   const std::vector<NDArray>& inputs,
+                                   const dnnl::memory::desc& data_md,
+                                   const dnnl::memory::desc& diff_md)
+    : fwd_pd(DNNLLayerNormFwd::CreatePrimitiveDesc(param, data_md)),
       bwd_pd(CreatePrimitiveDesc(param, data_md, diff_md, *fwd_pd)) {
   bwd = std::make_shared<layernorm_bwd_t>(*bwd_pd);
 }
 
-std::shared_ptr<layernorm_bwd_pd_t> MKLDNNLayerNormBwd::CreatePrimitiveDesc(
+std::shared_ptr<layernorm_bwd_pd_t> DNNLLayerNormBwd::CreatePrimitiveDesc(
     const LayerNormParam& param,
-    const mkldnn::memory::desc& data_md,
-    const mkldnn::memory::desc& diff_md,
+    const dnnl::memory::desc& data_md,
+    const dnnl::memory::desc& diff_md,
     const layernorm_fwd_pd_t& layernorm_fwd_pd) {
   layernorm_bwd_t::desc layernorm_bwd_desc(dnnl::prop_kind::backward,
                                            diff_md,
                                            data_md,
                                            param.eps,
                                            dnnl::normalization_flags::use_scale_shift);
-  mkldnn::engine& engine = CpuEngine::Get()->get_engine();
+  dnnl::engine& engine = CpuEngine::Get()->get_engine();
   return std::make_shared<layernorm_bwd_pd_t>(layernorm_bwd_desc, engine, layernorm_fwd_pd);
 }
 
-void MKLDNNLayerNormBwd::Execute(const std::vector<NDArray>& inputs,
-                                 const std::vector<NDArray>& outputs,
-                                 const std::vector<OpReqType>& req) const {
+void DNNLLayerNormBwd::Execute(const std::vector<NDArray>& inputs,
+                               const std::vector<NDArray>& outputs,
+                               const std::vector<OpReqType>& req) const {
   auto scale_shift_mem =
       GetScaleShiftMem(inputs[layernorm::kBwdGamma], inputs[layernorm::kBwdBeta]);
   auto diff_weights_ndarray = NDArray(scale_shift_mem.get_desc());
@@ -197,21 +195,21 @@ void MKLDNNLayerNormBwd::Execute(const std::vector<NDArray>& inputs,
            outputs[layernorm::kBwdBetaGrad].data().dptr_,
            bytes);
   }
-  mkldnn_output_t diff_src_mem = CreateMKLDNNMem(
+  dnnl_output_t diff_src_mem = CreateDNNLMem(
       outputs[layernorm::kBwdDataGrad], bwd_pd->diff_src_desc(), req[layernorm::kBwdDataGrad]);
-  mkldnn_output_t diff_weights_mem = CreateMKLDNNMem(
+  dnnl_output_t diff_weights_mem = CreateDNNLMem(
       diff_weights_ndarray, bwd_pd->diff_weights_desc(), req[layernorm::kBwdGammaGrad]);
-  mkldnn_args_map_t args = {{MKLDNN_ARG_DIFF_DST, *inputs[layernorm::kBwdOutGrad].GetMKLDNNData()},
-                            {MKLDNN_ARG_SRC, *inputs[layernorm::kBwdData].GetMKLDNNData()},
-                            {MKLDNN_ARG_SCALE_SHIFT, scale_shift_mem},
-                            {MKLDNN_ARG_MEAN, *inputs[layernorm::kBwdMean].GetMKLDNNData()},
-                            {MKLDNN_ARG_VARIANCE, *inputs[layernorm::kBwdStd].GetMKLDNNData()},
-                            {MKLDNN_ARG_DIFF_SRC, *diff_src_mem.second},
-                            {MKLDNN_ARG_DIFF_SCALE_SHIFT, *diff_weights_mem.second}};
-  MKLDNNStream::Get()->RegisterPrimArgs(*bwd, args);
+  dnnl_args_map_t args = {{DNNL_ARG_DIFF_DST, *inputs[layernorm::kBwdOutGrad].GetDNNLData()},
+                          {DNNL_ARG_SRC, *inputs[layernorm::kBwdData].GetDNNLData()},
+                          {DNNL_ARG_SCALE_SHIFT, scale_shift_mem},
+                          {DNNL_ARG_MEAN, *inputs[layernorm::kBwdMean].GetDNNLData()},
+                          {DNNL_ARG_VARIANCE, *inputs[layernorm::kBwdStd].GetDNNLData()},
+                          {DNNL_ARG_DIFF_SRC, *diff_src_mem.second},
+                          {DNNL_ARG_DIFF_SCALE_SHIFT, *diff_weights_mem.second}};
+  DNNLStream::Get()->RegisterPrimArgs(*bwd, args);
   CommitOutput(outputs[layernorm::kBwdDataGrad], diff_src_mem);
   CommitOutput(diff_weights_ndarray, diff_weights_mem);
-  MKLDNNStream::Get()->Submit();
+  DNNLStream::Get()->Submit();
   // Commit scale_shift diff
   memcpy(outputs[layernorm::kBwdGammaGrad].data().dptr_, diff_weights_ndarray.data().dptr_, bytes);
   memcpy(outputs[layernorm::kBwdBetaGrad].data().dptr_,
@@ -219,9 +217,9 @@ void MKLDNNLayerNormBwd::Execute(const std::vector<NDArray>& inputs,
          bytes);
 }
 
-MKLDNNLayerNormBwd& MKLDNNLayerNormBwd::GetCached(const LayerNormParam& param,
-                                                  const std::vector<NDArray>& inputs) {
-  using layernorm_bwd_map = std::unordered_map<LayerNormSignature, MKLDNNLayerNormBwd, OpHash>;
+DNNLLayerNormBwd& DNNLLayerNormBwd::GetCached(const LayerNormParam& param,
+                                              const std::vector<NDArray>& inputs) {
+  using layernorm_bwd_map = std::unordered_map<LayerNormSignature, DNNLLayerNormBwd, OpHash>;
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local layernorm_bwd_map layer_norm_bwds;
 #else
@@ -237,21 +235,21 @@ MKLDNNLayerNormBwd& MKLDNNLayerNormBwd::GetCached(const LayerNormParam& param,
 
   auto it = layer_norm_bwds.find(key);
   if (it == layer_norm_bwds.end()) {
-    const mkldnn::memory::desc data_md = inputs[layernorm::kBwdData].GetMKLDNNData()->get_desc();
-    const mkldnn::memory::desc diff_md = inputs[layernorm::kBwdOutGrad].GetMKLDNNData()->get_desc();
-    MKLDNNLayerNormBwd bwd(param, inputs, data_md, diff_md);
+    const dnnl::memory::desc data_md = inputs[layernorm::kBwdData].GetDNNLData()->get_desc();
+    const dnnl::memory::desc diff_md = inputs[layernorm::kBwdOutGrad].GetDNNLData()->get_desc();
+    DNNLLayerNormBwd bwd(param, inputs, data_md, diff_md);
     it = AddToCache(&layer_norm_bwds, key, bwd);
   }
   return it->second;
 }
 
-void MKLDNNLayerNormBackward(const nnvm::NodeAttrs& attrs,
-                             const OpContext& ctx,
-                             const std::vector<NDArray>& inputs,
-                             const std::vector<OpReqType>& req,
-                             const std::vector<NDArray>& outputs) {
+void DNNLLayerNormBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs) {
   const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  MKLDNNLayerNormBwd& bwd     = MKLDNNLayerNormBwd::GetCached(param, inputs);
+  DNNLLayerNormBwd& bwd       = DNNLLayerNormBwd::GetCached(param, inputs);
   bwd.Execute(inputs, outputs, req);
 }
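
The ShapeBetterForDNNL heuristic in SupportDNNLLayerNorm() above routes small tensors to the native LayerNormCPU kernel and large ones to oneDNN: both the per-instance size and the leading dimension must reach 1024. A standalone sketch of that predicate with two worked inputs (illustrative only, not MXNet code):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    bool ShapeBetterForDNNL(const std::vector<int64_t>& shape) {
      constexpr int64_t kShapeLimit = 1024;
      int64_t total = 1;
      for (int64_t d : shape) total *= d;                     // shape.Size()
      return total / shape[0] >= kShapeLimit && shape[0] >= kShapeLimit;
    }

    int main() {
      std::cout << ShapeBetterForDNNL({2048, 1024}) << "\n";  // 1 -> oneDNN layer_normalization
      std::cout << ShapeBetterForDNNL({32, 768}) << "\n";     // 0 -> native LayerNormCPU
    }
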
 
diff --git a/src/operator/nn/dnnl/dnnl_log_softmax.cc b/src/operator/nn/dnnl/dnnl_log_softmax.cc
new file mode 100644
index 0000000..9408e60
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_log_softmax.cc
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_log_softmax.cc
+ * \brief Implementation of log_softmax function with DNNL support
+ */
+
+#include "../softmax-inl.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+
+#if MXNET_USE_ONEDNN == 1
+namespace mxnet {
+namespace op {
+
+static dnnl::logsoftmax_forward::primitive_desc GetLogSoftmaxFwdPd(bool is_train,
+                                                                   const int axis,
+                                                                   const dnnl::memory& input_mem) {
+  dnnl::memory::desc data_md = input_mem.get_desc();
+  auto cpu_engine            = CpuEngine::Get()->get_engine();
+  auto prop = is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring;
+  auto desc = dnnl::logsoftmax_forward::desc(prop, data_md, axis);
+  return dnnl::logsoftmax_forward::primitive_desc(desc, cpu_engine);
+}
+
+static dnnl::logsoftmax_backward::primitive_desc GetLogSoftmaxBwdPd(
+    const dnnl::memory& diff_mem,
+    const dnnl::memory& data_mem,
+    const int axis,
+    const dnnl::logsoftmax_forward::primitive_desc& hint_fwd_pd) {
+  dnnl::memory::desc diff_md = diff_mem.get_desc();
+  dnnl::memory::desc data_md = data_mem.get_desc();
+  auto cpu_engine            = CpuEngine::Get()->get_engine();
+  auto desc                  = dnnl::logsoftmax_backward::desc(diff_md, data_md, axis);
+  return dnnl::logsoftmax_backward::primitive_desc(desc, cpu_engine, hint_fwd_pd);
+}
+
+bool SupportDNNLLogSoftmax(const SoftmaxParam& param, const NDArray& data, const NDArray& output) {
+  const int ndim      = data.shape().ndim();
+  const int in_dtype  = data.dtype();
+  const int out_dtype = output.dtype();
+  const int axis      = CheckAxis(param.axis, ndim);
+  // DNNL does not currently support the temperature argument in its log_softmax function.
+  // Update this once support is added.
+  // Currently, DNNL shows poor performance when log_softmax is not performed on the last dimension.
+  if (param.temperature.has_value() || in_dtype != mshadow::kFloat32 || in_dtype != out_dtype ||
+      axis != (ndim - 1)) {
+    return false;
+  }
+
+  // only supports ndim = 1, 2, 3, 4 for now
+  return (ndim >= 1 && ndim <= 4);
+}
+
+class DNNLLogSoftmaxFwd {
+ public:
+  dnnl::logsoftmax_forward::primitive_desc pd;
+
+  DNNLLogSoftmaxFwd(const bool is_train, const int axis, const dnnl::memory& input)
+      : pd(GetLogSoftmaxFwdPd(is_train, axis, input)) {
+    fwd_ = std::make_shared<dnnl::logsoftmax_forward>(pd);
+  }
+
+  const dnnl::logsoftmax_forward& GetFwd() const {
+    return *fwd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::logsoftmax_forward> fwd_;
+};
+
+typedef ParamOpSign<SoftmaxParam> DNNLSoftmaxSignature;
+
+static DNNLLogSoftmaxFwd& GetLogSoftmaxFwd(const SoftmaxParam& param,
+                                           const int real_axis,
+                                           const bool is_train,
+                                           const NDArray& data,
+                                           const NDArray& output) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLSoftmaxSignature, DNNLLogSoftmaxFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLSoftmaxSignature, DNNLLogSoftmaxFwd, OpHash> fwds;
+#endif
+
+  DNNLSoftmaxSignature key(param);
+  key.AddSign(real_axis);
+  key.AddSign(is_train);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    DNNLLogSoftmaxFwd fwd(is_train, real_axis, *(data.GetDNNLData()));
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLLogSoftmaxForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const NDArray& in_data,
+                           const OpReqType& req,
+                           const NDArray& out_data) {
+  if (req == kNullOp)
+    return;
+  // same as the FCompute path, log_softmax only supports kWriteTo and kWriteInplace for now.
+  CHECK_NE(req, kAddTo);
+
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  int axis                  = CheckAxis(param.axis, in_data.shape().ndim());
+  auto fwd                  = GetLogSoftmaxFwd(param, axis, ctx.is_train, in_data, out_data);
+
+  auto in_mem        = in_data.GetDNNLData();
+  auto out_mem       = out_data.GetDNNLData(fwd.pd.dst_desc());
+  DNNLStream* stream = DNNLStream::Get();
+  stream->RegisterPrimArgs(fwd.GetFwd(), {{DNNL_ARG_SRC, *in_mem}, {DNNL_ARG_DST, *out_mem}});
+  stream->Submit();
+}
+
+class DNNLLogSoftmaxBwd {
+ public:
+  dnnl::logsoftmax_backward::primitive_desc pd;
+
+  DNNLLogSoftmaxBwd(const dnnl::memory& diff_mem,
+                    const dnnl::memory& data_mem,
+                    const int axis,
+                    const dnnl::logsoftmax_forward::primitive_desc& hint_fwd_pd)
+      : pd(GetLogSoftmaxBwdPd(diff_mem, data_mem, axis, hint_fwd_pd)) {
+    bwd_ = std::make_shared<dnnl::logsoftmax_backward>(pd);
+  }
+
+  const dnnl::logsoftmax_backward& GetBwd() const {
+    return *bwd_;
+  }
+
+ private:
+  std::shared_ptr<dnnl::logsoftmax_backward> bwd_;
+};
+
+static DNNLLogSoftmaxBwd& GetLogSoftmaxBwd(const SoftmaxParam& param,
+                                           const int real_axis,
+                                           const std::vector<NDArray>& data,
+                                           const std::vector<NDArray>& output) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLSoftmaxSignature, DNNLLogSoftmaxBwd, OpHash> bwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLSoftmaxSignature, DNNLLogSoftmaxBwd, OpHash> bwds;
+#endif
+
+  DNNLSoftmaxSignature key(param);
+  key.AddSign(real_axis);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = bwds.find(key);
+  if (it == bwds.end()) {
+    auto diff_mem = data[0].GetDNNLData();
+    auto data_mem = data[1].GetDNNLData();
+    auto fwd_pd   = GetLogSoftmaxFwdPd(true, real_axis, *data_mem);
+    DNNLLogSoftmaxBwd bwd(*diff_mem, *data_mem, real_axis, fwd_pd);
+    it = AddToCache(&bwds, key, bwd);
+  }
+  return it->second;
+}
+
+void DNNLLogSoftmaxBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& out_data) {
+  if (req[0] == kNullOp)
+    return;
+  CHECK_EQ(in_data.size(), 2U);
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  int axis                  = CheckAxis(param.axis, in_data[1].shape().ndim());
+  auto diff_mem             = in_data[0].GetDNNLData();
+  auto data_mem             = in_data[1].GetDNNLData();
+  auto bwd                  = GetLogSoftmaxBwd(param, axis, in_data, out_data);
+
+  auto out_mem         = CreateDNNLMem(out_data[0], bwd.pd.diff_src_desc(), req[0]);
+  DNNLStream* stream   = DNNLStream::Get();
+  dnnl_args_map_t args = {{DNNL_ARG_DST, *data_mem},
+                          {DNNL_ARG_DIFF_DST, *diff_mem},
+                          {DNNL_ARG_DIFF_SRC, *out_mem.second}};
+
+  stream->RegisterPrimArgs(bwd.GetBwd(), args);
+  CommitOutput(out_data[0], out_mem);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
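
The logsoftmax primitive wrapped above computes log(softmax(x)) along the chosen axis. A plain scalar reference for the last-axis case, using the usual max-subtraction trick for numerical stability (a standalone sketch, not the oneDNN implementation):

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>

    std::vector<double> LogSoftmaxLastAxis(const std::vector<double>& x) {
      const double m = *std::max_element(x.begin(), x.end());  // subtract max for stability
      double sum = 0.0;
      for (double v : x) sum += std::exp(v - m);
      const double log_sum = std::log(sum);
      std::vector<double> y(x.size());
      for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] - m - log_sum;             // log(exp(x_i) / sum_j exp(x_j))
      return y;
    }

    int main() {
      for (double v : LogSoftmaxLastAxis({1.0, 2.0, 3.0}))
        std::cout << v << " ";                 // approx. -2.4076 -1.4076 -0.4076
      std::cout << "\n";
    }
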
diff --git a/src/operator/nn/dnnl/dnnl_lrn-inl.h b/src/operator/nn/dnnl/dnnl_lrn-inl.h
new file mode 100644
index 0000000..842705b
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_lrn-inl.h
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_lrn-inl.h
+ * \brief
+ * \author Patric Zhao, patric.zhao@intel.com
+ */
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+#include <dnnl.hpp>
+#include <utility>
+#include <vector>
+
+#include "../lrn-inl.h"
+#include "./dnnl_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+inline dnnl::algorithm GetDNNLLRNAlgo(const LRNParam& param) {
+  // TODO(Patric): lrn_within_channel causes a core dump in the DNNL backward pass.
+  //               Need to confirm with the DNNL team and fix this later.
+  return dnnl::algorithm::lrn_across_channels;
+}
+
+inline dnnl::lrn_forward::primitive_desc GetLRNFwdDesc(const LRNParam& param,
+                                                       const bool is_train,
+                                                       const dnnl::memory::desc& src_md) {
+  dnnl::engine& engine      = CpuEngine::Get()->get_engine();
+  const dnnl::algorithm alg = GetDNNLLRNAlgo(param);
+  const float alpha         = param.alpha;
+  const float beta          = param.beta;
+  const int nsize           = param.nsize;
+  const float k             = param.knorm;
+  auto kind                 = dnnl::prop_kind::forward_training;
+  if (is_train) {
+    kind = dnnl::prop_kind::forward_training;
+  } else {
+    kind = dnnl::prop_kind::forward_scoring;
+  }
+  dnnl::lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k);
+  return dnnl::lrn_forward::primitive_desc(fwd_desc, engine);
+}
+
+inline dnnl::lrn_backward::primitive_desc GetLRNBwdDesc(
+    const LRNParam& param,
+    const dnnl::memory::desc& data_in_md,
+    const dnnl::memory::desc& diff_md,
+    const dnnl::lrn_forward::primitive_desc& lrnFwd_desc) {
+  dnnl::engine& engine      = CpuEngine::Get()->get_engine();
+  const dnnl::algorithm alg = GetDNNLLRNAlgo(param);
+  const float alpha         = param.alpha;
+  const float beta          = param.beta;
+  const int nsize           = param.nsize;
+  const float k             = param.knorm;
+
+  dnnl::lrn_backward::desc lrnBwd_desc(alg, data_in_md, diff_md, nsize, alpha, beta, k);
+  return dnnl::lrn_backward::primitive_desc(lrnBwd_desc, engine, lrnFwd_desc);
+}
+
+typedef ParamOpSign<LRNParam> DNNLLRNSignature;
+
+// LRN Forward Class
+class DNNLLRNFwd {
+ public:
+  DNNLLRNFwd(const LRNParam& param, bool is_train, const NDArray& in_data) {
+    _Init(param, is_train, in_data);
+  }
+
+  ~DNNLLRNFwd() {}
+
+  void Execute(const OpContext& ctx,
+               const NDArray& in_data,
+               const OpReqType req,
+               const NDArray& out_data);
+
+  dnnl::lrn_forward& GetFwd();
+  const dnnl::memory* GetWs();
+  dnnl::lrn_forward::primitive_desc& GetFwdPd();
+
+ private:
+  std::shared_ptr<dnnl::lrn_forward> fwd;
+  dnnl::lrn_forward::primitive_desc fwd_pd;
+
+ private:
+  void _Init(const LRNParam& param, bool is_train, const NDArray& in_data);
+};  // End of LRN Forward Class
+
+void DNNLLRNFwd::_Init(const LRNParam& param, bool is_train, const NDArray& in_data) {
+  dnnl::memory::desc in_data_md = in_data.GetDNNLData()->get_desc();
+  this->fwd_pd                  = GetLRNFwdDesc(param, is_train, in_data_md);
+
+  this->fwd = std::shared_ptr<dnnl::lrn_forward>(new dnnl::lrn_forward(this->fwd_pd));
+}
+
+void DNNLLRNFwd::Execute(const OpContext& ctx,
+                         const NDArray& in_data,
+                         const OpReqType req,
+                         const NDArray& out_data) {
+  auto output_mem_t = CreateDNNLMem(out_data, (this->fwd_pd).dst_desc(), req);
+
+  dnnl_args_map_t args = {
+      {DNNL_ARG_SRC, *in_data.GetDNNLData()},
+      {DNNL_ARG_DST, *output_mem_t.second},
+  };
+  std::shared_ptr<dnnl::memory> workspace;
+  if (ctx.is_train) {
+    auto engine = CpuEngine::Get()->get_engine();
+    workspace   = std::make_shared<dnnl::memory>((this->fwd_pd).workspace_desc(), engine);
+    args[DNNL_ARG_WORKSPACE] = *(workspace);
+  }
+  DNNLStream::Get()->RegisterPrimArgs(*(this->fwd), args);
+  CommitOutput(out_data, output_mem_t);
+  DNNLStream::Get()->Submit();
+}
+
+dnnl::lrn_forward& DNNLLRNFwd::GetFwd() {
+  return *this->fwd;
+}
+dnnl::lrn_forward::primitive_desc& DNNLLRNFwd::GetFwdPd() {
+  return this->fwd_pd;
+}
+
+// End of LRN Class and its functions
+
+static DNNLLRNFwd& GetLRNFwd(const LRNParam& param, const OpContext& ctx, const NDArray& in_data) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLLRNSignature, DNNLLRNFwd, OpHash> lrn_fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLLRNSignature, DNNLLRNFwd, OpHash> lrn_fwds;
+#endif
+  auto kind_ = ctx.is_train ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_scoring;
+
+  DNNLLRNSignature key(param);
+  key.AddSign(static_cast<int>(kind_));
+  key.AddSign(in_data);
+
+  auto it = lrn_fwds.find(key);
+  if (it == lrn_fwds.end()) {
+    DNNLLRNFwd fwd(param, ctx.is_train, in_data);
+    it = AddToCache(&lrn_fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLLRNForward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const NDArray& in_data,
+                    const OpReqType req,
+                    const NDArray& out_data) {
+  const LRNParam& param = nnvm::get<LRNParam>(attrs.parsed);
+  auto in_buffer        = in_data;
+  if (in_buffer.IsView() && in_buffer.IsDNNLData())
+    in_buffer = in_buffer.Reorder2Default();
+  DNNLLRNFwd fwd = GetLRNFwd(param, ctx, in_buffer);
+  fwd.Execute(ctx, in_buffer, req, out_data);
+}
+
+// LRN Backward Class
+class DNNLLRNBwd {
+  std::shared_ptr<dnnl::lrn_backward> bwd;
+
+ public:
+  const dnnl::lrn_forward::primitive_desc fwd_pd;
+  const dnnl::lrn_backward::primitive_desc bwd_pd;
+
+  ~DNNLLRNBwd() {}
+
+  DNNLLRNBwd(const LRNParam& param,
+             const dnnl::memory::desc in_data_md,
+             const dnnl::memory::desc diff_md)
+      : fwd_pd(GetLRNFwdDesc(param, true, in_data_md)),
+        bwd_pd(GetLRNBwdDesc(param, in_data_md, diff_md, this->fwd_pd)) {
+    bwd = std::make_shared<dnnl::lrn_backward>(bwd_pd);
+  }
+
+  const dnnl::lrn_backward& GetBwd() const {
+    return *bwd;
+  }
+
+  void Execute(const NDArray& out_grad,
+               const NDArray& in_data,
+               const NDArray& in_grad,
+               const dnnl_output_t& diff_src_mem) {
+    auto engine          = CpuEngine::Get()->get_engine();
+    auto workspace       = std::make_shared<dnnl::memory>((this->fwd_pd).workspace_desc(), engine);
+    dnnl_args_map_t args = {{DNNL_ARG_SRC, *in_data.GetDNNLData()},
+                            {DNNL_ARG_DIFF_DST, *out_grad.GetDNNLData()},
+                            {DNNL_ARG_WORKSPACE, *workspace},
+                            {DNNL_ARG_DIFF_SRC, *diff_src_mem.second}};
+    DNNLStream::Get()->RegisterPrimArgs(*(this->bwd), args);
+    CommitOutput(in_grad, diff_src_mem);
+    DNNLStream::Get()->Submit();
+  }
+};  // End of LRN Class
+
+static DNNLLRNBwd& GetLRNBwd(const LRNParam& param,
+                             const NDArray& in_data,
+                             const NDArray& in_grad,
+                             const NDArray& out_grad) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLLRNSignature, DNNLLRNBwd, OpHash> lrn_bwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLLRNSignature, DNNLLRNBwd, OpHash> lrn_bwds;
+#endif
+  DNNLLRNSignature key(param);
+  key.AddSign(in_data);
+  key.AddSign(in_grad);
+  key.AddSign(out_grad);
+
+  auto it = lrn_bwds.find(key);
+  if (it == lrn_bwds.end()) {
+    const dnnl::memory::desc in_data_md = in_data.GetDNNLData()->get_desc();
+    const dnnl::memory::desc diff_md    = out_grad.GetDNNLData()->get_desc();
+    DNNLLRNBwd bwd(param, in_data_md, diff_md);
+    it = AddToCache(&lrn_bwds, key, bwd);
+  }
+  return it->second;
+}
+
+void DNNLLRNBackward(const nnvm::NodeAttrs& attrs,
+                     const OpContext& ctx,
+                     const std::vector<NDArray>& inputs,
+                     const std::vector<OpReqType>& req,
+                     const std::vector<NDArray>& outputs) {
+  if (req[0] == kNullOp) {
+    return;
+  }
+  const LRNParam& param   = nnvm::get<LRNParam>(attrs.parsed);
+  const NDArray& out_grad = inputs[0];
+  const NDArray& in_data  = inputs[1];
+  const NDArray& in_grad  = outputs[0];
+  // TODO(alex): (MXNET-846) figure out why the in_grad output is incorrect when in_data is nchw8c
+  const auto in_buffer       = in_data.Reorder2Default();
+  DNNLLRNBwd& bwd            = GetLRNBwd(param, in_buffer, in_grad, out_grad);
+  dnnl_output_t diff_src_mem = CreateDNNLMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
+
+  bwd.Execute(out_grad, in_buffer, in_grad, diff_src_mem);
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_ONEDNN == 1
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_LRN_INL_H_
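
GetDNNLLRNAlgo() above always selects dnnl::algorithm::lrn_across_channels, which normalizes each channel by a sum of squares over a window of nsize neighbouring channels: y_c = x_c / (k + (alpha / nsize) * sum_i x_i^2)^beta. A self-contained reference for a single spatial position (an illustrative sketch, not oneDNN or MXNet code; boundary handling may differ slightly from the library):

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>

    std::vector<float> LRNAcrossChannels(const std::vector<float>& x,
                                         int nsize, float alpha, float beta, float k) {
      const int channels = static_cast<int>(x.size());
      const int half = nsize / 2;
      std::vector<float> y(channels);
      for (int c = 0; c < channels; ++c) {
        float sq_sum = 0.0f;
        for (int i = std::max(0, c - half); i <= std::min(channels - 1, c + half); ++i)
          sq_sum += x[i] * x[i];               // window of neighbouring channels
        y[c] = x[c] / std::pow(k + (alpha / nsize) * sq_sum, beta);
      }
      return y;
    }

    int main() {
      // Roughly the mxnet LRN defaults: nsize=5, alpha=1e-4, beta=0.75, knorm=2.
      for (float v : LRNAcrossChannels({1.0f, 2.0f, 3.0f, 4.0f}, 5, 1e-4f, 0.75f, 2.0f))
        std::cout << v << " ";
      std::cout << "\n";
    }
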
diff --git a/src/operator/nn/dnnl/dnnl_ops-inl.h b/src/operator/nn/dnnl/dnnl_ops-inl.h
new file mode 100644
index 0000000..8816c3c
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_ops-inl.h
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_ops-inl.h
+ * \brief
+ * \author Da Zheng
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include <mxnet/base.h>
+#include <mxnet/io.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+
+#include <vector>
+
+#if MXNET_USE_ONEDNN == 1
+#include <dnnl.hpp>
+
+namespace mxnet {
+namespace op {
+
+/* For fully connected. */
+void DNNLFCForward(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<NDArray>& in_data,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<NDArray>& out_data);
+void DNNLFCBackward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<NDArray>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<NDArray>& outputs);
+
+/* For convolution. */
+void DNNLConvolutionForward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& out_data);
+void DNNLConvolutionBackward(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<NDArray>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<NDArray>& outputs);
+
+/* For deconvolution */
+void DNNLDeconvolutionForward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& in_data,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& out_data);
+void DNNLDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs);
+
+/* For activation */
+void DNNLActivationForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const NDArray& in_data,
+                           const OpReqType& req,
+                           const NDArray& out_data);
+void DNNLActivationBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& outputs);
+
+void DNNLLeakyReluForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const NDArray& in_data,
+                          const OpReqType& req,
+                          const NDArray& out_data);
+void DNNLLeakyReluBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs);
+
+/* For softmax */
+void DNNLSoftmaxForward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const NDArray& in_data,
+                        const OpReqType& req,
+                        const NDArray& out_data);
+void DNNLSoftmaxBackward(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<NDArray>& in_data,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<NDArray>& out_data);
+
+/* For log_softmax */
+void DNNLLogSoftmaxForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const NDArray& in_data,
+                           const OpReqType& req,
+                           const NDArray& out_data);
+void DNNLLogSoftmaxBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& out_data);
+
+/* For softmax_output */
+void DNNLSoftmaxOutputForward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& in_data,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& out_data);
+
+/* For sum */
+void DNNLSumForward(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<NDArray>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<NDArray>& outputs);
+
+/* For copy */
+void DNNLCopy(const nnvm::NodeAttrs& attrs,
+              const OpContext& ctx,
+              const NDArray& in_data,
+              const OpReqType& req,
+              const NDArray& out_data);
+
+/* For concat */
+void DNNLConcatForward(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const std::vector<NDArray>& in_data,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<NDArray>& out_data);
+void DNNLConcatBackward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<NDArray>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<NDArray>& outputs);
+
+/* For batch dot */
+void DNNLBatchDotForward(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<NDArray>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<NDArray>& outputs);
+
+/* For layer normalization */
+void DNNLLayerNormForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs);
+void DNNLLayerNormBackward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs);
+
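+/* For summing two dnnl memories into a destination memory */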
+void DNNLSum(const dnnl::memory& arr1, const dnnl::memory& arr2, const dnnl::memory& out);
+
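+/* For transpose */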
+void DNNLTransposeForward(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const NDArray& data,
+                          const OpReqType& req,
+                          const NDArray& output);
+
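+/* For reshape */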
+void DNNLReshapeForward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const NDArray& input,
+                        const OpReqType& req,
+                        const NDArray& output);
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_ONEDNN == 1
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_OPS_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/dnnl/dnnl_pooling-inl.h
similarity index 54%
rename from src/operator/nn/mkldnn/mkldnn_pooling-inl.h
rename to src/operator/nn/dnnl/dnnl_pooling-inl.h
index be2c9f2..83d27e5 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
+++ b/src/operator/nn/dnnl/dnnl_pooling-inl.h
@@ -18,41 +18,39 @@
  */
 
 /*!
- * \file mkldnn_pooling-inl.h
+ * \file dnnl_pooling-inl.h
  * \brief
  */
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 
-#include <mkldnn.hpp>
-
+#include <dnnl.hpp>
 #include <utility>
 
-#include "./mkldnn_base-inl.h"
-
 #include "../pooling-inl.h"
+#include "./dnnl_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-class MKLDNNPoolingFwd {
+class DNNLPoolingFwd {
  public:
-  MKLDNNPoolingFwd(const mxnet::NDArray& input,
-                   const mxnet::NDArray& output,
-                   const mkldnn::memory::dims& kernel,
-                   const mkldnn::memory::dims& strides,
-                   const mkldnn::memory::dims& pad_l,
-                   const mkldnn::memory::dims& pad_r,
-                   const mkldnn::algorithm alg_kind,
-                   const bool with_workspace,
-                   const bool is_train)
+  DNNLPoolingFwd(const mxnet::NDArray& input,
+                 const mxnet::NDArray& output,
+                 const dnnl::memory::dims& kernel,
+                 const dnnl::memory::dims& strides,
+                 const dnnl::memory::dims& pad_l,
+                 const dnnl::memory::dims& pad_r,
+                 const dnnl::algorithm alg_kind,
+                 const bool with_workspace,
+                 const bool is_train)
       : with_workspace_(with_workspace), fwd_(nullptr) {
     Init(input, output, kernel, strides, pad_l, pad_r, is_train, alg_kind);
   }
 
-  ~MKLDNNPoolingFwd() {}
+  ~DNNLPoolingFwd() {}
   void Execute(const NDArray& in_data,
                const OpReqType req,
                const NDArray& out_data,
@@ -61,32 +59,32 @@ class MKLDNNPoolingFwd {
  private:
   bool with_workspace_;
 
-  std::shared_ptr<mkldnn::pooling_forward::primitive_desc> fwd_pd_;
-  std::shared_ptr<mkldnn::pooling_forward> fwd_;
+  std::shared_ptr<dnnl::pooling_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<dnnl::pooling_forward> fwd_;
 
  private:
   void Init(const mxnet::NDArray& input,
             const mxnet::NDArray& output,
-            const mkldnn::memory::dims& kernel,
-            const mkldnn::memory::dims& strides,
-            const mkldnn::memory::dims& pad_l,
-            const mkldnn::memory::dims& pad_r,
+            const dnnl::memory::dims& kernel,
+            const dnnl::memory::dims& strides,
+            const dnnl::memory::dims& pad_l,
+            const dnnl::memory::dims& pad_r,
             const bool is_train,
-            const mkldnn::algorithm alg_kind);
+            const dnnl::algorithm alg_kind);
 };
 
-class MKLDNNPoolingBwd {
-  std::shared_ptr<const mkldnn::pooling_backward> bwd;
+class DNNLPoolingBwd {
+  std::shared_ptr<const dnnl::pooling_backward> bwd;
   bool with_workspace;
 
  public:
-  const mkldnn::pooling_backward::primitive_desc pd;
+  const dnnl::pooling_backward::primitive_desc pd;
 
-  MKLDNNPoolingBwd(const mkldnn::pooling_backward::primitive_desc& pdesc, bool with_ws);
+  DNNLPoolingBwd(const dnnl::pooling_backward::primitive_desc& pdesc, bool with_ws);
 
-  ~MKLDNNPoolingBwd() {}
-  const mkldnn::pooling_backward& GetBwd();
-  const mkldnn::pooling_backward::primitive_desc& GetPd();
+  ~DNNLPoolingBwd() {}
+  const dnnl::pooling_backward& GetBwd();
+  const dnnl::pooling_backward::primitive_desc& GetPd();
 };
 
 inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) {
@@ -97,7 +95,7 @@ inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) {
   }
 }
 
-inline bool SupportMKLDNNPooling(const PoolingParam& param) {
+inline bool SupportDNNLPooling(const PoolingParam& param) {
   return (param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3) &&
          (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) &&
          (!param.layout.has_value() ||
@@ -105,23 +103,23 @@ inline bool SupportMKLDNNPooling(const PoolingParam& param) {
            param.layout.value() == mshadow::kNCDHW));
 }
 
-inline bool SupportMKLDNNPooling(const PoolingParam& param, const NDArray& input) {
+inline bool SupportDNNLPooling(const PoolingParam& param, const NDArray& input) {
   const auto dshape = input.shape();
   const auto ndim   = dshape.ndim();
   const auto dtype  = input.dtype();
 
-  if (!(SupportStorageMKLDNN(input.storage_type()) && (ndim == 3 || ndim == 4 || ndim == 5) &&
+  if (!(SupportStorageDNNL(input.storage_type()) && (ndim == 3 || ndim == 4 || ndim == 5) &&
         (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16)))
     return false;
 
-  if (!SupportMKLDNNPooling(param))
+  if (!SupportDNNLPooling(param))
     return false;
 
   if (param.pooling_convention == pool_enum::kValid) {
     return true;
   } else {
     if (param.pool_type == pool_enum::kAvgPooling) {
-      // mkldnn works differently when padding is asymmetric, so let's skip this case.
+      // dnnl works differently when padding is asymmetric, so let's skip this case.
       bool is_symmetric = true;
       switch (ndim) {
         case 5:
@@ -149,30 +147,30 @@ inline bool SupportMKLDNNPooling(const PoolingParam& param, const NDArray& input
   }
 }
 
-inline bool MKLDNNRequireWorkspace(const PoolingParam& param) {
+inline bool DNNLRequireWorkspace(const PoolingParam& param) {
   return param.pool_type != pool_enum::kAvgPooling;
 }
 
-typedef ParamOpSign<PoolingParam> MKLDNNPoolingSignature;
-void MKLDNNPoolingCompute(const OpContext& ctx,
-                          const PoolingParam& param,
-                          const NDArray& in_data,
-                          const OpReqType req,
-                          const NDArray& out_data,
-                          const NDArray* workspace);
-
-void MKLDNNPoolingGradCompute(const OpContext& ctx,
-                              const PoolingParam& param,
-                              const NDArray& out_grad,
-                              const NDArray& in_data,
-                              const NDArray* workspace,
-                              const OpReqType req,
-                              const NDArray& in_grad);
-MKLDNNPoolingFwd& GetPoolingFwd(const PoolingParam& param,
-                                const bool is_train,
-                                const NDArray& data,
-                                const NDArray& output);
+typedef ParamOpSign<PoolingParam> DNNLPoolingSignature;
+void DNNLPoolingCompute(const OpContext& ctx,
+                        const PoolingParam& param,
+                        const NDArray& in_data,
+                        const OpReqType req,
+                        const NDArray& out_data,
+                        const NDArray* workspace);
+
+void DNNLPoolingGradCompute(const OpContext& ctx,
+                            const PoolingParam& param,
+                            const NDArray& out_grad,
+                            const NDArray& in_data,
+                            const NDArray* workspace,
+                            const OpReqType req,
+                            const NDArray& in_grad);
+DNNLPoolingFwd& GetPoolingFwd(const PoolingParam& param,
+                              const bool is_train,
+                              const NDArray& data,
+                              const NDArray& output);
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_POOLING_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_pooling.cc b/src/operator/nn/dnnl/dnnl_pooling.cc
new file mode 100644
index 0000000..252bf05
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_pooling.cc
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_pooling.cc
+ * \brief
+ * \author Tao Lv
+ */
+
+#if MXNET_USE_ONEDNN == 1
+
+#include "./dnnl_pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline dnnl::memory::data_type get_data_type(const dnnl::memory::desc& md) {
+  return static_cast<dnnl::memory::data_type>(md.data_type());
+}
+
+void DNNLPoolingFwd::Init(const mxnet::NDArray& input,
+                          const mxnet::NDArray& output,
+                          const dnnl::memory::dims& kernel,
+                          const dnnl::memory::dims& strides,
+                          const dnnl::memory::dims& pad_l,
+                          const dnnl::memory::dims& pad_r,
+                          const bool is_train,
+                          const dnnl::algorithm alg_kind) {
+  const auto src_md         = input.GetDNNLData()->get_desc();
+  const auto dst_md         = GetMemDesc(output);
+  const dnnl::engine engine = CpuEngine::Get()->get_engine();
+  if (alg_kind != dnnl::algorithm::pooling_max && alg_kind != dnnl::algorithm::pooling_avg &&
+      alg_kind != dnnl::algorithm::pooling_avg_include_padding &&
+      alg_kind != dnnl::algorithm::pooling_avg_exclude_padding) {
+    LOG(FATAL) << "DNNL Pooling: algorithm is not supported";
+  }
+
+  dnnl::prop_kind prop = dnnl::prop_kind::forward_scoring;
+  if (is_train && alg_kind != dnnl::algorithm::pooling_avg) {
+    prop = dnnl::prop_kind::forward_training;
+  }
+  if (is_train && prop == dnnl::prop_kind::forward_scoring) {
+    LOG(INFO) << "DNNL Pooling: training with prop_kind is forward_scoring";
+  }
+
+  const auto fwd_desc =
+      dnnl::pooling_forward::desc(prop, alg_kind, src_md, dst_md, strides, kernel, pad_l, pad_r);
+  this->fwd_pd_.reset(new dnnl::pooling_forward::primitive_desc(fwd_desc, engine));
+  this->fwd_.reset(new dnnl::pooling_forward(*(this->fwd_pd_)));
+
+  return;
+}
+
+void DNNLPoolingFwd::Execute(const NDArray& in_data,
+                             const OpReqType req,
+                             const NDArray& out_data,
+                             const NDArray* workspace) {
+  NDArray in_buffer = in_data;
+  if (in_data.IsView() && in_data.IsDNNLData())
+    in_buffer = in_data.Reorder2Default();
+
+  auto input_mem     = in_buffer.GetDNNLData();
+  auto output_mem_t_ = CreateDNNLMem(out_data, this->fwd_pd_->dst_desc(), req);
+
+  dnnl_args_map_t args = {
+      {DNNL_ARG_SRC, *input_mem},
+      {DNNL_ARG_DST, *(output_mem_t_.second)},
+  };
+
+  if (this->with_workspace_) {
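+    // The workspace carries data from the forward pass to the backward pass, e.g. the indices
+    // selected by max pooling in training mode.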
+    auto engine = CpuEngine::Get()->get_engine();
+
+    if (workspace == nullptr) {
+      LOG(FATAL) << "DNNL Pooling: incorrect workspace input";
+    }
+
+    auto ws = std::make_shared<dnnl::memory>(
+        (*(this->fwd_pd_)).workspace_desc(), engine, workspace->GetDNNLData()->get_data_handle());
+    args[DNNL_ARG_WORKSPACE] = *ws;
+  }
+  if (this->fwd_) {
+    DNNLStream::Get()->RegisterPrimArgs(*(this->fwd_), args);
+    CommitOutput(out_data, output_mem_t_);
+    DNNLStream::Get()->Submit();
+  } else {
+    LOG(FATAL) << "DNNL Pooling: forward primitive is nullptr";
+  }
+}
+
+dnnl::algorithm GetDNNLPoolAlgo(const PoolingParam& param) {
+  switch (param.pool_type) {
+    case pool_enum::kMaxPooling:
+      return dnnl::algorithm::pooling_max;
+      break;
+    case pool_enum::kAvgPooling:
+      if (param.count_include_pad.has_value() && !param.count_include_pad.value()) {
+        return dnnl::algorithm::pooling_avg_exclude_padding;
+      } else {
+        return dnnl::algorithm::pooling_avg_include_padding;
+      }
+      break;
+    default:
+      LOG(FATAL) << "DNNL Pooling: Unknown pooling method.";
+      return dnnl::algorithm::pooling_max;
+  }
+}
+
+void InitPoolingPrimitiveParams(const PoolingParam& param,
+                                const dnnl::memory::desc& data_md,
+                                const dnnl::memory::dims& new_kernel,
+                                const dnnl::memory::dims& new_strides,
+                                const dnnl::memory::dims& new_pad_l,
+                                const dnnl::memory::dims& new_pad_r) {
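+  // The new_* dims act as output parameters: they are passed by const reference and are
+  // filled in below via const_cast.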
+  const int kernel_ndims      = param.kernel.ndim();
+  dnnl::memory::dims& kernel  = const_cast<dnnl::memory::dims&>(new_kernel);
+  dnnl::memory::dims& strides = const_cast<dnnl::memory::dims&>(new_strides);
+  dnnl::memory::dims& pad_l   = const_cast<dnnl::memory::dims&>(new_pad_l);
+  dnnl::memory::dims& pad_r   = const_cast<dnnl::memory::dims&>(new_pad_r);
+  if (kernel_ndims == 1) {
+    CHECK_GE(param.pad.ndim(), 1);
+    CHECK_GE(param.stride.ndim(), 1);
+    kernel[0]  = param.kernel[0];
+    pad_l[0]   = param.pad[0];
+    pad_r[0]   = param.pad[0];
+    strides[0] = param.stride[0];
+
+    if (param.pooling_convention == pool_enum::kFull) {
+      pad_r[0] =
+          GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]);
+    }
+
+    if (param.global_pool) {
+      kernel[0]  = data_md.data.dims[2];
+      strides[0] = 1;
+      pad_l[0] = pad_r[0] = 0;
+    }
+
+    CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero.";
+  } else if (kernel_ndims == 2) {
+    CHECK_GE(param.pad.ndim(), 2);
+    CHECK_GE(param.stride.ndim(), 2);
+    kernel[0]  = param.kernel[0];
+    kernel[1]  = param.kernel[1];
+    pad_l[0]   = param.pad[0];
+    pad_l[1]   = param.pad[1];
+    pad_r[0]   = param.pad[0];
+    pad_r[1]   = param.pad[1];
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+
+    if (param.pooling_convention == pool_enum::kFull) {
+      pad_r[0] =
+          GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]);
+      pad_r[1] =
+          GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]);
+    }
+
+    if (param.global_pool) {
+      kernel[0]  = data_md.data.dims[2];
+      kernel[1]  = data_md.data.dims[3];
+      strides[0] = strides[1] = 1;
+      pad_l[0] = pad_l[1] = pad_r[0] = pad_r[1] = 0;
+    }
+
+    CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero.";
+  } else {
+    CHECK_GE(param.pad.ndim(), 3);
+    CHECK_GE(param.stride.ndim(), 3);
+    kernel[0]  = param.kernel[0];
+    kernel[1]  = param.kernel[1];
+    kernel[2]  = param.kernel[2];
+    pad_l[0]   = param.pad[0];
+    pad_l[1]   = param.pad[1];
+    pad_l[2]   = param.pad[2];
+    pad_r[0]   = param.pad[0];
+    pad_r[1]   = param.pad[1];
+    pad_r[2]   = param.pad[2];
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+    strides[2] = param.stride[2];
+
+    if (param.pooling_convention == pool_enum::kFull) {
+      pad_r[0] =
+          GetPaddingSizeFull(data_md.data.dims[2], pad_l[0], pad_r[0], kernel[0], strides[0]);
+      pad_r[1] =
+          GetPaddingSizeFull(data_md.data.dims[3], pad_l[1], pad_r[1], kernel[1], strides[1]);
+      pad_r[2] =
+          GetPaddingSizeFull(data_md.data.dims[4], pad_l[2], pad_r[2], kernel[2], strides[2]);
+    }
+
+    if (param.global_pool) {
+      kernel[0]  = data_md.data.dims[2];
+      kernel[1]  = data_md.data.dims[3];
+      kernel[2]  = data_md.data.dims[4];
+      strides[0] = strides[1] = strides[2] = 1;
+      pad_l[0] = pad_l[1] = pad_l[2] = pad_r[0] = pad_r[1] = pad_r[2] = 0;
+    }
+
+    CHECK_GT(kernel[0], 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel[1], 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel[2], 0) << "Filter dimensions cannot be zero.";
+  }
+
+  if (pad_l[0] != 0 || (kernel_ndims == 2 && pad_l[1] != 0) ||
+      (kernel_ndims == 3 && pad_l[2] != 0)) {
+    CHECK(param.pool_type == pool_enum::kAvgPooling || param.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_l[0], kernel[0]);
+    if (kernel_ndims > 1)
+      CHECK_LT(pad_l[1], kernel[1]);
+    if (kernel_ndims > 2)
+      CHECK_LT(pad_l[2], kernel[2]);
+  }
+}
+
+dnnl::pooling_forward::primitive_desc GetPoolingFwdPdesc(const PoolingParam& param,
+                                                         const bool is_train,
+                                                         const dnnl::memory::desc& data_md,
+                                                         const dnnl::memory::desc& out_md) {
+  CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3)
+      << "Not Implemented";
+
+  const int kernel_ndims = param.kernel.ndim();
+  dnnl::memory::dims kernel(kernel_ndims);
+  dnnl::memory::dims strides(kernel_ndims);
+  dnnl::memory::dims pad_l(kernel_ndims);
+  dnnl::memory::dims pad_r(kernel_ndims);
+
+  InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r);
+
+  const dnnl::algorithm alg = GetDNNLPoolAlgo(param);
+  dnnl::prop_kind kind      = dnnl::prop_kind::forward_scoring;
+  if (is_train && alg != dnnl::algorithm::pooling_avg) {
+    kind = dnnl::prop_kind::forward_training;
+  }
+
+  const dnnl::pooling_forward::desc poolingFwd_desc(
+      kind, alg, data_md, out_md, strides, kernel, pad_l, pad_r);
+  return dnnl::pooling_forward::primitive_desc(poolingFwd_desc, CpuEngine::Get()->get_engine());
+}
+
+DNNLPoolingFwd& GetPoolingFwd(const PoolingParam& param,
+                              const bool is_train,
+                              const NDArray& data,
+                              const NDArray& output) {
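+  // Forward primitives are cached in a thread-local map keyed by the pooling parameters,
+  // the training and workspace flags, and the input/output arrays.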
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLPoolingSignature, DNNLPoolingFwd, OpHash> pooling_fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLPoolingSignature, DNNLPoolingFwd, OpHash>
+      pooling_fwds;
+#endif
+
+  bool with_workspace = is_train && DNNLRequireWorkspace(param);
+  DNNLPoolingSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(with_workspace);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = pooling_fwds.find(key);
+  if (it == pooling_fwds.end()) {
+    CHECK(param.kernel.ndim() == 1 || param.kernel.ndim() == 2 || param.kernel.ndim() == 3)
+        << "Not Implemented";
+    auto data_md = data.GetDNNLData()->get_desc();
+
+    const auto kernel_ndims = param.kernel.ndim();
+    dnnl::memory::dims kernel(kernel_ndims);
+    dnnl::memory::dims strides(kernel_ndims);
+    dnnl::memory::dims pad_l(kernel_ndims);
+    dnnl::memory::dims pad_r(kernel_ndims);
+    InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r);
+
+    const dnnl::algorithm alg = GetDNNLPoolAlgo(param);
+    DNNLPoolingFwd fwd(data, output, kernel, strides, pad_l, pad_r, alg, with_workspace, is_train);
+    it = AddToCache(&pooling_fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLPoolingCompute(const OpContext& ctx,
+                        const PoolingParam& param,
+                        const NDArray& in_data,
+                        const OpReqType req,
+                        const NDArray& out_data,
+                        const NDArray* workspace) {
+  auto& fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data);
+  fwd.Execute(in_data, req, out_data, workspace);
+}
+
+DNNLPoolingBwd::DNNLPoolingBwd(const dnnl::pooling_backward::primitive_desc& pdesc, bool with_ws)
+    : with_workspace(with_ws), pd(pdesc) {
+  bwd = std::make_shared<dnnl::pooling_backward>(pd);
+}
+
+const dnnl::pooling_backward& DNNLPoolingBwd::GetBwd() {
+  return *this->bwd;
+}
+
+DNNLPoolingBwd& GetPoolingBwd(const PoolingParam& param,
+                              const NDArray& in_data,
+                              const NDArray& in_grad,
+                              const NDArray& out_grad) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLPoolingSignature, DNNLPoolingBwd, OpHash> pooling_bwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLPoolingSignature, DNNLPoolingBwd, OpHash>
+      pooling_bwds;
+#endif
+
+  bool with_workspace = DNNLRequireWorkspace(param);
+  DNNLPoolingSignature key(param);
+  key.AddSign(in_data);
+  key.AddSign(in_grad);
+  key.AddSign(out_grad);
+
+  auto it = pooling_bwds.find(key);
+  if (it == pooling_bwds.end()) {
+    auto input_mem = in_data.GetDNNLData();
+    auto data_md   = input_mem->get_desc();
+
+    auto dst_dims = dnnl::memory::dims(out_grad.shape().begin(), out_grad.shape().end());
+    auto any      = dnnl::memory::format_tag::any;
+    auto dst_md   = dnnl::memory::desc(dst_dims, get_data_type(data_md), any);
+
+    // forward primitive descriptor, used as a hint for creating the backward pd
+    auto fwd_pd = GetPoolingFwdPdesc(param, true, data_md, dst_md);
+
+    // create the backward descriptor
+    auto diff_src_dims = dnnl::memory::dims(in_grad.shape().begin(), in_grad.shape().end());
+    auto diff_src_md   = dnnl::memory::desc(diff_src_dims, get_data_type(data_md), any);
+    auto cpu_engine    = CpuEngine::Get()->get_engine();
+    auto alg           = GetDNNLPoolAlgo(param);
+
+    const int kernel_ndims = param.kernel.ndim();
+    dnnl::memory::dims kernel(kernel_ndims);
+    dnnl::memory::dims strides(kernel_ndims);
+    dnnl::memory::dims pad_l(kernel_ndims);
+    dnnl::memory::dims pad_r(kernel_ndims);
+
+    InitPoolingPrimitiveParams(param, data_md, kernel, strides, pad_l, pad_r);
+
+    // use dst_md as diff_dst_md with any format
+    auto bwd_desc =
+        dnnl::pooling_backward::desc(alg, diff_src_md, dst_md, strides, kernel, pad_l, pad_r);
+    auto pdesc = dnnl::pooling_backward::primitive_desc(bwd_desc, cpu_engine, fwd_pd);
+
+    DNNLPoolingBwd bwd(pdesc, with_workspace);
+    it = AddToCache(&pooling_bwds, key, bwd);
+  }
+  return it->second;
+}
+
+void DNNLPoolingGradCompute(const OpContext& ctx,
+                            const PoolingParam& param,
+                            const NDArray& out_grad,
+                            const NDArray& in_data,
+                            const NDArray* workspace,
+                            const OpReqType req,
+                            const NDArray& in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+
+  auto& bwd            = GetPoolingBwd(param, in_data, in_grad, out_grad);
+  auto diff_dst_mem    = out_grad.GetDNNLDataReorder(bwd.pd.diff_dst_desc());
+  auto diff_src_mem    = CreateDNNLMem(in_grad, bwd.pd.diff_src_desc(), req);
+  dnnl_args_map_t args = {
+      {DNNL_ARG_DIFF_DST, *diff_dst_mem},
+      {DNNL_ARG_DIFF_SRC, *diff_src_mem.second},
+  };
+  if (DNNLRequireWorkspace(param) && workspace != nullptr) {
+    args[DNNL_ARG_WORKSPACE] = *(workspace->GetDNNLData());
+  }
+
+  DNNLStream::Get()->RegisterPrimArgs(bwd.GetBwd(), args);
+  CommitOutput(in_grad, diff_src_mem);
+  DNNLStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_ONEDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_reshape-inl.h b/src/operator/nn/dnnl/dnnl_reshape-inl.h
similarity index 62%
rename from src/operator/nn/mkldnn/mkldnn_reshape-inl.h
rename to src/operator/nn/dnnl/dnnl_reshape-inl.h
index cab6ec1..a814c1d 100644
--- a/src/operator/nn/mkldnn/mkldnn_reshape-inl.h
+++ b/src/operator/nn/dnnl/dnnl_reshape-inl.h
@@ -18,31 +18,30 @@
  */
 
 /*!
- * \file mkldnn_reshape-inl.h
- * \brief Function definition of mkldnn reshape operator
+ * \file dnnl_reshape-inl.h
+ * \brief Function definition of dnnl reshape operator
  */
 
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 #include <vector>
 
 #include "../../tensor/matrix_op-inl.h"
-
-#include "mkldnn_base-inl.h"
+#include "dnnl_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-class MKLDNNReshapeFwd {
+class DNNLReshapeFwd {
  protected:
-  std::shared_ptr<mkldnn::memory> out_;
-  std::shared_ptr<mkldnn::memory> temp_;
-  std::vector<mkldnn::primitive> prims_;
+  std::shared_ptr<dnnl::memory> out_;
+  std::shared_ptr<dnnl::memory> temp_;
+  std::vector<dnnl::primitive> prims_;
 
  public:
-  MKLDNNReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output);
+  DNNLReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output);
   int GetWorkspaceSize();
   void Execute(const NDArray& input,
                const NDArray& output,
@@ -50,12 +49,12 @@ class MKLDNNReshapeFwd {
                void* workspace = nullptr);
 };
 
-typedef OpSignature MKLDNNReshapeSignature;
-MKLDNNReshapeFwd& GetReshapeForward(const OpReqType& req,
-                                    const NDArray& input,
-                                    const NDArray& output);
+typedef OpSignature DNNLReshapeSignature;
+DNNLReshapeFwd& GetReshapeForward(const OpReqType& req,
+                                  const NDArray& input,
+                                  const NDArray& output);
 }  // namespace op
 }  // namespace mxnet
 
 #endif  // MXNET_USE_ONEDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_RESHAPE_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_reshape.cc b/src/operator/nn/dnnl/dnnl_reshape.cc
new file mode 100644
index 0000000..5d25919
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_reshape.cc
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_reshape.cc
+ * \brief Implement reshape operator via DNNL reorder primitive
+ * \author Tao Lv
+ */
+
+#if MXNET_USE_ONEDNN == 1
+#include "../../tensor/elemwise_unary_op.h"
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+#include "./dnnl_reshape-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool SupportDNNLReshape(const NDArray& input, const NDArray& output) {
+  const int input_ndims  = input.shape().ndim();
+  const int output_ndims = output.shape().ndim();
+  return input.shape().Size() > 0 && input_ndims >= 1 && input_ndims <= 6 && output_ndims >= 1 &&
+         output_ndims <= 6 && IsDNNLType(input.dtype());
+}
+
+DNNLReshapeFwd::DNNLReshapeFwd(const OpReqType& req, const NDArray& input, const NDArray& output) {
+  const auto engine = CpuEngine::Get()->get_engine();
+  auto in_mem       = input.GetDNNLData();
+
+  // Create temp memory
+  auto temp_dims = dnnl::memory::dims(input.shape().begin(), input.shape().end());
+  auto temp_type = static_cast<dnnl::memory::data_type>(get_dnnl_type(input.dtype()));
+  auto temp_fmt  = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(input.shape().ndim()));
+  auto temp_desc = dnnl::memory::desc(temp_dims, temp_type, temp_fmt);
+
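+  // out_ only carries the default-layout descriptor (its data handle is nullptr); the actual
+  // destination buffer is bound through the argument map at execution time.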
+  out_ = std::make_shared<dnnl::memory>(temp_desc, engine, nullptr);
+  if (req == kWriteInplace) {
+    // If the input uses a DNNL internal layout, reorder it to a temporary buffer with the
+    // default layout, then copy from that buffer back to the output buffer, which shares its
+    // address with the input buffer.
+    // If the input already has the default layout, nothing needs to be done.
+    if (input.IsDNNLData()) {
+      temp_ = std::make_shared<dnnl::memory>(temp_desc, engine, nullptr);
+      prims_.push_back(dnnl::reorder(*in_mem, *temp_));  // reorder to default
+      prims_.push_back(dnnl::reorder(*temp_, *out_));    // copy back
+    }
+  } else if (req == kWriteTo) {
+    prims_.push_back(dnnl::reorder(*in_mem, *out_));
+  } else {
+    LOG(FATAL) << "not supported req type: " << req;
+  }
+}
+
+int DNNLReshapeFwd::GetWorkspaceSize() {
+  return temp_ ? temp_->get_desc().get_size() : 0;
+}
+
+void DNNLReshapeFwd::Execute(const NDArray& input,
+                             const NDArray& output,
+                             const OpReqType& req,
+                             void* workspace) {
+  auto stream = DNNLStream::Get();
+  auto in_mem = input.GetDNNLData();
+  // register primitives and arguments
+  std::vector<dnnl_args_map_t> args_map;
+  size_t prims_size = prims_.size();
+  if (prims_size == 1) {
+    args_map.push_back({{DNNL_ARG_FROM, *in_mem}, {DNNL_ARG_TO, *output.GetDNNLData()}});
+  } else if (prims_size == 2) {
+    if (workspace) {
+      temp_->set_data_handle(workspace);
+    }
+    args_map.push_back({{DNNL_ARG_FROM, *in_mem}, {DNNL_ARG_TO, *temp_}});
+    args_map.push_back({{DNNL_ARG_FROM, *temp_}, {DNNL_ARG_TO, *output.GetDNNLData()}});
+  } else {
+    CHECK(prims_size == 0 && req != kWriteTo) << "kWriteTo should never reach here.";
+  }
+
+  for (size_t i = 0; i < prims_size; i++) {
+    stream->RegisterPrimArgs(prims_[i], args_map[i]);
+  }
+  stream->Submit();
+  // invalidate dnnl memory in output
+  const_cast<NDArray&>(output).InvalidateDNNLData();
+}
+
+DNNLReshapeFwd& GetReshapeForward(const OpReqType& req,
+                                  const NDArray& input,
+                                  const NDArray& output) {
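+  // Forward objects are cached in a thread-local map keyed by the request type and the input.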
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLReshapeSignature, DNNLReshapeFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLReshapeSignature, DNNLReshapeFwd, OpHash> fwds;
+#endif
+  DNNLReshapeSignature key;
+  key.AddSign(req);
+  key.AddSign(input);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    DNNLReshapeFwd fwd(req, input, output);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void DNNLReshapeForward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const NDArray& input,
+                        const OpReqType& req,
+                        const NDArray& output) {
+  if (req == kNullOp)
+    return;
+  CHECK_NE(req, kAddTo) << "kAddTo is not supported yet";
+  auto fwd     = GetReshapeForward(req, input, output);
+  auto ws_size = fwd.GetWorkspaceSize();
+  void* ws_ptr = nullptr;
+  if (ws_size) {
+    mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+    mshadow::Tensor<cpu, 1, char> ws =
+        ctx.requested[0].get_space_typed<cpu, 1, char>(mshadow::Shape1(ws_size), s);
+    ws_ptr = static_cast<void*>(ws.dptr_);
+  }
+  fwd.Execute(input, output, req, ws_ptr);
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h b/src/operator/nn/dnnl/dnnl_rnn-inl.h
similarity index 61%
rename from src/operator/nn/mkldnn/mkldnn_rnn-inl.h
rename to src/operator/nn/dnnl/dnnl_rnn-inl.h
index dee8213..bd2a63f 100644
--- a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h
+++ b/src/operator/nn/dnnl/dnnl_rnn-inl.h
@@ -18,28 +18,27 @@
  */
 
 /*!
- * \file mkldnn_rnn-inl.h
- * \brief Common functions used by MKLDNN RNN operator
+ * \file dnnl_rnn-inl.h
+ * \brief Common functions used by DNNL RNN operator
  * \author Zixuan Wei
  */
 
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RNN_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RNN_INL_H_
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_RNN_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_RNN_INL_H_
 
 #if MXNET_USE_ONEDNN == 1
 
 #include <vector>
 
-#include "./mkldnn_base-inl.h"
-
 #include "../../rnn-inl.h"
+#include "./dnnl_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-struct MKLDNNRnnLayerParam {
-  using memory = mkldnn::memory;
-  using dims   = mkldnn::memory::dims;
+struct DNNLRnnLayerParam {
+  using memory = dnnl::memory;
+  using dims   = dnnl::memory::dims;
 
   int mode;
   bool bidirectional;
@@ -60,21 +59,21 @@ struct MKLDNNRnnLayerParam {
   dims state_dims;         // Dimensions of the state cell in format_tag::ldnc
   dims cell_dims;          // Dimensions of LSTM cell state in format_tag::ldnc
 
-  size_t workspace_size;        // used for the cached mkl-dnn memory in Forward inference
+  size_t workspace_size;        // used for the cached DNNL memory in Forward inference
   size_t reserve_size;          // used for the reserved cached memory in Backward
   size_t single_w_size;         // weights size of a single cell
-  size_t single_b_size;         // bias size of a single cell from mkl-dnn
+  size_t single_b_size;         // bias size of a single cell from DNNL
   size_t native_single_b_size;  // bias size of a single cell from framework
   size_t single_state_size;     // state size of a single cell, hy, cy
 
-  MKLDNNRnnLayerParam(int num_layer,
-                      index_t batch_size,
-                      index_t seq_len,
-                      index_t input_size,
-                      int state_size,
-                      int proj_size,
-                      int mode,
-                      bool bidirectional = true)
+  DNNLRnnLayerParam(int num_layer,
+                    index_t batch_size,
+                    index_t seq_len,
+                    index_t input_size,
+                    int state_size,
+                    int proj_size,
+                    int mode,
+                    bool bidirectional = true)
       : mode(mode),
         bidirectional(bidirectional),
         state_outputs(true),
@@ -88,33 +87,33 @@ struct MKLDNNRnnLayerParam {
   void SetDims();
 };
 
-typedef std::vector<MKLDNNRnnLayerParam> LayerParamVector;
-struct MKLDNNRnnFullParam {
+typedef std::vector<DNNLRnnLayerParam> LayerParamVector;
+struct DNNLRnnFullParam {
   RNNParam default_param;
   LayerParamVector layer_params;
 };
 
-MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& rnn_param,
-                                            const index_t seq_len,
-                                            const index_t batch_size,
-                                            const index_t input_size);
+DNNLRnnFullParam DNNLRnnFullParamParser(const RNNParam& rnn_param,
+                                        const index_t seq_len,
+                                        const index_t batch_size,
+                                        const index_t input_size);
 
 /*
- * Use this to allocate memory from MKLDNNRnnOp temporary space.
+ * Use this to allocate memory from DNNLRnnOp temporary space.
  */
-class MKLDNNRnnMemMgr {
+class DNNLRnnMemMgr {
   // The memory buffer in NDArray life-cycle
   NDArray workspace_;
   // This points to the memory buffer from a NDArray
   char* curr_mem;
-  // The total bytes of the workspace of a MKLDNNRnnOp
+  // The total bytes of the workspace of a DNNLRnnOp
   size_t mem_size = 0;
   // The current available memory bytes
-  size_t curr_size                 = 0;
-  const size_t alignment           = kMKLDNNAlign;
-  const mkldnn::engine& cpu_engine = CpuEngine::Get()->get_engine();
+  size_t curr_size               = 0;
+  const size_t alignment         = kDNNLAlign;
+  const dnnl::engine& cpu_engine = CpuEngine::Get()->get_engine();
   // Here we hold all memory related to the stateful RNN operators
-  std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
+  std::vector<std::shared_ptr<const dnnl::memory> > mem_holder;
 
  public:
   /*!
@@ -129,11 +128,11 @@ class MKLDNNRnnMemMgr {
     return mem_size;
   }
 
-  void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
+  void RegisterMem(std::shared_ptr<const dnnl::memory> mem) {
     mem_holder.push_back(mem);
   }
 
-  mkldnn::memory* Alloc(const mkldnn::memory::desc& md);
+  dnnl::memory* Alloc(const dnnl::memory::desc& md);
 };
 
 /*
@@ -159,7 +158,7 @@ class RnnPrimitive {
     rnn_fwd_prim.weights_proj_desc_  = fwd_pd->weights_projection_desc();
     rnn_fwd_prim.workspace_desc_     = fwd_pd->workspace_desc();
 
-    rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new rnn_fwd(*fwd_pd));
+    rnn_fwd_prim.primitive_ = std::shared_ptr<dnnl::primitive>(new rnn_fwd(*fwd_pd));
 
     return rnn_fwd_prim;
   }
@@ -167,10 +166,10 @@ class RnnPrimitive {
   RnnPrimitive() {
     this->fwd_pd_             = nullptr;
     this->primitive_          = nullptr;
-    this->weights_layer_desc_ = mkldnn::memory::desc();
-    this->weights_iter_desc_  = mkldnn::memory::desc();
-    this->weights_proj_desc_  = mkldnn::memory::desc();
-    this->workspace_desc_     = mkldnn::memory::desc();
+    this->weights_layer_desc_ = dnnl::memory::desc();
+    this->weights_iter_desc_  = dnnl::memory::desc();
+    this->weights_proj_desc_  = dnnl::memory::desc();
+    this->workspace_desc_     = dnnl::memory::desc();
   }
 
   RnnPrimitive(const RnnPrimitive& rnn_fwd_prim) {
@@ -198,50 +197,50 @@ class RnnPrimitive {
   const void* GetPrimDesc() const {
     return fwd_pd_.get();
   }
-  const mkldnn::primitive& GetPrim() const {
+  const dnnl::primitive& GetPrim() const {
     return *primitive_;
   }
 
-  const mkldnn::memory::desc& GetLayerDesc() const {
+  const dnnl::memory::desc& GetLayerDesc() const {
     return weights_layer_desc_;
   }
 
-  const mkldnn::memory::desc& GetIterDesc() const {
+  const dnnl::memory::desc& GetIterDesc() const {
     return weights_iter_desc_;
   }
 
-  const mkldnn::memory::desc& GetProjDesc() const {
+  const dnnl::memory::desc& GetProjDesc() const {
     return weights_proj_desc_;
   }
 
-  const mkldnn::memory::desc& GetWorkspaceDesc() const {
+  const dnnl::memory::desc& GetWorkspaceDesc() const {
     return workspace_desc_;
   }
 
  private:
   std::shared_ptr<void> fwd_pd_;
-  std::shared_ptr<mkldnn::primitive> primitive_;
-  mkldnn::memory::desc weights_layer_desc_;
-  mkldnn::memory::desc weights_iter_desc_;
-  mkldnn::memory::desc weights_proj_desc_;
-  mkldnn::memory::desc workspace_desc_;
+  std::shared_ptr<dnnl::primitive> primitive_;
+  dnnl::memory::desc weights_layer_desc_;
+  dnnl::memory::desc weights_iter_desc_;
+  dnnl::memory::desc weights_proj_desc_;
+  dnnl::memory::desc workspace_desc_;
 };
 
-RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam& layer_param,
+RnnPrimitive GetRnnFwdPrim(const DNNLRnnLayerParam& layer_param,
                            const bool is_train,
                            const NDArray& data,
                            const NDArray& params);
 
 /*
- * Use this to manage memory and primitive of MKL-DNN RNN forward inference.
+ * Use this to manage memory and primitive of DNNL RNN forward inference.
  */
-class MKLDNNRnnForward {
+class DNNLRnnForward {
  public:
-  MKLDNNRnnForward(const Context ctx,
-                   const MKLDNNRnnLayerParam& layer_param,
-                   const bool is_train,
-                   const NDArray& data,
-                   const NDArray& params)
+  DNNLRnnForward(const Context ctx,
+                 const DNNLRnnLayerParam& layer_param,
+                 const bool is_train,
+                 const NDArray& data,
+                 const NDArray& params)
       : ctx_(ctx),
         initialized_(false),
         param_(layer_param),
@@ -260,7 +259,7 @@ class MKLDNNRnnForward {
                      const int dtype     = mshadow::kFloat32);
   void ReorderWeights();
 
-  const mkldnn::primitive& GetFwd() const {
+  const dnnl::primitive& GetFwd() const {
     return fwd_inf_.GetPrim();
   }
 
@@ -270,11 +269,11 @@ class MKLDNNRnnForward {
     return size;
   }
 
-  const MKLDNNRnnLayerParam& GetParam() const {
+  const DNNLRnnLayerParam& GetParam() const {
     return param_;
   }
 
-  const mkldnn_args_map_t& GetArgsMap() const {
+  const dnnl_args_map_t& GetArgsMap() const {
     return net_args_;
   }
 
@@ -288,79 +287,79 @@ class MKLDNNRnnForward {
  private:
   Context ctx_;
   bool initialized_;
-  MKLDNNRnnLayerParam param_;
+  DNNLRnnLayerParam param_;
   RnnPrimitive fwd_inf_;  // forward inference primitive
 
-  MKLDNNRnnMemMgr mem_mgr_;
-  mkldnn::memory* weights_layer_ = nullptr;
-  mkldnn::memory* weights_iter_  = nullptr;
-  mkldnn::memory* weights_proj_  = nullptr;
-  mkldnn::memory* bias_          = nullptr;
+  DNNLRnnMemMgr mem_mgr_;
+  dnnl::memory* weights_layer_ = nullptr;
+  dnnl::memory* weights_iter_  = nullptr;
+  dnnl::memory* weights_proj_  = nullptr;
+  dnnl::memory* bias_          = nullptr;
 
-  mkldnn::memory* weights_layer_r_ = nullptr;
-  mkldnn::memory* weights_iter_r_  = nullptr;
-  mkldnn::memory* weights_proj_r_  = nullptr;
+  dnnl::memory* weights_layer_r_ = nullptr;
+  dnnl::memory* weights_iter_r_  = nullptr;
+  dnnl::memory* weights_proj_r_  = nullptr;
 
   /*
    * net_args must contain some keys as below:
-   *   MKLDNN_ARG_SRC
-   *   MKLDNN_ARG_SRC_ITER
-   *   MKLDNN_WEIGHTS_LAYER
-   *   MKLDNN_WEIGHTS_ITER
-   *   MKLDNN_BIAS
-   *   MKLDNN_ARG_DST
-   *   MKLDNN_ARG_DST_ITER
+   *   DNNL_ARG_SRC
+   *   DNNL_ARG_SRC_ITER
+   *   DNNL_ARG_WEIGHTS_LAYER
+   *   DNNL_ARG_WEIGHTS_ITER
+   *   DNNL_ARG_BIAS
+   *   DNNL_ARG_DST
+   *   DNNL_ARG_DST_ITER
    * if mode == Lstm, it also needs two additional key:
-   *   MKLDNN_ARG_SRC_ITER_C
-   *   MKLDNN_ARG_DST_ITER_C
+   *   DNNL_ARG_SRC_ITER_C
+   *   DNNL_ARG_DST_ITER_C
    */
-  mkldnn_args_map_t net_args_;
+  dnnl_args_map_t net_args_;
 
-  friend class MKLDNNRnnForwardTraining;
+  friend class DNNLRnnForwardTraining;
 };
 
-typedef std::shared_ptr<mkldnn::memory> mkldnn_shared_mem_t;
+typedef std::shared_ptr<dnnl::memory> dnnl_shared_mem_t;
 /*
- * Use this to manage memory and primitive of MKL-DNN RNN forward training.
+ * Use this to manage memory and primitive of DNNL RNN forward training.
  */
-class MKLDNNRnnForwardTraining {
+class DNNLRnnForwardTraining {
  public:
-  MKLDNNRnnForwardTraining(const MKLDNNRnnLayerParam& layer_param,
-                           const bool is_train,
-                           const NDArray& data,
-                           const NDArray& params)
+  DNNLRnnForwardTraining(const DNNLRnnLayerParam& layer_param,
... 13616 lines suppressed ...