Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/12 22:18:00 UTC

[GitHub] piiswrong closed pull request #7931: MKL-DNN integration: request for reviews

piiswrong closed pull request #7931: MKL-DNN integration: request for reviews
URL: https://github.com/apache/incubator-mxnet/pull/7931
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/MKL_README.md b/MKL_README.md
index 80a31c9a40..ea0c6b0976 100644
--- a/MKL_README.md
+++ b/MKL_README.md
@@ -1,12 +1,68 @@
-# MKL2017 PLUGIN
+# Intel(R) Math Kernel Library Optimizations for Machine Learning
 
 MKL2017 is an INTEL released library to accelerate Deep Neural Network (DNN) applications on Intel architecture.
 
-MKL2017_ML is a subset of MKL2017 and only contains DNN acceleration feature, MKL2017 release cycle is longer then MKL2017_ML and MKL2017_ML support latest feature
+MKL2017_ML is a subset of MKL2017 that contains only the DNN acceleration features; the MKL2017 release cycle is longer than that of MKL2017_ML, and MKL2017_ML supports the latest features.
 
-This README shows the user how to setup and install MKL2017 library with mxnet.
+[Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/01org/mkl-dnn) is a new open source performance library specially designed for accelerating Deep Learning (DL) applications on Intel(R) architecture. 
 
-## Build/Install MXNet with MKL:
+Intel MKL-DNN includes functionality similar to Intel(R) Math Kernel Library (Intel(R) MKL) 2017, and adds several new optimizations for Deep Learning workloads.
+
+This README shows the user how to set up and install MXNet with either MKL2017 or the newer MKL-DNN. Please choose one; MXNet cannot be built with both options.
+
+## Quick Start
+-----
+The quick start instructions below assume the user wants to install MKL2017 or MKLDNN under the default /usr/local folder, which requires sudo.
+
+Please refer to the detailed instructions below for custom options.
+
+* Quick start assumes the system dependencies are already installed. We used the following to validate the quick start instructions.
+
+``` bash
+# centos 7
+sudo yum -y install make gcc gcc-c++ cmake git wget curl atlas-devel opencv-devel python-devel python-setuptools graphviz 
+```
+
+``` bash
+# ubuntu 14.04 or ubuntu 16.04
+sudo apt-get install -y build-essential cmake git wget curl libatlas-base-dev libopencv-dev python-dev graphviz
+```
+
+``` bash
+# python dependencies for all platforms
+
+ # install pip if needed
+ pushd /tmp && curl -LO https://bootstrap.pypa.io/get-pip.py && sudo -H python get-pip.py && popd
+
+ # sample python packages
+ sudo -H pip install --upgrade jupyter graphviz cython pandas bokeh matplotlib opencv-python requests
+```
+
+* Quick Start for MXNet with MKL2017
+  ``` bash
+  sudo make -j$(nproc) USE_MKL2017=1 USE_MKL2017_EXPERIMENTAL=1
+  cd python && sudo python setup.py install
+  ```
+* Quick Start for MXNet with MKL-DNN
+  ``` bash
+  sudo make -j$(nproc) USE_MKLDNN=1
+  cd python && sudo python setup.py install
+  
+  # in user mode, without sudo.
+  make -j$(nproc) USE_MKLDNN=1 MKLDNN_ROOT=~/mkldnn
+  cd python && python setup.py install --user --prefix=
+  export LD_LIBRARY_PATH=~/mkldnn/lib:$LD_LIBRARY_PATH
+  ```
+* Quick test, with recommended tuning settings.
+  ``` bash
+  export OMP_NUM_THREADS=$(lscpu | awk -F":" '/^Socket/{s=$2} /^Core/{c=$2} END{print s*c}')
+  export KMP_AFFINITY=granularity=fine,compact,1,0;
+
+  python example/image-classification/benchmark_score.py
+  ```
+
+## Build/Install MXNet with MKL2017:
+-------------
 
   1. Enable USE_MKL2017=1 in make/config.mk
 
@@ -32,7 +88,28 @@ This README shows the user how to setup and install MKL2017 library with mxnet.
 
     1.4 MKL version compatibility
         
-        1.3.2.1 If you already have MKL installed and MKLROOT being set in your system, by default, it will not attempt to download the latest mklml package unless you unset MKLROOT. 
+        1.4.1 If you already have MKL installed and MKLROOT set in your system, by default it will not attempt to download the latest mklml package unless you unset MKLROOT. 
+
+  2. Run 'make -jX'
+       
+  3. Navigate into the python directory
+  
+  4. Run 'sudo python setup.py install'
+
+## Build/Install MXNet with MKL-DNN:
+-------------
+
+  1. Enable USE_MKLDNN=1 in make/config.mk
+
+     - The MKLDNN_ROOT option in make/config.mk allows the user to choose the install folder for MKLDNN. By default it is set to /usr/local, which requires sudo. If set to empty, MKLDNN will be installed under the external/mkldnn/install folder (see the build sketch after this README section). 
+
+     - When you execute make, the Makefile runs "prepare_mkldnn.sh" to download and build MKLDNN with mklml under the external/mkldnn folder. It then installs the library into the location specified by the option above.
+
+     - If you choose to install MKLDNN in a custom folder, please set 
+
+         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$MKLDNN_ROOT/lib
+
+     - By default, mshadow can utilize the MKL BLAS functions in the MKLDNN mklml package.
 
   2. Run 'make -jX'
        
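For reference, here is a minimal sketch of the config.mk route described in the MKL-DNN section above. The sed edits and the $HOME/mkldnn install folder are illustrative assumptions, not commands from this PR; adapt them to your setup.

``` bash
# Enable MKL-DNN and point MKLDNN_ROOT at a user-writable folder (illustrative values).
sed -i 's|^USE_MKLDNN *=.*|USE_MKLDNN = 1|' make/config.mk
sed -i "s|^MKLDNN_ROOT *=.*|MKLDNN_ROOT = $HOME/mkldnn|" make/config.mk

# Step 2: make invokes prepare_mkldnn.sh, which downloads and builds MKL-DNN.
make -j$(nproc)

# Steps 3-4, user-mode variant of the python install.
cd python && python setup.py install --user --prefix=

# A custom install folder is not known to the loader, so extend LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=$HOME/mkldnn/lib:$LD_LIBRARY_PATH
```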
diff --git a/Makefile b/Makefile
index e821c6faa5..87c4531daa 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,14 @@ endif
 # use customized config file
 include $(config)
 
-ifeq ($(USE_MKL2017), 1)
-# must run ./prepare_mkl before including mshadow.mk
+# Check and prepare either MKLDNN or MKL2017
+ifeq ($(USE_MKLDNN), 1)
+	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
+	MKLDNNROOT := $(firstword $(RETURN_STRING))
+	MKLROOT := $(lastword $(RETURN_STRING))
+	export USE_MKLML = 1
+	USE_MKL2017=0
+else ifeq ($(USE_MKL2017), 1)
 	RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT))
 	MKLROOT := $(firstword $(RETURN_STRING))
 	export USE_MKLML = $(lastword $(RETURN_STRING))
@@ -123,6 +129,19 @@ ifeq ($(USE_MKL2017), 1)
 	else
 		CFLAGS += -DMKL_EXPERIMENTAL=0
 	endif
+else ifeq ($(USE_MKLDNN), 1)
+	CFLAGS += -DMXNET_USE_MKLDNN=1
+	CFLAGS += -DUSE_MKL=1
+	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
+	ifneq ($(MKLDNNROOT), $(MKLROOT))
+	  CFLAGS += -I$(MKLROOT)/include
+	  LDFLAGS += -L$(MKLROOT)/lib
+	endif
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn
+endif
+
+ifeq ($(USE_MKLML), 1)
 	ifeq ($(UNAME_S), Darwin)
 		LDFLAGS += -lmklml
 	else
@@ -138,7 +157,7 @@ endif
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
-ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
 ifeq (,$(wildcard $(USE_LAPACK_PATH)/liblapack.a))
@@ -154,7 +173,7 @@ ifeq ($(USE_LAPACK), 1)
 	ifneq ($(USE_LAPACK_PATH), )
 		LDFLAGS += -L$(USE_LAPACK_PATH)
 	endif
-	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 		LDFLAGS += -llapack
 	endif
 	CFLAGS += -DMXNET_USE_LAPACK
@@ -511,7 +530,8 @@ jnilint:
 ifneq ($(EXTRA_OPERATORS),)
 clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
+		external/mkldnn/install/*
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
@@ -521,7 +541,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 else
 clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
+		external/mkldnn/install/*
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
@@ -529,7 +550,6 @@ clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 endif
 
 clean_all: clean
-
 -include build/*.d
 -include build/*/*.d
 -include build/*/*/*.d
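As an aside on the USE_MKLDNN branch above: prepare_mkldnn.sh prints the two install roots on a single stdout line (its progress messages go to stderr), and the Makefile splits them with $(firstword ...) and $(lastword ...). A rough bash equivalent of that handshake, for illustration only:

``` bash
# prepare_mkldnn.sh ends with: echo $MKLDNNROOT $MKLROOT
# (or the user-supplied folder twice); progress text goes to stderr.
RETURN_STRING="$(./prepare_mkldnn.sh "$MKLDNN_ROOT")"
MKLDNNROOT="${RETURN_STRING%% *}"   # like $(firstword ...)
MKLROOT="${RETURN_STRING##* }"      # like $(lastword ...)
echo "MKL-DNN root: $MKLDNNROOT, MKL/mklml root: $MKLROOT"
```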
diff --git a/Makefile.mkldnn b/Makefile.mkldnn
new file mode 100755
index 0000000000..9d4a294a40
--- /dev/null
+++ b/Makefile.mkldnn
@@ -0,0 +1,79 @@
+MXNET_ROOTDIR := $(shell pwd)
+MKLDNN_ROOTDIR := external/mkldnn
+MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp
+MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src
+MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build
+MKLDNN_INSTALLDIR := $(MKLDNN_ROOTDIR)/install
+MKLDNN_COMMIT := `cat ${MXNET_ROOTDIR}/mkldnn.commit`
+MKLDNN_CXX := g++
+MKLDNN_CC := gcc
+
+RETURN_STRING=$(shell ./prepare_mkl.sh $(MKLML_ROOT))
+MKLROOT=$(firstword $(RETURN_STRING))
+MKL_ROOTDIR := $(MKLROOT)
+
+# We do this because earlier versions of CMake have problems with ccache
+ifneq (,$(findstring ccache,$(CXX)))
+	MKLDNN_CXX := $(lastword $(CXX))
+endif
+
+ifneq (,$(findstring ccache,$(CC)))
+	MKLDNN_CC := $(lastword $(CC))
+endif
+
+MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MXNET_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MXNET_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+
+ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
+# mkldnn_download:
+# 	git clone --no-checkout $(MKLDNN_GITHUB) $(MKLDNN_TMPDIR)
+# 	rsync -a $(MKLDNN_TMPDIR)/ $(MKLDNN_SRCDIR) && rm -rf $(MKLDNN_TMPDIR)
+# 	cd $(MKLDNN_SRCDIR) && git reset --hard $(MKLDNN_COMMIT)
+# 	echo "download end"
+# 	cmake $(MKLDNN_CMAKE_FLAGS)
+# 	make -C $(MXNET_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
+# 	make -C $(MXNET_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+# 	echo "build end"
+
+mkldnn:
+	git clone --no-checkout $(MKLDNN_GITHUB) $(MKLDNN_TMPDIR)
+	rsync -a $(MKLDNN_TMPDIR)/ $(MKLDNN_SRCDIR) && rm -rf $(MKLDNN_TMPDIR)
+	cd $(MKLDNN_SRCDIR) && git reset --hard $(MKLDNN_COMMIT)
+	echo "download end"
+	echo "build start"
+	cmake $(MKLDNN_CMAKE_FLAGS)
+	make -C $(MXNET_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
+	make -C $(MXNET_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+# CFLAGS += -I$(MKLDNN_INSTALLDIR)/include
+# LDFLAGS += -lmkldnn -L$(MKLDNN_INSTALLDIR)/lib
+# echo "mkldnnroot_notset"
+# else
+# # mkldnn_download:
+# mkldnn_build:
+endif
+
+mkldnn_clean:
+	@rm -rf $(MKLDNN_SRCDIR) $(MKLDNN_BUILDDIR) $(MKLDNN_INSTALLDIR) $(MKLDNN_TMPDIR)
+
+mkldnnroot_set:
+	# CFLAGS += -DMKLDNN_SUPPORTED
+	CFLAGS += -I$(MKLDNN_INSTALLDIR)/include
+	LDFLAGS += -lmkldnn -L$(MKLDNN_INSTALLDIR)/lib
+	echo "mkldnnroot_set"
+	# LDFLAGS += -Wl,-rpath,$(MKLDNN_INSTALLDIR)/lib
+
+
+mkldnnroot_notset: mkldnn_build
+	# CFLAGS += -DMKLDNN_SUPPORTED
+	CFLAGS += -I$(MKLDNN_INSTALLDIR)/include
+	LDFLAGS += -lmkldnn -L$(MKLDNN_INSTALLDIR)/lib
+	echo "mkldnnroot_notset"
+	# LDFLAGS += -Wl,-rpath,$(MKLDNN_INSTALLDIR)/lib
+
+# ifneq ($(origin MKLDNNROOT), undefined)
+# ifdef MKLDNNROOT
+# mkldnn: mkldnn
+# endif
+# else
+# mkldnn: mkldnn
+# endif
diff --git a/README.md b/README.md
index fc252a7a72..79f2325357 100644
--- a/README.md
+++ b/README.md
@@ -83,4 +83,4 @@ In Neural Information Processing Systems, Workshop on Machine Learning Systems,
 
 History
 -------
-MXNet emerged from a collaboration by the authors of [cxxnet](https://github.com/dmlc/cxxnet), [minerva](https://github.com/dmlc/minerva), and [purine2](https://github.com/purine/purine2). The project reflects what we have learned from the past projects. MXNet combines aspects of each of these projects to achieve flexibility, speed, and memory efficiency.
+MXNet emerged from a collaboration by the authors of [cxxnet](https://github.com/dmlc/cxxnet), [minerva](https://github.com/dmlc/minerva), and [purine2](https://github.com/purine/purine2). The project reflects what we have learned from the past projects. MXNet combines aspects of each of these projects to achieve flexibility, speed, and memory efficiency.
\ No newline at end of file
diff --git a/dlpack b/dlpack
index a6e09b58dc..9422e98f3f 160000
--- a/dlpack
+++ b/dlpack
@@ -1 +1 @@
-Subproject commit a6e09b58dc00ee0065f5b7879800e646fbb01d1e
+Subproject commit 9422e98f3f4dafc6bc3473cf8484543ad376aab6
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 84ee9fa5e4..c6c4f354ec 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -37,7 +37,7 @@
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
 #include <mkl_memory.h>
 #endif
 // check c++11
@@ -70,7 +70,7 @@ class NDArray {
  public:
   /*! \brief default constructor */
   NDArray() {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = MKLMemHolder::create();
 #endif
   }
@@ -86,7 +86,7 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
         shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = std::make_shared<MKLMemHolder>();
 #endif
   }
@@ -132,7 +132,7 @@ class NDArray {
       }
       ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
                                      dtype, aux_types, aux_shapes);
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
       Mkl_mem_ = std::make_shared<MKLMemHolder>();
 #endif
   }
@@ -147,7 +147,7 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
         dtype_(data.type_flag_), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = std::make_shared<MKLMemHolder>();
 #endif
   }
@@ -166,7 +166,7 @@ class NDArray {
           const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
       : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
         dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = std::make_shared<MKLMemHolder>();
 #endif
   }
@@ -253,7 +253,7 @@ class NDArray {
             << "Unexpected storage type: " << stype;
       res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
     });
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     res.Mkl_mem_ = Mkl_mem_;
 #endif
     return res;
@@ -497,7 +497,7 @@ class NDArray {
     CHECK_GE(ptr_->shandle.size,
              shape.Size() * mshadow::mshadow_sizeof(dtype))
         << "NDArray.AsArray: target memory size is bigger";
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     if (Mkl_mem_ != nullptr) {
       // convert prv to cpu
       Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr);
@@ -844,12 +844,12 @@ class NDArray {
     tblob_.shape_ = shape;
     tblob_.type_flag_ = dtype_;
     tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     tblob_.Mkl_mem_ = Mkl_mem_;
 #endif
   }
 
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
   std::shared_ptr<MKLMemHolder> Mkl_mem_;
 #endif
   /*! \brief internal data of NDArray */
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index 18bf4fa780..811dfc5610 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -35,7 +35,7 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
-#if MXNET_USE_MKL2017 == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
 #include <mkl_memory.h>
 #endif
 namespace mxnet {
@@ -66,14 +66,14 @@ class TBlob {
   int type_flag_;
 
   /*! \brief storing mkl chunk buffer blob, use for experimental only */
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
   std::shared_ptr<MKLMemHolder> Mkl_mem_;
 #endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL),
         type_flag_(mshadow::DataType<real_t>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = NULL;
 #endif
     SetDLTensor(cpu::kDevMask, 0);
@@ -89,7 +89,7 @@ class TBlob {
   TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1)
       : dptr_(dptr), shape_(shape),
         type_flag_(mshadow::DataType<DType>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = NULL;
 #endif
     SetDLTensor(dev_mask, dev_id);
@@ -104,7 +104,7 @@ class TBlob {
    */
   TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1)
       : dptr_(dptr), shape_(shape), type_flag_(type_flag) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = NULL;
 #endif
     SetDLTensor(dev_mask, dev_id);
@@ -134,7 +134,7 @@ class TBlob {
     shape_ = src.shape_;
     type_flag_ = mshadow::DataType<DType>::kFlag;
     SetDLTensor(Device::kDevMask, -1);
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     Mkl_mem_ = NULL;
 #endif
     return *this;
@@ -171,7 +171,7 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     if (Mkl_mem_ != nullptr) {
       Mkl_mem_->check_and_prv_to_cpu(dptr_);
     }
@@ -216,7 +216,7 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     if (Mkl_mem_ != nullptr) {
       Mkl_mem_->check_and_prv_to_cpu(dptr_);
     }
diff --git a/make/config.mk b/make/config.mk
index d47d4d6931..59262bb246 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -85,6 +85,12 @@ USE_MKL2017 = 0
 # Prerequisite USE_MKL2017=1
 USE_MKL2017_EXPERIMENTAL = 0
 
+# Use the new open source Intel MKL-DNN library instead of MKLML
+USE_MKLDNN=0
+# MKLDNN root install folder; root access is needed for /usr/local
+# Change to user dir for standard user, or leave empty to use local 'external' dir
+MKLDNN_ROOT=/usr/local
+
 # whether use NNPACK library
 USE_NNPACK = 0
 
@@ -110,11 +116,14 @@ USE_LAPACK_PATH =
 USE_INTEL_PATH = NONE
 
 # If use MKL only for BLAS, choose static link automatically to allow python wrapper
-ifeq ($(USE_MKL2017), 0)
 ifeq ($(USE_BLAS), mkl)
 USE_STATIC_MKL = 1
 endif
-else
+
+ifeq ($(USE_MKL2017), 0)
+USE_STATIC_MKL = NONE
+endif
+ifeq ($(USE_MKLDNN), 0)
 USE_STATIC_MKL = NONE
 endif
 
diff --git a/mshadow b/mshadow
index cb5c9872b5..eced9571c6 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit cb5c9872b542220be5b99f3aca0e1ff56e31b028
+Subproject commit eced9571c601260e3a2deb0e8e9a41e8c0a9f0f1
diff --git a/nnvm b/nnvm
index c86afa8f17..8d79cfd0b4 160000
--- a/nnvm
+++ b/nnvm
@@ -1 +1 @@
-Subproject commit c86afa8f17a44bcd4e6eec41cd49ba87e4f7a635
+Subproject commit 8d79cfd0b42fbe9f6ad75886d495065d5500b9dd
diff --git a/prepare_mkl.sh b/prepare_mkl.sh
index 139a7aadbe..62ba6bbca7 100755
--- a/prepare_mkl.sh
+++ b/prepare_mkl.sh
@@ -115,7 +115,7 @@ if [ -z $MKLROOT ]; then
 fi
 
 # Check what MKL lib we have in MKLROOT
-if [ -z `find $MKLROOT -name libmklml_gnu.so -o -name libmklml.dylib -print -quit` ]; then
+if [ -z `find $MKLROOT -name libmklml* -print -quit` ]; then
   USE_MKLML=0
 elif [ -z `find $MKLROOT -name libmkl_core.so -print -quit` ]; then
   USE_MKLML=1
diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
new file mode 100755
index 0000000000..5d2e3852b7
--- /dev/null
+++ b/prepare_mkldnn.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# set -ex
+#
+# All modification made by Intel Corporation: © 2016 Intel Corporation
+#
+# All contributions by the University of California:
+# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+# All rights reserved.
+#
+# All other contributions:
+# Copyright (c) 2014, 2015, the respective contributors
+# All rights reserved.
+# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+#
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+MXNET_ROOTDIR="$(pwd)"
+MKLDNN_ROOTDIR="$MXNET_ROOTDIR/external/mkldnn"
+MKLDNN_GITHUB="https://github.com/01org/mkl-dnn.git"
+MKLDNN_TMPDIR="$MKLDNN_ROOTDIR/tmp"
+MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src"
+MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build"
+MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install"
+
+# MKL DNN release tag, or commit.
+MKLDNN_COMMIT="v0.10"
+
+# MKLDNN install destination
+HOME_MKLDNN=$1
+if [ ! -z "$HOME_MKLDNN" ]; then
+  mkdir -p $HOME_MKLDNN
+  if [ ! -w $HOME_MKLDNN ]; then
+    echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2
+    exit 1
+  fi
+fi
+
+if [ -z $MKLDNNROOT ]; then
+if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then
+    mkdir -p $MKLDNN_INSTALLDIR
+    if [ ! -d $MKLDNN_SRCDIR/.git ]; then
+      echo "Downloading MKLDNN ..." >&2
+      rm -rf $MKLDNN_SRCDIR
+      git clone --quiet --no-checkout $MKLDNN_GITHUB $MKLDNN_TMPDIR
+      rsync -a $MKLDNN_TMPDIR/ $MKLDNN_SRCDIR && rm -rf $MKLDNN_TMPDIR
+    fi
+    cd $MKLDNN_SRCDIR && git fetch --all && git reset --hard $MKLDNN_COMMIT 
+    if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then
+        rm -rf external && cd scripts && ./prepare_mkl.sh && cd ..
+        cp -a external/*/* $MKLDNN_INSTALLDIR/.
+    fi 
+    echo "Building MKLDNN ..." >&2
+    cd $MXNET_ROOTDIR
+    cmake $MKLDNN_SRCDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
+    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l)
+    make -C $MKLDNN_BUILDDIR install
+    rm -rf $MKLDNN_BUILDDIR
+fi
+MKLDNNROOT=$MKLDNN_INSTALLDIR
+fi
+
+if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then 
+  MKLROOT=$MKLDNNROOT;
+fi
+
+# user specified MKLDNN install folder
+if [ -d "$HOME_MKLDNN" ]; then
+  # skip the copy if the user already specified MKLDNNROOT as the destination
+  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
+  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
+  # update ldconfig if possible
+  if [ -w /etc/ld.so.conf.d ]; then
+    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
+  fi
+# return value to calling script (Makefile,cmake)
+  echo $HOME_MKLDNN $HOME_MKLDNN
+else
+  echo $MKLDNNROOT $MKLROOT
+fi
+
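Usage note on the script above: when given a user-writable destination it copies the MKL-DNN (and, if present, mklml) headers and libraries there, and it only updates ldconfig if /etc/ld.so.conf.d is writable. Otherwise the caller must extend LD_LIBRARY_PATH, as in the README's user-mode quick start. A hedged sketch, assuming a $HOME/mkldnn destination:

``` bash
# Download, build, and install MKL-DNN into a user folder (no sudo).
./prepare_mkldnn.sh ~/mkldnn

# Without root, /etc/ld.so.conf.d is not writable, so tell the loader where the library is.
export LD_LIBRARY_PATH=~/mkldnn/lib:$LD_LIBRARY_PATH

# Sanity check that the shared library landed where expected.
ls ~/mkldnn/lib/libmkldnn.so
```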
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index f595b44684..269979fb4a 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -34,6 +34,11 @@
 #include "../operator/mkl/mkl_memory-inl.h"
 #include "../operator/mkl/mkl_util-inl.h"
 #endif
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "../operator/mkl/mkldnn_memory-inl.h"
+#include "../operator/mkl/mkl_util-inl.h"
+#endif
 namespace mxnet {
 
 namespace op {
@@ -105,7 +110,7 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(state_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     mkl_tblobs_prv_to_cpu(in_data_);
     mkl_tblobs_prv_to_cpu(out_data_);
 #endif
@@ -174,7 +179,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     mkl_tblobs_prv_to_cpu(in_data_);
     mkl_tblobs_prv_to_cpu(out_data_);
 #endif
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
index a33c11ce54..bf3fef4868 100644
--- a/src/operator/activation.cc
+++ b/src/operator/activation.cc
@@ -29,12 +29,27 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_relu-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_relu-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
 template<>
 Operator *CreateOp<cpu>(ActivationParam param, int dtype, const TShape& dshape) {
   Operator *op = NULL;
+#if MXNET_USE_MKLDNN == 1
+  if (param.act_type == activation::kReLU) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new MKLDNNReluOp<cpu, float>();
+    default:
+      break;
+    }
+  }
+#endif
 #if MXNET_USE_MKL2017 == 1
   if (param.act_type == activation::kReLU && dshape.ndim() <= 4) {
       switch (dtype) {
@@ -46,7 +61,7 @@ Operator *CreateOp<cpu>(ActivationParam param, int dtype, const TShape& dshape)
           break;
       }
   }
-  if (enableMKLWarnGenerated())
+  if (EnableMklWarnGenerated())
     LOG(INFO) << MKLReluOp<cpu, float>::getName() << " Skip MKL optimization";
 #endif
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
diff --git a/src/operator/batch_norm.cc b/src/operator/batch_norm.cc
index 866b7fe619..9350c0fba7 100644
--- a/src/operator/batch_norm.cc
+++ b/src/operator/batch_norm.cc
@@ -30,6 +30,11 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_batch_norm-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_batch_norm-inl.h"
+#endif
 
 /*! \brief inverse standard deviation <-> variance */
 #define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
@@ -317,6 +322,30 @@ template<>
 Operator *CreateOp<cpu>(BatchNormParam param, const int dtype, const TShape& shape) {
   param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
   Operator *op = nullptr;
+#if MXNET_USE_MKLDNN == 1
+  if (shape.ndim() == 4
+      && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS
+      && shape[param.axis] % 8 == 0
+      && !mxnet::op::batchnorm::disable_mkl) {
+    switch (dtype) {
+      case mshadow::kFloat32:
+        op = new MKLDNNBatchNormOp<cpu, float>(param);
+        break;
+      default:
+        // MKL operator doesn't support half_t, so fall through
+        break;
+    }
+  }
+#define BATCHNORM_LOG_MKL_INFO() \
+  do { \
+    if (!mxnet::op::batchnorm::disable_mkl) { \
+      LOG(INFO) << MKLDNNBatchNormOp<cpu, float>::getName() \
+        << " Skipping MKL optimization (unsupported dimension, axis or type)"; \
+    } \
+  } while (0)
+#else
+#define BATCHNORM_LOG_MKL_INFO() ((void)0)
+#endif
 #if MXNET_USE_MKL2017 == 1
   if (shape.ndim() == 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
index 03a8b8049f..98049668b1 100644
--- a/src/operator/concat.cc
+++ b/src/operator/concat.cc
@@ -29,12 +29,29 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_concat-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_concat-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 template<>
 Operator* CreateOp<cpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
   Operator *op = NULL;
+#if MXNET_USE_MKLDNN == 1
+  if ((1 == param.dim) && (param.num_args > 1)) {
+    switch (dtype) {
+      case mshadow::kFloat32:
+        return new MKLDNNConcatOp<cpu, float>(param);
+    default:
+      break;
+    }
+  }
+  if (EnableMkldnnWarnGenerated())
+    LOG(INFO) << MKLDNNConcatOp<cpu, float>::getName() << " Skip MKL optimization";
+#endif
 #if MXNET_USE_MKL2017 == 1
   // MKL supports 4D input tensors only for concat operation
   // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h
@@ -52,7 +69,7 @@ Operator* CreateOp<cpu>(ConcatParam param, int dtype, std::vector<TShape> *in_sh
       break;
     }
   }
-  if (enableMKLWarnGenerated())
+  if (EnableMklWarnGenerated())
     LOG(INFO) << MKLConcatOp<cpu, float>::getName() << " Skip MKL optimization";
 #endif
   MSHADOW_TYPE_SWITCH(dtype, DType, {
diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc
index 55cfe4e085..fa97dda5a9 100644
--- a/src/operator/convolution.cc
+++ b/src/operator/convolution.cc
@@ -29,6 +29,11 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_convolution-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_convolution-inl.h"
+#endif  // MXNET_USE_MKLDNN
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
@@ -50,6 +55,19 @@ Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
     })
     return op;
   }
+#if MXNET_USE_MKLDNN == 1
+    if ((param.dilate[0] == 1 && param.dilate[1] == 1)
+        && param.kernel.ndim() == 2) {
+        switch (dtype) {
+        case mshadow::kFloat32:
+            return new MKLDNNConvolutionOp<cpu, float>(param);
+        default:
+            break;
+        }
+    }
+    if (EnableMkldnnWarnGenerated())
+      LOG(INFO) << "MKLDNNConvolutionOp Skip MKL DNN optimization";
+#endif
 #if MXNET_USE_MKL2017 == 1
   if ((param.dilate[0] == 1 && param.dilate[1] == 1)
       && param.kernel.ndim() == 2) {
diff --git a/src/operator/deconvolution.cc b/src/operator/deconvolution.cc
index 6a59ff6588..43182d8047 100644
--- a/src/operator/deconvolution.cc
+++ b/src/operator/deconvolution.cc
@@ -24,6 +24,11 @@
 */
 
 #include "./deconvolution-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_deconvolution-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -33,6 +38,18 @@ Operator* CreateOp<cpu>(DeconvolutionParam param, int dtype,
                         std::vector<TShape> *out_shape,
                         Context ctx) {
   Operator *op = NULL;
+#if MXNET_USE_MKLDNN == 1
+  if (param.kernel.ndim() == 2) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new MKLDNNDeConvolutionOp<cpu, float>(param);
+    default:
+      break;
+    }
+  }
+  if (EnableMkldnnWarnGenerated())
+    LOG(INFO) << "MKLDNNDeConvolutionOp Skip MKL DNN optimization";
+#endif
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     op = new DeconvolutionOp<cpu, DType>(param);
   });
diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc
index 82c32a7d25..e0ef7b1ff8 100644
--- a/src/operator/fully_connected.cc
+++ b/src/operator/fully_connected.cc
@@ -25,6 +25,11 @@
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_fully_connected-inl.h"
 #endif  // MXNET_USE_NNPACK
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkldnn_fully_connected-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -34,6 +39,16 @@ Operator* CreateOp<cpu>(FullyConnectedParam param, int dtype,
                         std::vector<TShape> *out_shape,
                         Context ctx) {
   Operator *op = NULL;
+#if MXNET_USE_MKLDNN == 1
+  switch (dtype) {
+  case mshadow::kFloat32:
+    return new MKLDNNFullyConnectedOp<cpu, float>(param);
+  default:
+    break;
+  }
+  if (EnableMkldnnWarnGenerated())
+    LOG(INFO) << "MKLDNNFullyConnectedOp Skip MKL DNN optimization";
+#endif
 #if MXNET_USE_NNPACK == 1
   const size_t batch_size = (*in_shape)[0][0];
   // nnp_fully_connected_inference will do optimization for batch-size = 1
diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc
index 46f4fca486..ed3c305b08 100644
--- a/src/operator/lrn.cc
+++ b/src/operator/lrn.cc
@@ -32,13 +32,32 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_lrn-inl.h"
 #endif
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkl_util-inl.h"
+#include "./mkl/mkldnn_lrn-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 template<>
 Operator* CreateOp<cpu>(LRNParam param, int dtype) {
+#if MXNET_USE_MKLDNN == 1
+  switch (dtype) {
+  case mshadow::kFloat32:
+    return new MKLDNNLRNOp<cpu, float>(param);
+  default:
+    break;
+  }
+#endif
 #if MXNET_USE_MKL2017 == 1
-  return new MKLLRNOp<cpu, float>(param);
+  switch (dtype) {
+  case mshadow::kFloat32:
+    return new MKLLRNOp<cpu, float>(param);
+  default:
+    break;
+  }
 #endif
   return new LocalResponseNormOp<cpu>(param);
 }
diff --git a/src/operator/mkl/mkl_conv-common-inl.h b/src/operator/mkl/mkl_conv-common-inl.h
new file mode 100644
index 0000000000..91b99bd5b9
--- /dev/null
+++ b/src/operator/mkl/mkl_conv-common-inl.h
@@ -0,0 +1,82 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_conv-common-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_CONV_COMMON_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_CONV_COMMON_INL_H_
+
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <mxnet/storage.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "mkl_util-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template <typename xpu, typename DType>
+class MKLConvCommon {
+ public:
+  MKLConvCommon(): width_(0), height_(0), width_out_(0),
+    height_out_(0), kernel_w_(0), kernel_h_(0),
+    stride_w_(0), stride_h_(0), pad_w_(0), pad_h_(0)  {}
+  virtual ~MKLConvCommon() {}
+
+  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
+    int blob_byte_size = blob_size * sizeof(DType);
+    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
+    memcpy(pws->dptr, src, blob_byte_size);
+  }
+  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
+    DType *dst = reinterpret_cast<DType*>(dst_);
+    DType *src = reinterpret_cast<DType*>(pws->dptr);
+    for (int i = 0; i < blob_size; i++) {
+      dst[i] += src[i];
+    }
+    if (pws->dptr)
+      Storage::Get()->Free(*pws);
+    pws->dptr = NULL;
+  }
+
+ protected:
+  int width_,
+    height_,
+    width_out_,
+    height_out_,
+    kernel_w_,
+    kernel_h_,
+    stride_w_,
+    stride_h_;
+  int group_,
+    num_,
+    channel_output_;
+  size_t channels_;
+  int pad_w_,
+    pad_h_;
+};
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_CONV_COMMON_INL_H_
diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc
index 507e5498c8..3deae273ce 100644
--- a/src/operator/mkl/mkl_cppwrapper.cc
+++ b/src/operator/mkl/mkl_cppwrapper.cc
@@ -38,7 +38,7 @@ int getMKLBuildDate() {
     return build;
 }
 
-bool enableMKLWarnGenerated() {
+bool EnableMklWarnGenerated() {
   return false;
 }
 #endif  // MSHADOW_USE_MKL2017
diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h
index 7d66f20ad3..338419a501 100644
--- a/src/operator/mkl/mkl_cppwrapper.h
+++ b/src/operator/mkl/mkl_cppwrapper.h
@@ -32,7 +32,7 @@
 
 
 extern int getMKLBuildDate();
-extern bool enableMKLWarnGenerated();
+extern bool EnableMklWarnGenerated();
 
 
 template <typename Dtype> inline dnnError_t dnnLayoutCreate(
diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h
index 13f1fd27b1..09cf84beb7 100644
--- a/src/operator/mkl/mkl_memory.h
+++ b/src/operator/mkl/mkl_memory.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016 Intel Corporation
+* Copyright 2016-2017 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ struct PrvMemDescr {
   virtual PrvDescrType get_descr_type() = 0;
 };
 
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
 // Currently HEAD_AT_PRV do not free CPU data
 enum SyncedHead {
   HEAD_AT_CPU,
@@ -64,12 +64,15 @@ struct MKLMemHolder {
     b_eager_mode = eager_mode;
   }
   void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
-    head_ = HEAD_AT_PRV;
+    if (descriptor != nullptr) head_ = HEAD_AT_PRV;
     prv_descriptor_ = descriptor;
   }
   std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
     return  prv_descriptor_;
   }
+  bool head_at_cpu() {
+    return (head_ == HEAD_AT_CPU) ? true : false;
+  }
   bool head_at_prv() {
     return (head_ == HEAD_AT_PRV) ? true : false;
   }
@@ -97,10 +100,11 @@ struct MKLMemHolder {
   static std::shared_ptr<MKLMemHolder> create() {
     return std::make_shared<MKLMemHolder>();
   }
-  void  check_and_prv_to_cpu(void *dptr_) {
+  void  check_and_prv_to_cpu(void *dptr_, bool convert = true) {
     if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
       CHECK(prv_descriptor_ != nullptr);
-      prv_descriptor_->convert_from_prv(dptr_);
+      if (convert)
+        prv_descriptor_->convert_from_prv(dptr_);
       // Because operator use CPU & maybe change it, change to CPU Flag
       head_ = HEAD_AT_CPU;
     }
@@ -112,6 +116,7 @@ struct MKLMemHolder {
     head_(HEAD_AT_CPU), prv_descriptor_(nullptr),
     b_disable_prv_2_cpu(false), b_eager_mode(false) {}
 };
+
 #else
 struct MKLMemHolder {
  public:
diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h
index 4ad786a2ce..87beab6415 100644
--- a/src/operator/mkl/mkl_util-inl.h
+++ b/src/operator/mkl/mkl_util-inl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016 Intel Corporation
+* Copyright 2016-2017 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,6 +22,12 @@
 #ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
 #define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
 #include <vector>
+#if MXNET_USE_MKL2017 == 1
+#include "mkl_memory-inl.h"
+#endif
+#if MXNET_USE_MKLDNN == 1
+#include "mkldnn_memory-inl.h"
+#endif
 #define MKLDNN_CALL(func)                                                               \
   {                                                                                     \
     dnnError_t status = (func);                                                                \
@@ -32,7 +38,7 @@
 namespace mxnet {
 namespace op {
 
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
   template<typename DType>
   inline DType * mkl_prv_data(const TBlob &b) {
     std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
@@ -54,7 +60,7 @@ namespace op {
   }
 #endif
   inline void mkl_set_priv_flag(const TBlob &b) {
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
     std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
     bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
     if (mem_valid) {
@@ -62,7 +68,21 @@ namespace op {
     }
 #endif
   }
-#if MKL_EXPERIMENTAL == 1
+#if MXNET_USE_MKLDNN == 1
+  template<typename DType>
+  inline std::shared_ptr<MKLDNNData<DType> > mkl_get_mem_desc(
+    const std::shared_ptr<MKLMemHolder> data_mem) {
+    std::shared_ptr<PrvMemDescr> prv_descriptor =
+      data_mem->get_prv_descriptor();
+    CHECK_EQ(prv_descriptor->get_descr_type(),
+      PrvMemDescr::PRV_DESCR_MKLDNN);
+    std::shared_ptr<MKLDNNData<DType> > mem_descr
+      = std::static_pointer_cast<MKLDNNData<DType>>(prv_descriptor);
+    CHECK(mem_descr != NULL);
+    return mem_descr;
+  }
+#endif
+#if MXNET_USE_MKL2017 == 1
   template<typename DType>
   inline std::shared_ptr<MKLData<DType> > mkl_get_mem_desc(
     const std::shared_ptr<MKLMemHolder> data_mem) {
@@ -90,7 +110,7 @@ namespace op {
     return b.get_with_shape<xpu, dim, DType>(shape, s);
   }
 }  // namespace op
-#if MKL_EXPERIMENTAL == 1
+#if MKL_EXPERIMENTAL == 1 || MXNET_USE_MKLDNN == 1
 inline void mkl_tblobs_prv_to_cpu(const std::vector<TBlob> &data) {
   for (size_t i = 0; i < data.size(); i++) {
     std::shared_ptr<MKLMemHolder> mem_holder = data[i].Mkl_mem_;
diff --git a/src/operator/mkl/mkldnn_base-inl.h b/src/operator/mkl/mkldnn_base-inl.h
new file mode 100644
index 0000000000..9092d1fb2b
--- /dev/null
+++ b/src/operator/mkl/mkldnn_base-inl.h
@@ -0,0 +1,164 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <vector>
+#include <iterator>
+#include "mkldnn.hpp"
+
+namespace mxnet {
+extern bool EnableMkldnnWarnGenerated();
+// =====  CpuEngine =======================================
+// cpu_engine singleton
+class CpuEngine {
+ public:
+    static CpuEngine & Instance() {
+        // It's thread-safe in C++11.
+        static CpuEngine myInstance;
+        return myInstance;
+    }
+    CpuEngine(CpuEngine const&) = delete;             // Copy construct
+    CpuEngine(CpuEngine&&) = delete;                  // Move construct
+    CpuEngine& operator=(CpuEngine const&) = delete;  // Copy assign
+    CpuEngine& operator=(CpuEngine &&) = delete;      // Move assign
+
+    mkldnn::engine & get_engine() { return _cpu_engine; }
+ protected:
+    CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
+    ~CpuEngine() {}
+ private:
+    mkldnn::engine _cpu_engine;
+};
+
+// =====  MKLDNNStream =======================================
+class MKLDNNStream {
+ public:
+    MKLDNNStream():_ready(false) { prepare(); }
+    virtual ~MKLDNNStream() {}
+    MKLDNNStream  &submit(std::vector<mkldnn::primitive> primitives) {
+        _stream->submit(primitives); return *this;
+    }
+    bool wait(bool block = true) {
+        _ready = false;
+        bool res = _stream->wait(block);
+        return res;
+    }
+    bool ready() { return _ready; }
+    void prepare() {
+        if (_ready == false) {
+            _stream.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+        }
+        _ready = true;
+    }
+
+ private:
+    bool _ready;
+    std::shared_ptr<mkldnn::stream> _stream;
+};
+
+// =====  StreamHolder =======================================
+// singleton
+class StreamHolder {
+ public:
+    static StreamHolder & Instance() {
+        // It's thread-safe in C++11.
+        static StreamHolder myInstance;
+        return myInstance;
+    }
+    StreamHolder(StreamHolder const&) = delete;             // Copy construct
+    StreamHolder(StreamHolder&&) = delete;                  // Move construct
+    StreamHolder& operator=(StreamHolder const&) = delete;  // Copy assign
+    StreamHolder& operator=(StreamHolder &&) = delete;      // Move assign
+
+    std::shared_ptr<MKLDNNStream> get_stream();
+    std::shared_ptr<MKLDNNStream> current_stream() { return _current_stream; }
+    void prepare_mkldnn_stream(std::shared_ptr<MKLDNNStream> mkldnn_stream) {
+        _current_stream = mkldnn_stream;
+        _current_stream->prepare();
+    }
+ protected:
+    StreamHolder() : _current_stream(NULL) {}
+    ~StreamHolder() {}
+ private:
+    std::shared_ptr<MKLDNNStream> _current_stream;
+};
+
+// =====  MKLDNNLayer =======================================
+template <typename Dtype>
+class MKLDNNLayer {
+ public:
+    MKLDNNLayer() {}
+    virtual ~MKLDNNLayer() {}
+};
+
+// =====  MKLDNNPrimitive =======================================
+template <typename Dtype>
+class MKLDNNPrimitive {
+ public:
+    MKLDNNPrimitive():aprimitive(NULL), mkldnn_stream(NULL) {}
+    virtual ~MKLDNNPrimitive() {}
+    void reset(mkldnn::primitive* pprimitive) { this->aprimitive.reset(pprimitive);}
+    std::shared_ptr<mkldnn::primitive> aprimitive;
+    std::shared_ptr<MKLDNNStream> mkldnn_stream;
+    std::shared_ptr<MKLDNNStream> get_mkldnn_stream();
+    std::shared_ptr<MKLDNNStream> submit();
+};
+
+// type enumerator
+template<typename T>
+struct data_type_enum {};
+
+template<>
+struct data_type_enum<float> {
+    enum { type = mkldnn::memory::data_type::f32 };
+};
+
+template<>
+struct data_type_enum<int32_t> {
+    enum { type = mkldnn::memory::data_type::s32 };
+};
+
+template<>
+struct data_type_enum<int16_t> {
+    enum { type = mkldnn::memory::data_type::s16 };
+};
+
+template<>
+struct data_type_enum<int8_t> {
+    enum { type = mkldnn::memory::data_type::s8 };
+};
+
+template<>
+struct data_type_enum<uint8_t> {
+    enum { type = mkldnn::memory::data_type::u8 };
+};
+
+}  // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_BASE_INL_H_
diff --git a/src/operator/mkl/mkldnn_base.cc b/src/operator/mkl/mkldnn_base.cc
new file mode 100644
index 0000000000..97e0c91b30
--- /dev/null
+++ b/src/operator/mkl/mkldnn_base.cc
@@ -0,0 +1,74 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base.cc
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#include <dmlc/logging.h>
+#ifdef MXNET_USE_MKLDNN
+#include "mkldnn_base-inl.h"
+using namespace mkldnn;
+namespace mxnet {
+
+bool EnableMkldnnWarnGenerated() {
+  return false;
+}
+std::shared_ptr<MKLDNNStream> StreamHolder::get_stream() {
+    if (this->_current_stream == NULL || !this->_current_stream->ready()) {
+        _current_stream.reset(new MKLDNNStream());
+    }
+    return _current_stream;
+}
+
+template <typename Dtype>
+std::shared_ptr<MKLDNNStream>  MKLDNNPrimitive<Dtype>::get_mkldnn_stream() {
+    if (mkldnn_stream == NULL)
+        mkldnn_stream = StreamHolder::Instance().get_stream();
+    else
+        StreamHolder::Instance().prepare_mkldnn_stream(mkldnn_stream);
+    return mkldnn_stream;
+}
+
+template <typename Dtype>
+std::shared_ptr<MKLDNNStream>  MKLDNNPrimitive<Dtype>::submit() {
+    CHECK(this->aprimitive);
+    try {
+        this->get_mkldnn_stream()->submit({*(this->aprimitive)}).wait();
+    } catch (std::exception& e) {
+        std::cout << e.what() << std::endl;
+    }
+    return mkldnn_stream;
+}
+
+template class MKLDNNLayer<double>;
+template class MKLDNNLayer<float>;
+template class MKLDNNLayer<uint8_t>;
+template class MKLDNNLayer<int8_t>;
+template class MKLDNNLayer<int32_t>;
+
+template class MKLDNNPrimitive<double>;
+template class MKLDNNPrimitive<float>;
+template class MKLDNNPrimitive<uint8_t>;
+template class MKLDNNPrimitive<int8_t>;
+template class MKLDNNPrimitive<int32_t>;
+}  // namespace mxnet
+#endif  // #ifdef MXNET_USE_MKLDNN
diff --git a/src/operator/mkl/mkldnn_batch_norm-inl.h b/src/operator/mkl/mkldnn_batch_norm-inl.h
new file mode 100644
index 0000000000..c680c2912a
--- /dev/null
+++ b/src/operator/mkl/mkldnn_batch_norm-inl.h
@@ -0,0 +1,418 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_batch_norm-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_BATCH_NORM_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_BATCH_NORM_INL_H_
+
+#include <mkldnn_types.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "mkldnn_base-inl.h"
+#include "mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+
+template<typename xpu, typename Dtype>
+class MKLDNNBatchNormOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  static int s_id_gen;
+  int m_id;
+  explicit MKLDNNBatchNormOp(BatchNormParam param) : MKLDNNLayer<Dtype>()
+    , fwd_top_data(NULL), fwd_bottom_data(NULL)
+    , fwd_inference_pd(NULL), fwd_training_pd(NULL)
+    , bwd_top_diff(NULL), bwd_bottom_diff(NULL), bwd_scaleshift_pd(NULL) {
+    this->param_ = param;
+    m_id = s_id_gen++;
+  }
+  virtual ~MKLDNNBatchNormOp() {
+  }
+  static std::string getName() {
+    std::string name = "MKLDNNBatchNormOp_";
+    // name = name + std::to_string(m_id);
+    return name;
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, Dtype> &data,
+                  const mshadow::Tensor<xpu, 4, Dtype> &out) {
+    eps_ = param_.eps;
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+    memory::data_type mpcsn = memory::data_type::f32;
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    fwd_usr_input_md.reset(new memory::desc({ { n, ic, ih, iw } }, mpcsn, memory::format::nchw));
+    fwd_usr_mpd.reset(new memory::primitive_desc(*fwd_usr_input_md, cpu_engine));
+    /* auto pmfmt = ((__builtin_cpu_supports("avx2")) || */
+    /*                (__builtin_cpu_supports("avx"))) ? */
+    /*                memory::format::nChw8c : memory::format::nChw16c; */
+    if (ic % 8 == 0) {
+      auto pmfmt = memory::format::nChw8c;
+      fwd_prv_input_md.reset(new memory::desc({ { n, ic, ih, iw } }, mpcsn, pmfmt));
+      fwd_prv_mpd.reset(new memory::primitive_desc(*fwd_prv_input_md, cpu_engine));
+    } else {
+      fwd_prv_input_md = nullptr;
+      fwd_prv_mpd = nullptr;
+    }
+  }
+  void initFwd(const std::vector<TBlob> &in_data) {
+    void * bottom_data =
+      const_cast<Dtype*>(mkl_prv_data<Dtype>(in_data[batchnorm::kData]));
+    // ---- Initialize memory descriptors -------------
+    std::shared_ptr<memory::primitive_desc> usr_mpd(NULL);
+    if (bottom_data != NULL) {
+      std::shared_ptr<MKLDNNData<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(in_data[batchnorm::kData]);
+      CHECK(mem_descr != NULL);
+      fwd_bottom_data = mem_descr;
+      input_md.reset(new memory::desc(mem_descr->prv_memory_pd()->desc()));
+      usr_mpd = mem_descr->usr_memory_pd();
+      prv_mpd = mem_descr->prv_memory_pd();
+    } else {
+      if (fwd_prv_input_md != nullptr)
+      input_md = fwd_prv_input_md;
+      else
+        input_md = fwd_usr_input_md;
+      usr_mpd = fwd_usr_mpd;
+      prv_mpd = fwd_prv_mpd;
+      fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd));
+      fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+    }
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    // ---- Initialize BatchNorm primitive descriptor -------------
+    batch_normalization_forward::desc BatchNormFwdInference_desc(prop_kind::forward_scoring,
+      *input_md, eps_, mkldnn_use_global_stats | mkldnn_use_scaleshift);
+    batch_normalization_forward::desc BatchNormFwdTraining_desc(prop_kind::forward_training,
+      *input_md, eps_, mkldnn_use_scaleshift);
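+    // Two primitive descriptors are kept: forward_scoring with use_global_stats for
+    // inference, and forward_training which computes per-batch mean/variance; Forward()
+    // picks between them based on ctx.is_train.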
+
+    fwd_inference_pd.reset(
+      new batch_normalization_forward::primitive_desc(BatchNormFwdInference_desc, cpu_engine));
+    fwd_training_pd.reset(
+      new batch_normalization_forward::primitive_desc(BatchNormFwdTraining_desc, cpu_engine));
+
+    fwd_top_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd));
+    fwd_top_data->name = "fwd_top_data   @ " + this->getName();
+
+    weight_memory.reset(new memory(fwd_inference_pd->weights_primitive_desc()));
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+      using namespace mshadow;
+      using namespace mshadow::expr;
+      CHECK_EQ(in_data.size(), 3);
+      CHECK_EQ(aux_states.size(), 2);
+      if (ctx.is_train) {
+        CHECK_EQ(out_data.size(), 3);
+        CHECK_EQ(req.size(), 3);
+      } else {
+        CHECK_GE(out_data.size(), 1);
+        CHECK_GE(req.size(), 1);
+        CHECK_EQ(req[batchnorm::kOut], kWriteTo);
+      }
+      Stream<xpu> *s = ctx.get_stream<xpu>();
+      Tensor<xpu, 4, Dtype>  data;
+      Tensor<xpu, 4, Dtype>  out;
+      if (in_data[batchnorm::kData].ndim() == 2) {
+        Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0],
+                                 in_data[batchnorm::kData].shape_[1], 1, 1);
+        data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          in_data[batchnorm::kData], dshape, s);
+        out = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          out_data[batchnorm::kOut], dshape, s);
+      } else {
+        data = mkl_experimental_direct_get<xpu, 4, Dtype>(in_data[batchnorm::kData], s);
+        out = mkl_experimental_direct_get<xpu, 4, Dtype>(out_data[batchnorm::kOut], s);
+      }
+      Tensor<xpu, 1, Dtype> slope = in_data[batchnorm::kGamma].get<xpu, 1, Dtype>(s);
+      Tensor<xpu, 1, Dtype> bias = in_data[batchnorm::kBeta].get<xpu, 1, Dtype>(s);
+      mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+      if (param_.fix_gamma)
+        slope = 1.f;
+
+      if (fwd_inference_pd == NULL) {
+        LayerSetUp(data, out);
+        initFwd(in_data);
+      }
+
+      // Pack gamma (scale) and beta (shift) into MKL-DNN's combined scale-shift buffer:
+      // the first channels_ entries hold the scale, the next channels_ entries the shift.
+      Dtype* scaleShift_buf = reinterpret_cast<Dtype *>(weight_memory->get_data_handle());
+      for (int i = 0; i < channels_; i++) {
+        scaleShift_buf[i] = (slope.dptr_)[i];
+      }
+      for (int i = 0; i < channels_; i++) {
+        scaleShift_buf[channels_ + i] = (bias.dptr_)[i];
+      }
+
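+    // MKL-DNN primitives are created once on the first forward pass; later calls only
+    // re-sync the user-space pointers into the already created memories.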
+    if (!init_mkldnn_) {
+      init_mkldnn_ = true;
+      fwd_input_primitive = fwd_bottom_data->get_converted_prv(data.dptr_, false,
+        in_data[batchnorm::kData]);
+      fwd_output_memory = fwd_top_data->create_output_memory(out.dptr_,
+        out_data[batchnorm::kOut], fwd_top_data);
+      if (ctx.is_train && !param_.use_global_stats) {
+        Tensor<xpu, 1, Dtype> mean = out_data[batchnorm::kMean].get<xpu, 1, Dtype>(s);
+        Tensor<xpu, 1, Dtype> var = out_data[batchnorm::kVar].get<xpu, 1, Dtype>(s);
+        CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo);
+        CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo);
+        mean_memory.reset(new memory(fwd_training_pd->mean_primitive_desc(), mean.dptr_));
+        var_memory.reset(new memory(fwd_training_pd->variance_primitive_desc(), var.dptr_));
+      } else {
+        Tensor<xpu, 1, Dtype> moving_mean =
+          aux_states[batchnorm::kMovingMean].get<xpu, 1, Dtype>(s);
+        Tensor<xpu, 1, Dtype> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, Dtype>(s);
+        mean_memory.reset(new memory(fwd_inference_pd->mean_primitive_desc(),
+          moving_mean.dptr_));
+        var_memory.reset(new memory(fwd_inference_pd->variance_primitive_desc(),
+          moving_var.dptr_));
+      }
+      // ---- Create BatchNorm --------------------
+      if (ctx.is_train) {
+        BatchNormFwd.reset(new batch_normalization_forward(*fwd_training_pd,
+          *fwd_input_primitive, *weight_memory, *fwd_output_memory, *mean_memory, *var_memory));
+      } else {
+        BatchNormFwd.reset(new batch_normalization_forward(*fwd_inference_pd,
+          *fwd_input_primitive, (const primitive::at)*mean_memory, (const primitive::at)*var_memory,
+          *weight_memory, *fwd_output_memory));
+      }
+      } else {
+        fwd_bottom_data->sync_converted_prv(data.dptr_, false,
+          in_data[batchnorm::kData]);
+        fwd_top_data->sync_output_memory(
+          out_data[batchnorm::kOut], fwd_top_data);
+      }
+      BatchNormFwd.submit();
+  }
+  void InitBatchNormBwd(const std::vector<TBlob> &out_grad) {
+    int32_t n = this->num_;
+    int32_t w = this->width_;
+    int32_t h = this->height_;
+    int32_t c = this->channels_;
+
+    unsigned flags = use_scale_shift;
+    if (param_.use_global_stats)
+      flags |= use_global_stats;
+    void * top_diff_data =
+      const_cast<Dtype*>(mkl_prv_data<Dtype>(out_grad[batchnorm::kOut]));
+    bool top_diff_is_prv = (top_diff_data != NULL);
+
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+    // ---- Initialize memory descriptors -------------
+
+    std::shared_ptr<memory::desc> top_diff_md, top_data_md;
+    std::shared_ptr<memory::primitive_desc> usr_diff_mpd(NULL), prv_diff_mpd(NULL);
+    std::shared_ptr<memory::desc> default_md;
+    default_md.reset(new memory::desc({ { n, c, h, w } }, mpcsn, memory::format::nchw));
+    if (top_diff_is_prv) {
+      std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(out_grad[batchnorm::kOut]);
+      usr_diff_mpd = mem_descr->usr_memory_pd();
+      prv_diff_mpd = mem_descr->prv_memory_pd();
+    } else {
+      if (prv_mpd != NULL) prv_diff_mpd = prv_mpd;
+      usr_diff_mpd.reset(new memory::primitive_desc(*default_md, cpu_engine));
+    }
+
+    if (prv_diff_mpd != NULL)
+      top_diff_md.reset(new memory::desc(prv_diff_mpd->desc()));
+    else
+      top_diff_md.reset(new memory::desc(*default_md));
+    batch_normalization_backward::desc BatchNormBwd_desc(prop_kind::backward, *top_diff_md,
+      fwd_output_memory->get_primitive_desc().desc(), eps_, mkldnn_use_scaleshift);
+    bwd_scaleshift_pd.reset(
+      new batch_normalization_backward::primitive_desc(BatchNormBwd_desc, cpu_engine,
+        *fwd_training_pd));
+
+    diff_weight_memory.reset(
+      new memory(bwd_scaleshift_pd->diff_weights_primitive_desc()));
+
+    bwd_bottom_diff.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_diff_mpd));
+    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + this->getName();
+    bwd_top_diff.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_diff_mpd));
+    bwd_top_diff->name = "bwd_top_diff   @ " + this->getName();
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 3);
+    CHECK_EQ(out_data.size(), 3);
+    CHECK_EQ(in_grad.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, Dtype> data, grad, grad_in;
+
+    if (in_data[batchnorm::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0],
+        out_grad[batchnorm::kOut].shape_[1], 1, 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        in_data[batchnorm::kData], dshape, s);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_grad[batchnorm::kOut], dshape, s);
+      grad_in = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        in_grad[batchnorm::kData], dshape, s);
+    } else {
+      data = mkl_experimental_direct_get<xpu, 4, Dtype>(in_data[batchnorm::kData], s);
+      grad = mkl_experimental_direct_get<xpu, 4, Dtype>(out_grad[batchnorm::kOut], s);
+      grad_in = mkl_experimental_direct_get<xpu, 4, Dtype>(in_grad[batchnorm::kData], s);
+    }
+
+    Tensor<xpu, 1, Dtype> gslope = in_grad[batchnorm::kGamma].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> gbias = in_grad[batchnorm::kBeta].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> slope = in_data[batchnorm::kGamma].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> mean = out_data[batchnorm::kMean].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> var = out_data[batchnorm::kVar].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, Dtype>(s);
+    Tensor<xpu, 1, Dtype> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, Dtype>(s);
+
+    if (param_.fix_gamma)
+      slope = 1.f;
+
+    Dtype * mean_dptr = NULL;
+    Dtype * var_dptr = NULL;
+    if (ctx.is_train && !param_.use_global_stats) {
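+      // Update the running statistics with an exponential moving average:
+      //   moving = momentum * moving + (1 - momentum) * batch_statistic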
+      int size = mean.size(0);  // Tensor<xpu, 1, Dtype>
+      float * moving_mean_ptr = reinterpret_cast<float*>(moving_mean.dptr_);
+      float * mean_ptr = reinterpret_cast<float*>(mean.dptr_);
+      float * moving_var_ptr = reinterpret_cast<float*>(moving_var.dptr_);
+      float * var_ptr = reinterpret_cast<float*>(var.dptr_);
+      float minus_mom = (1 - param_.momentum);
+      for (int i = 0; i < size; i++) {
+        moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum
+          + mean_ptr[i] * minus_mom;
+      }
+      for (int i = 0; i < size; i++) {
+        moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum
+          + var_ptr[i] * minus_mom;
+      }
+      mean_dptr = mean.dptr_;
+      var_dptr = var.dptr_;
+    } else {
+      mean_dptr = moving_mean.dptr_;
+      var_dptr = moving_var.dptr_;
+    }
+
+    if (bwd_scaleshift_pd == NULL) {
+      InitBatchNormBwd(out_grad);
+      bmean_memory.reset(new memory(bwd_scaleshift_pd->mean_primitive_desc(), mean_dptr));
+      bvar_memory.reset(new memory(bwd_scaleshift_pd->variance_primitive_desc(), var_dptr));
+      bwd_diff_src_memory = bwd_bottom_diff->create_output_memory(grad_in.dptr_,
+        in_grad[batchnorm::kData], bwd_bottom_diff);
+
+      bwd_diff_dst_memory = bwd_top_diff->get_converted_prv(grad.dptr_,
+        false, out_grad[batchnorm::kOut]);
+
+      BatchNormBwd.reset(new batch_normalization_backward(*bwd_scaleshift_pd,
+        *fwd_input_primitive, *bmean_memory, *bvar_memory,
+        *bwd_diff_dst_memory,
+        *weight_memory, *bwd_diff_src_memory, *diff_weight_memory));
+    } else {
+      bwd_top_diff->sync_converted_prv(grad.dptr_,
+        false, out_grad[batchnorm::kOut]);
+      bwd_bottom_diff->sync_output_memory(
+        in_grad[batchnorm::kData], bwd_bottom_diff);
+    }
+    BatchNormBwd.submit();
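+    // diff_weight_memory uses the same packed layout as the forward scale-shift buffer:
+    // the first channels_ values are d(gamma), the next channels_ values are d(beta).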
+    Dtype * scaleShiftDiff_buf = reinterpret_cast<Dtype*>(diff_weight_memory->get_data_handle());
+    if (!param_.fix_gamma) {
+      // Store ScaleShift blobs
+      Dtype* diff_scale = gslope.dptr_;
+      for (int i = 0; i < channels_; i++) {
+        diff_scale[i] = scaleShiftDiff_buf[i];
+      }
+    } else {
+      int gslope_size = gslope.size(0);
+      float * gslope_ptr = reinterpret_cast<float*>(gslope.dptr_);
+      for (int i = 0; i < gslope_size; i++) {
+        *gslope_ptr++ = 0.0f;
+      }
+    }
+    Dtype* diff_shift = gbias.dptr_;
+    for (int i = 0; i < channels_; i++) {
+      diff_shift[i] = scaleShiftDiff_buf[channels_ + i];
+    }
+  }
+
+ private:
+  BatchNormParam param_;
+  bool init_mkldnn_ = false;
+  std::shared_ptr<memory::desc> fwd_usr_input_md;
+  std::shared_ptr<memory::primitive_desc> fwd_usr_mpd;
+  std::shared_ptr<memory::desc> fwd_prv_input_md;
+  std::shared_ptr<memory::primitive_desc> fwd_prv_mpd, prv_mpd;
+
+  // Forward
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
+  std::shared_ptr<batch_normalization_forward::primitive_desc> fwd_inference_pd;
+  std::shared_ptr<batch_normalization_forward::primitive_desc> fwd_training_pd;
+  MKLDNNPrimitive<Dtype> BatchNormFwd;
+
+  // Backward
+  MKLDNNPrimitive<Dtype> BatchNormBwd;
+  std::shared_ptr<MKLDNNData<Dtype> > bwd_top_diff;
+  std::shared_ptr<MKLDNNData<Dtype> > bwd_bottom_diff;
+  std::shared_ptr<memory::desc> input_md, scaleshift_md;
+  std::shared_ptr<batch_normalization_backward::primitive_desc> bwd_scaleshift_pd;
+  std::shared_ptr<memory> weight_memory;
+  std::shared_ptr<memory> diff_weight_memory;
+  std::shared_ptr<memory> fwd_output_memory;
+  // common
+  int32_t num_, width_, height_, channels_;
+  Dtype eps_;
+  std::shared_ptr<memory> mean_memory, var_memory;
+  std::shared_ptr<memory> fwd_input_primitive;
+  std::shared_ptr<memory> bmean_memory, bvar_memory;
+  std::shared_ptr<memory> bwd_input_primitive;
+  std::shared_ptr<memory> bwd_diff_dst_memory;
+  std::shared_ptr<memory> bwd_diff_src_memory;
+};  // class MKLDNNBatchNormOp
+  template<> int MKLDNNBatchNormOp<cpu, float>::s_id_gen = 1;
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_BATCH_NORM_INL_H_
diff --git a/src/operator/mkl/mkldnn_concat-inl.h b/src/operator/mkl/mkldnn_concat-inl.h
new file mode 100644
index 0000000000..1fb648795e
--- /dev/null
+++ b/src/operator/mkl/mkldnn_concat-inl.h
@@ -0,0 +1,328 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_concat-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_CONCAT_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_CONCAT_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename Dtype>
+class MKLDNNConcatOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  static std::string getName() {
+    std::string name = "MKLDNNConcatOp";
+    return name;
+  }
+  explicit MKLDNNConcatOp(ConcatParam param) : MKLDNNLayer<Dtype>()
+    , size_(param.num_args), dimension_(param.dim), split_channels_(param.num_args) {
+    init_mkldnn_ = false;
+  }
+  virtual ~MKLDNNConcatOp() {
+  }
+
+ private:
+  void LayerSetup(const std::vector<mshadow::Tensor<xpu, 4, Dtype> > &data,
+                  size_t data_shape_size) {
+    for (size_t i = 1; i < size_; ++i) {
+      for (size_t j = 1; j < data_shape_size; ++j) {
+        if (j == dimension_) continue;
+        CHECK_EQ(data[0].shape_[j], data[i].shape_[j]);
+      }
+    }
+    n_ = data[0].shape_[0];
+    c_ = 0;
+    h_ = data[0].shape_[2];
+    w_ = data[0].shape_[3];
+    for (size_t i = 0; i < size_; ++i) {
+      CHECK_EQ(static_cast<int>(data_shape_size), data[i].shape_.kDimension);
+      split_channels_[i] = data[i].shape_[dimension_];
+      c_ += split_channels_[i];
+    }
+  }
+
+  void InitConcatFwd(const OpContext &ctx, const std::vector<TBlob> &in_data,
+                     const std::vector<TBlob> &out_data) {
+
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mtype = memory::data_type::f32;
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::dims output_tz = {n_, c_, h_, w_};
+
+    for (size_t i = 0; i < size_; ++i) {
+      memory::format mfmt = mfmt_nchw;
+      fwd_bottom_data_.push_back(std::shared_ptr<MKLDNNData<Dtype> >());
+      memory::dims input_tz = {n_, (int32_t)split_channels_[i], h_, w_};
+
+      std::shared_ptr<memory::primitive_desc> prv_src_mpd;
+      std::shared_ptr<memory::primitive_desc> usr_src_mpd(
+        new memory::primitive_desc({input_tz, mtype, mfmt_nchw}, cpu_engine));
+
+      if (const_cast<Dtype*>(mkl_prv_data<Dtype>(in_data[i])) != NULL) {
+        std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+          = get_mkldnn_prv_descriptor<Dtype>(in_data[i]);
+        mfmt = static_cast<memory::format>(
+              mem_descr->prv_memory_pd()->desc().data.format);
+        prv_src_mpd.reset(new memory::primitive_desc(
+                {input_tz, mtype, mfmt}, cpu_engine));
+      }
+
+      bottom_data_mpd.push_back(memory::primitive_desc(
+          {input_tz, mtype, mfmt}, cpu_engine));
+
+      fwd_bottom_data_[i].reset(new MKLDNNData<Dtype>(usr_src_mpd, prv_src_mpd));
+    }
+
+    std::shared_ptr<memory::primitive_desc> usr_dst_mpd(new memory::primitive_desc(
+        {output_tz, mtype, mfmt_nchw}, cpu_engine));
+
+    fwd_pd.reset(new concat::primitive_desc(static_cast<int>(dimension_), bottom_data_mpd));
+
+    std::shared_ptr<memory::primitive_desc> prv_dst_mpd(new memory::primitive_desc(
+        fwd_pd->dst_primitive_desc()));
+
+    fwd_top_data_.reset(new MKLDNNData<Dtype>(usr_dst_mpd, prv_dst_mpd));
+
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 4, Dtype> > data(size_);
+    Tensor<xpu, 4, Dtype> out;
+    if (in_data[0].ndim() == 2) {
+      for (size_t i = 0; i < size_; ++i) {
+        Shape<4> dshape = Shape4(in_data[i].shape_[0],
+                                 in_data[i].shape_[1], 1, 1);
+        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          in_data[i], dshape, s);
+      }
+      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
+                               out_data[concat_enum::kOut].shape_[1], 1, 1);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_data[concat_enum::kOut], dshape, s);
+    } else if (in_data[0].ndim() == 3) {
+      for (size_t i = 0; i < size_; ++i) {
+        Shape<4> dshape = Shape4(in_data[i].shape_[0],
+          in_data[i].shape_[1], in_data[i].shape_[2], 1);
+        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          in_data[i], dshape, s);
+      }
+      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
+        out_data[concat_enum::kOut].shape_[1],
+        out_data[concat_enum::kOut].shape_[2], 1);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_data[concat_enum::kOut], dshape, s);
+    } else {
+      for (size_t i = 0; i < size_; ++i) {
+        data[i] = mkl_experimental_direct_get<xpu, 4, Dtype>(in_data[i], s);
+      }
+      out = mkl_experimental_direct_get<xpu, 4, Dtype>(out_data[concat_enum::kOut], s);
+    }
+
+    if (!init_mkldnn_) {
+      init_mkldnn_ = true;
+      LayerSetup(data, 4);
+      InitConcatFwd(ctx, in_data, out_data);
+      for (size_t i = 0; i < size_; ++i) {
+        inputs.push_back(fwd_bottom_data_[i]->get_converted_prv(data[i].dptr_,
+          false, in_data[i]));
+      }
+      for (size_t i = 0; i < inputs.size(); i++) {
+        inputs_at.push_back(*inputs[i]);
+      }
+      output_memory = fwd_top_data_->create_output_memory(
+        out.dptr_, out_data[concat_enum::kOut], fwd_top_data_);
+      concatFwd.reset(new mkldnn::concat(*fwd_pd, inputs_at, *output_memory));
+    } else {
+      for (size_t i = 0; i < size_; ++i) {
+        fwd_bottom_data_[i]->sync_converted_prv(data[i].dptr_, false, in_data[i]);
+      }
+      fwd_top_data_->sync_output_memory(
+        out_data[concat_enum::kOut], fwd_top_data_);
+    }
+    concatFwd.submit();
+  }
+
+  void InitConcatBwd(const std::vector<TBlob> &out_grad,
+                  const std::vector<mshadow::Tensor<xpu, 4, Dtype> > &data,
+                  const mshadow::Tensor<xpu, 4, Dtype> &out) {
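+    // The concat backward pass is expressed as one reorder per input: a view (sub-memory)
+    // of the top-level diff is taken at the running channel offset and reordered into the
+    // corresponding input's diff buffer.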
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mtype = memory::data_type::f32;
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format diff_dst_mfmt = mfmt_nchw;
+    memory::dims input_tz = {n_, c_, h_, w_};
+    memory::dims offsets = {0, 0, 0, 0};
+
+    std::shared_ptr<memory::primitive_desc> prv_diff_dst_mpd;
+    std::shared_ptr<memory::primitive_desc> usr_diff_dst_mpd(
+      new memory::primitive_desc({input_tz, mtype, mfmt_nchw},
+        cpu_engine));
+
+    bool top_diff_is_prv =
+      (const_cast<Dtype*>(mkl_prv_data<Dtype>(out_grad[concat_enum::kOut])) != NULL);
+    if (top_diff_is_prv) {
+        std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+          = get_mkldnn_prv_descriptor<Dtype>(out_grad[concat_enum::kOut]);
+        diff_dst_mfmt = static_cast<memory::format>(
+            mem_descr->prv_memory_pd()->desc().data.format);
+        prv_diff_dst_mpd.reset(new memory::primitive_desc(
+              {input_tz, mtype, diff_dst_mfmt}, cpu_engine));
+    }
+
+    bwd_top_diff_.reset(new MKLDNNData<Dtype>(usr_diff_dst_mpd, prv_diff_dst_mpd));
+
+    for (size_t i = 0; i < size_; ++i) {
+      bwd_bottom_diff_.push_back(std::shared_ptr<MKLDNNData<Dtype> >());
+      bwd_pd.push_back(std::shared_ptr<reorder::primitive_desc>());
+      memory::dims dims = {n_, (int32_t)split_channels_[i], h_, w_};
+      std::shared_ptr<memory::primitive_desc> usr_diff_src_mpd(
+        new memory::primitive_desc({dims, mtype, mfmt_nchw},
+            cpu_engine));
+      std::shared_ptr<memory::primitive_desc> prv_diff_src_mpd(
+        new memory::primitive_desc({dims, mtype, diff_dst_mfmt},
+            cpu_engine));
+      bwd_bottom_diff_[i].reset(new MKLDNNData<Dtype>(
+            usr_diff_src_mpd, prv_diff_src_mpd));
+
+      auto view_pd = top_diff_is_prv ?
+        view::primitive_desc(*prv_diff_dst_mpd, dims, offsets) :
+        view::primitive_desc(*usr_diff_dst_mpd, dims, offsets);
+      auto view_dst_pd = view_pd.dst_primitive_desc();
+      bwd_pd[i].reset(new reorder::primitive_desc(view_dst_pd, *prv_diff_src_mpd));
+      offsets[dimension_] += split_channels_[i];
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 4, Dtype> > grad_in(size_);
+    Tensor<xpu, 4, Dtype> grad;
+    if (in_grad[0].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
+        out_grad[concat_enum::kOut].shape_[1], 1, 1);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_grad[concat_enum::kOut], dshape, s);
+      for (size_t i = 0; i < size_; ++i) {
+        dshape = Shape4(in_grad[i].shape_[0],
+          in_grad[i].shape_[1], 1, 1);
+        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          in_grad[i], dshape, s);
+      }
+    } else if (in_grad[0].ndim() == 3) {
+      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
+        out_grad[concat_enum::kOut].shape_[1],
+        out_grad[concat_enum::kOut].shape_[2], 1);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_grad[concat_enum::kOut], dshape, s);
+      for (size_t i = 0; i < size_; ++i) {
+        dshape = Shape4(in_grad[i].shape_[0],
+          in_grad[i].shape_[1], in_grad[i].shape_[2], 1);
+        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+          in_grad[i], dshape, s);
+      }
+    } else {
+      grad = mkl_experimental_direct_get<xpu, 4, Dtype>(out_grad[concat_enum::kOut], s);
+      for (size_t i = 0; i < size_; ++i) {
+        grad_in[i] = mkl_experimental_direct_get<xpu, 4, Dtype>(in_grad[i], s);
+      }
+    }
+
+    int need_bwd = 0;
+    for (size_t n = 0; n < size_; n++) {
+      need_bwd += req[n];
+    }
+    if (!need_bwd) {
+      return;
+    }
+
+    if (bwd_pd.empty()) {
+      InitConcatBwd(out_grad, grad_in, grad);
+    }
+
+    for (size_t i = 0; i < size_; ++i) {
+      std::shared_ptr<memory> bwd_reorder_input_memory =
+        bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[concat_enum::kOut]);
+      std::shared_ptr<memory> bwd_reorder_output_memory =
+        bwd_bottom_diff_[i]->create_output_memory(grad_in[i].dptr_, in_grad[i],
+          bwd_bottom_diff_[i]);
+
+      MKLDNNPrimitive<Dtype> concatBwd;
+      concatBwd.reset(
+        new reorder(*bwd_pd[i], *bwd_reorder_input_memory, *bwd_reorder_output_memory));
+      concatBwd.submit();
+    }
+  }
+
+ private:
+  int32_t n_, c_, h_, w_;
+  size_t size_;
+  size_t dimension_;
+  bool init_mkldnn_;
+  std::vector<size_t> split_channels_;
+
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_top_data_;
+  std::vector< std::shared_ptr<MKLDNNData<Dtype> > > fwd_bottom_data_;
+  std::shared_ptr<MKLDNNData<Dtype> > bwd_top_diff_;
+  std::vector< std::shared_ptr<MKLDNNData<Dtype> > > bwd_bottom_diff_;
+  MKLDNNPrimitive<Dtype> concatFwd;
+  std::vector<std::shared_ptr<memory>> inputs;
+  std::vector<primitive::at> inputs_at;
+  std::vector<memory::primitive_desc> bottom_data_mpd;
+  std::shared_ptr<memory::desc> top_data_md;
+  std::shared_ptr<memory> output_memory;
+  std::shared_ptr<concat::primitive_desc> fwd_pd;
+  std::vector<std::shared_ptr<mkldnn::reorder::primitive_desc>> bwd_pd;
+};  // class MKLDNNConcatOp
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_CONCAT_INL_H_
diff --git a/src/operator/mkl/mkldnn_convolution-inl.h b/src/operator/mkl/mkldnn_convolution-inl.h
new file mode 100644
index 0000000000..d4c0667bb2
--- /dev/null
+++ b/src/operator/mkl/mkldnn_convolution-inl.h
@@ -0,0 +1,504 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_convolution-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_CONVOLUTION_INL_H_
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "mkl_conv-common-inl.h"
+#include "mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLDNNConvolutionOp : public Operator, public MKLDNNLayer<DType>,
+  public MKLConvCommon<xpu, DType> {
+ public:
+  std::string getName() {
+    std::string name = "MKLDNNConvolutionOp";
+    return name;
+  }
+  explicit MKLDNNConvolutionOp(ConvolutionParam p)
+    : MKLDNNLayer<DType>()
+    , fwd_bottom_data(NULL), fwd_top_data(NULL), fwd_weights_data(NULL), fwd_bias_data(NULL)
+    , convFwd_pd(NULL)
+    , convBwdData_pd(NULL), convBwdWeights_pd(NULL) {
+    this->param_ = p;
+    param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    b_init_conv = false;
+  }
+
+  virtual ~MKLDNNConvolutionOp() {
+  }
+  void init_properties(const mshadow::Tensor<xpu, 4, DType> &data,
+    const mshadow::Tensor<xpu, 4, DType> &out) {
+    this->stride_w_ = param_.stride[1];
+    this->stride_h_ = param_.stride[0];
+    this->width_ = data.shape_[3];
+    this->height_ = data.shape_[2];
+    this->pad_w_ = param_.pad[1];
+    this->pad_h_ = param_.pad[0];
+    this->kernel_w_ = param_.kernel[1];
+    this->kernel_h_ = param_.kernel[0];
+    this->channels_ = data.shape_[1];
+    this->num_ = data.shape_[0];
+    this->group_ = param_.num_group;
+    this->width_out_ = out.shape_[3];
+    this->height_out_ = out.shape_[2];
+    this->channel_output_ = out.shape_[1];
+  }
+ private:
+  void InitForward(const OpContext &ctx) {
+      auto propagation =
+        (!ctx.is_train) ? prop_kind::forward_scoring : prop_kind::forward_training;
+
+      int32_t g = std::max(this->group_, 1);
+      int32_t n = this->num_;
+      int32_t iw = this->width_;
+      int32_t ih = this->height_;
+      int32_t ic = this->channels_;
+
+      int32_t ow = this->width_out_;
+      int32_t oh = this->height_out_;
+      int32_t oc = this->channel_output_;
+
+      int32_t kw = this->kernel_w_;
+      int32_t kh = this->kernel_h_;
+      memory::dims convolutionStrides{ static_cast<int>(this->stride_h_),
+        static_cast<int>(this->stride_w_) };
+      memory::dims padding{ this->pad_h_, this->pad_w_ };
+
+      memory::data_type mpcsn = memory::data_type::f32;
+      memory::format mfmt_any = memory::format::any;
+      mkldnn::engine cpu_engine = mxnet::CpuEngine::Instance().get_engine();
+
+      memory::dims bottom_tz = { n, ic, ih, iw };
+      memory::dims bias_tz = { oc };
+      memory::dims top_tz = { n, oc, oh, ow };
+      memory::dims weights_tz =
+        (g != 1) ? memory::dims{ g, oc / g, ic / g, kh, kw } : memory::dims{ oc, ic, kh, kw };
+
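+      // format::any lets MKL-DNN choose the layouts; the primitive descriptor created
+      // below then reports the layouts actually selected, and the MKLDNNData wrappers
+      // reorder from the user nchw/oihw buffers only when needed.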
+      memory::desc init_bottom_md({ bottom_tz }, mpcsn, mfmt_any);
+      memory::desc init_bias_md({ bias_tz }, mpcsn, mfmt_any);
+      memory::desc init_top_md({ top_tz }, mpcsn, mfmt_any);
+      memory::desc init_weights_md({ weights_tz }, mpcsn, mfmt_any);
+
+      // ---- Initialize convolution primitive descriptor
+      std::shared_ptr<convolution_forward::desc> convFwd_desc;
+      if (!this->param_.no_bias) {
+        convFwd_desc.reset(
+          new convolution_forward::desc(propagation, algorithm::convolution_direct
+          , init_bottom_md, init_weights_md, init_bias_md, init_top_md
+          , convolutionStrides, padding, padding, padding_kind::zero));
+      } else {
+        convFwd_desc.reset(
+          new convolution_forward::desc(propagation, algorithm::convolution_direct
+          , init_bottom_md, init_weights_md, init_top_md
+          , convolutionStrides, padding, padding, padding_kind::zero));
+      }
+      convFwd_pd.reset(new convolution_forward::primitive_desc(*convFwd_desc, cpu_engine));
+      CHECK(convFwd_pd);
+      // ---- Create priv memory primitive descriptors stored as class members -------------
+      typedef typename memory::primitive_desc MemPD;
+      std::shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(
+        new MemPD(convFwd_pd->src_primitive_desc()));
+      std::shared_ptr<MemPD> prv_fwd_top_data_memory_pd(
+        new MemPD(convFwd_pd->dst_primitive_desc()));
+      std::shared_ptr<MemPD> prv_fwd_weights_data_memory_pd(
+        new MemPD(convFwd_pd->weights_primitive_desc()));
+
+      // ---- Create usr memory primitive descriptors -------------
+      memory::format mfmt_nchw = memory::format::nchw;
+      memory::format weights_mfmt = (g != 1) ? memory::format::goihw : memory::format::oihw;
+
+      std::shared_ptr<MemPD> usr_bottom_data_memory_pd(
+        new MemPD({ { bottom_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+      std::shared_ptr<MemPD> usr_bias_data_memory_pd(
+        new MemPD({ { bias_tz }, mpcsn, memory::format::x }, cpu_engine));
+      std::shared_ptr<MemPD> usr_top_data_memory_pd(
+        new MemPD({ { top_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+      std::shared_ptr<MemPD> usr_weights_data_memory_pd(
+        new MemPD({ { weights_tz }, mpcsn, weights_mfmt }, cpu_engine));
+
+
+      // ---  init primitive and prv_memory descriptors ----------------------
+      fwd_bottom_data.reset(
+        new MKLDNNData<DType>(usr_bottom_data_memory_pd, prv_fwd_bottom_data_memory_pd));
+      fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+      fwd_top_data.reset(
+        new MKLDNNData<DType>(usr_top_data_memory_pd, prv_fwd_top_data_memory_pd));
+      fwd_top_data->name = "fwd_top_data      @ " + this->getName();
+      fwd_weights_data.reset(
+        new MKLDNNData<DType>(usr_weights_data_memory_pd, prv_fwd_weights_data_memory_pd));
+      fwd_weights_data->name = "fwd_weights_data  @ " + this->getName();
+      if (!this->param_.no_bias) {
+        std::shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(
+          new MemPD(convFwd_pd->bias_primitive_desc()));
+        fwd_bias_data.reset(
+          new MKLDNNData<DType>(usr_bias_data_memory_pd, prv_fwd_bias_data_memory_pd));
+        fwd_bias_data->name = "fwd_bias_data     @ " + this->getName();
+      }
+  }
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+        using namespace mshadow;
+        using namespace mshadow::expr;
+        CHECK_EQ(req[conv::kOut], kWriteTo);
+        // size_t expected = this->param_.no_bias ? 2 : 3;
+        // CHECK_EQ(in_data.size(), expected);
+        CHECK_EQ(out_data.size(), 1);
+        Stream<xpu> *s = ctx.get_stream<xpu>();
+        Tensor<xpu, 4, DType> data =
+            mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
+        Tensor<xpu, 4, DType> out =
+            mkl_experimental_direct_get<xpu, 4, DType>(out_data[conv::kOut], s);
+        Tensor<xpu, 4, DType> wmat =
+            mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kWeight], s);
+        CHECK_EQ(data.CheckContiguous(), true);
+        CHECK_EQ(wmat.CheckContiguous(), true);
+        CHECK_EQ(out.CheckContiguous(), true);
+        DType *data_ptr = data.dptr_;
+        DType *wmat_ptr = wmat.dptr_;
+        DType *out_ptr = out.dptr_;
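+      // The convolution primitive is built lazily on the first forward call; subsequent
+      // calls only re-sync the user buffers into the cached MKL-DNN memories.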
+      if (convFwd_pd == NULL) {
+        if (!b_init_conv) {
+          this->init_properties(data, out);
+          this->b_init_conv = true;
+        }
+
+        InitForward(ctx);
+          // ---  init primitive and prv_memory descriptors ---------
+        fwd_bottom_data_primitive =
+          fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
+        fwd_weights_data_primitive = fwd_weights_data->get_converted_prv(wmat_ptr, true,
+          in_data[conv::kWeight]);
+        if (!this->param_.no_bias) {
+          Tensor<xpu, 1, DType> bias =
+            mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
+          fwd_bias_data_primitive =
+            fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
+        }
+        fwd_top_data_memory = fwd_top_data->create_output_memory(out_ptr, out_data[conv::kOut],
+          fwd_top_data);
+        if (!this->param_.no_bias) {
+          convFwd.reset(new convolution_forward(*convFwd_pd
+            , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+            , *fwd_bias_data_primitive, *fwd_top_data_memory));
+        } else {
+          convFwd.reset(new convolution_forward(*convFwd_pd
+            , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+            , *fwd_top_data_memory));
+        }
+      } else {
+          fwd_bottom_data->sync_converted_prv(data_ptr, false, in_data[conv::kData]);
+          fwd_weights_data->sync_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
+          if (!this->param_.no_bias) {
+              Tensor<xpu, 1, DType> bias =
+                mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
+              fwd_bias_data->sync_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
+          }
+          fwd_top_data->sync_output_memory(out_data[conv::kOut],
+            fwd_top_data);
+      }
+      convFwd.submit();
+  }
+  void InitConvolutionBwd(const OpContext &ctx,
+    const std::vector<TBlob> &out_grad,
+    const std::vector<TBlob> &in_data,
+    const std::vector<TBlob> &in_grad) {
+    int32_t g = std::max(this->group_, 1);
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+
+    int32_t ow = this->width_out_;
+    int32_t oh = this->height_out_;
+    int32_t oc = this->channel_output_;
+
+    int32_t kw = this->kernel_w_;
+    int32_t kh = this->kernel_h_;
+    memory::dims convolutionStrides{ static_cast<int>(this->stride_h_),
+      static_cast<int>(this->stride_w_) };
+    memory::dims padding{ this->pad_h_, this->pad_w_ };
+
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::format mfmt_any = memory::format::any;
+
+    memory::dims bottom_tz = { n, ic, ih, iw };
+    memory::dims bias_tz = { oc };
+    memory::dims top_tz = { n, oc, oh, ow };
+    memory::dims weights_tz =
+      (g != 1) ? memory::dims{ g, oc / g, ic / g, kh, kw } : memory::dims{ oc, ic, kh, kw };
+    memory::desc init_bottom_md({ bottom_tz }, mpcsn, mfmt_any);
+    memory::desc init_bias_md({ bias_tz }, mpcsn, mfmt_any);
+    memory::desc init_top_md({ top_tz }, mpcsn, mfmt_any);
+    memory::desc init_weights_md({ weights_tz }, mpcsn, mfmt_any);
+
+    void * top_diff_data =
+      const_cast<DType*>(mkl_prv_data<DType>(out_grad[0]));
+    std::shared_ptr<MKLDNNMemoryDescriptor<DType> > mem_descr
+      = get_mkldnn_prv_descriptor<DType>(out_grad[0]);
+    // ---- Initialize convolution primitive descriptor -------------
+    std::shared_ptr<convolution_backward_data::desc> convBwdData_desc;
+    std::shared_ptr<convolution_backward_weights::desc> convBwdWeights_desc;
+    if (!this->param_.no_bias) {
+      convBwdWeights_desc.reset(
+        new convolution_backward_weights::desc(algorithm::convolution_direct
+        , init_bottom_md, init_weights_md, init_bias_md, init_top_md
+        , convolutionStrides, padding, padding, padding_kind::zero));
+    } else {
+      convBwdWeights_desc.reset(
+        new convolution_backward_weights::desc(algorithm::convolution_direct
+        , init_bottom_md, init_weights_md, init_top_md
+        , convolutionStrides, padding, padding, padding_kind::zero));
+    }
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    convBwdData_desc.reset(
+      new convolution_backward_data::desc(algorithm::convolution_direct
+      , init_bottom_md, init_weights_md, init_top_md
+      , convolutionStrides, padding, padding, padding_kind::zero));
+    convBwdData_pd.reset(
+      new convolution_backward_data::primitive_desc(*convBwdData_desc,
+      cpu_engine, *convFwd_pd));
+
+    convBwdWeights_pd.reset(
+      new convolution_backward_weights::primitive_desc(*convBwdWeights_desc,
+      cpu_engine, *convFwd_pd));
+
+
+    // ---- Create priv memory primitive descriptors stored as class members -------------
+    typedef typename memory::primitive_desc MemPD;
+
+    std::shared_ptr<MemPD> prv_bwdd_bottom_diff_memory_pd(
+      new MemPD(convBwdData_pd->diff_src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdd_top_diff_memory_pd(
+      new MemPD(convBwdData_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdd_weights_data_memory_pd(
+      new MemPD(convBwdData_pd->weights_primitive_desc()));
+
+    std::shared_ptr<MemPD> prv_bwdw_bottom_data_memory_pd(
+      new MemPD(convBwdWeights_pd->src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdw_top_diff_memory_pd(
+      new MemPD(convBwdWeights_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdw_weights_diff_memory_pd(
+      new MemPD(convBwdWeights_pd->diff_weights_primitive_desc()));
+
+    // ---- Create usr memory primitive descriptors -------------
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format weights_mfmt = (g != 1) ? memory::format::goihw : memory::format::oihw;
+
+    // Question: can we reuse the usr memory primitive descriptors for the backward pass?
+    std::shared_ptr<MemPD> usr_bottom_data_memory_pd(
+      new MemPD({ { bottom_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    std::shared_ptr<MemPD> usr_bias_data_memory_pd(
+      new MemPD({ { bias_tz }, mpcsn, memory::format::x }, cpu_engine));
+    std::shared_ptr<MemPD> usr_top_data_memory_pd(
+      new MemPD({ { top_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    std::shared_ptr<MemPD> usr_weights_data_memory_pd(
+      new MemPD({ { weights_tz }, mpcsn, weights_mfmt }, cpu_engine));
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    bwdd_bottom_diff.reset(
+      new MKLDNNData<DType>(usr_bottom_data_memory_pd, prv_bwdd_bottom_diff_memory_pd));
+    bwdd_bottom_diff->name = "bwdd_bottom_diff   @ " + this->getName();
+    bwdw_bottom_data.reset(
+      new MKLDNNData<DType>(usr_bottom_data_memory_pd, prv_bwdw_bottom_data_memory_pd));
+    bwdw_bottom_data->name = "bwdw_bottom_data   @ " + this->getName();
+
+    bwdd_top_diff.reset(
+      new MKLDNNData<DType>(usr_top_data_memory_pd, prv_bwdd_top_diff_memory_pd));
+    bwdd_top_diff->name = "bwdd_top_diff      @ " + this->getName();
+    bwdw_top_diff.reset(
+      new MKLDNNData<DType>(usr_top_data_memory_pd, prv_bwdw_top_diff_memory_pd));
+    bwdw_top_diff->name = "bwdw_top_diff      @ " + this->getName();
+    bwdd_weights_data.reset(
+      new MKLDNNData<DType>(usr_weights_data_memory_pd, prv_bwdd_weights_data_memory_pd));
+    bwdd_weights_data->name = "bwdd_weights_data  @ " + this->getName();
+    bwdw_weights_diff.reset(
+      new MKLDNNData<DType>(usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_pd));
+    bwdw_weights_diff->name = "bwdw_weights_diff  @ " + this->getName();
+    if (!this->param_.no_bias) {
+      std::shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(
+        new MemPD(convBwdWeights_pd->diff_bias_primitive_desc()));
+      bwdw_bias_diff.reset(
+        new MKLDNNData<DType>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd));
+      bwdw_bias_diff->name = "bwdw_bias_diff     @ " + this->getName();
+    }
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "Volume convolution is not implemented in mshadow";
+    }
+    CHECK_EQ(out_grad.size(), 1);
+    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
+    Shape<3> wmat_shape =
+      Shape3(param_.num_group,
+        param_.num_filter / param_.num_group,
+        data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+    Tensor<xpu, 3, DType> wmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+        in_data[conv::kWeight], wmat_shape, s);
+    Tensor<xpu, 4, DType> grad =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[conv::kOut], s);
+    Tensor<xpu, 4, DType> gdata =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[conv::kData], s);
+    Tensor<xpu, 3, DType> gwmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+        in_grad[conv::kWeight], wmat_shape, s);
+
+    if (!b_init_conv) {
+      this->init_properties(data, grad);
+      b_init_conv = true;
+    }
+    if (convBwdData_pd == NULL) {
+      this->InitConvolutionBwd(ctx, out_grad, in_data, in_grad);
+    }
+
+
+    // ---  init primitive and prv_memory descriptors ---------
+    if (req[0]) {
+      Storage::Handle addtoWorkspace;
+      if (req[0] == kAddTo) {
+          // workaround until MKL-DNN supports the kAddTo request type directly
+          this->AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(),
+            &addtoWorkspace);
+      }
+      if (convBwdData.aprimitive != NULL) {
+        bwdd_top_diff->sync_converted_prv(grad.dptr_, false, out_grad[conv::kOut]);
+        bwdd_weights_data->sync_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]);
+        bwdd_bottom_diff->sync_output_memory(in_grad[conv::kData], bwdd_bottom_diff);
+      } else {
+        bwdd_top_diff_primitive = bwdd_top_diff->get_converted_prv(grad.dptr_, false,
+          out_grad[conv::kOut]);
+        bwdd_weights_data_primitive = bwdd_weights_data->get_converted_prv(wmat.dptr_, false,
+          in_data[conv::kWeight]);
+        bwdd_bottom_diff_memory = bwdd_bottom_diff->create_output_memory(gdata.dptr_,
+          in_grad[conv::kData], bwdd_bottom_diff);
+
+        convBwdData.reset(new convolution_backward_data(*convBwdData_pd
+          , *bwdd_top_diff_primitive, *bwdd_weights_data_primitive
+          , *bwdd_bottom_diff_memory));
+      }
+      convBwdData.submit();
+      if (req[0] == kAddTo) {
+        if (bwdd_bottom_diff->conversion_needed()) {
+          bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
+        }
+        this->AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_,
+          in_grad[conv::kData].Size());
+      }
+    }
+    if (req[1]) {
+      Storage::Handle addtoWorkspace;
+      if (req[1] == kAddTo) {
+        // workaround until MKL-DNN supports the kAddTo request type directly
+        this->AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(),
+          &addtoWorkspace);
+      }
+      if (convBwdWeights.aprimitive == NULL) {
+          bwdw_top_diff_primitive = bwdw_top_diff->get_converted_prv(grad.dptr_, false,
+            out_grad[conv::kOut]);
+          bwdw_bottom_data_primitive = bwdw_bottom_data->get_converted_prv(data.dptr_, false,
+            in_data[conv::kData]);
+
+          bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory(gwmat.dptr_,
+            in_grad[conv::kWeight], bwdw_weights_diff);
+          if (!this->param_.no_bias) {
+            Tensor<xpu, 1, DType> gbias =
+              mkl_experimental_direct_get<xpu, 1, DType>(in_grad[conv::kBias], s);
+            bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory(gbias.dptr_,
+              in_grad[conv::kBias], bwdw_bias_diff);
+
+            convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+              , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+              , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
+
+          } else {
+            convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+              , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+              , *bwdw_weights_diff_memory));
+          }
+      } else {
+        bwdw_top_diff->sync_converted_prv(grad.dptr_, false, out_grad[conv::kOut]);
+        bwdw_bottom_data->sync_converted_prv(data.dptr_, false, in_data[conv::kData]);
+        bwdw_weights_diff->sync_output_memory(in_grad[conv::kWeight], bwdw_weights_diff);
+        if (!this->param_.no_bias)
+          bwdw_bias_diff->sync_output_memory(in_grad[conv::kBias], bwdw_bias_diff);
+      }
+      convBwdWeights.submit();
+      if (req[1] == kAddTo) {
+        if (bwdw_weights_diff->conversion_needed()) {
+          bwdw_weights_diff->convert_from_prv(gwmat.dptr_);
+        }
+        this->AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_,
+          in_grad[conv::kWeight].Size());
+      }
+    }
+  }
+
+ private:
+  std::shared_ptr<MKLDNNData<DType> > fwd_bottom_data, fwd_top_data,
+    fwd_weights_data, fwd_bias_data,
+    bwdd_weights_data, bwdw_bottom_data;
+  std::shared_ptr<MKLDNNData<DType> > bwdd_bottom_diff, bwdd_top_diff,
+    bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
+  std::shared_ptr<convolution_forward::primitive_desc> convFwd_pd;
+  MKLDNNPrimitive<DType> convFwd;
+  std::shared_ptr<convolution_backward_data::primitive_desc> convBwdData_pd;
+  std::shared_ptr<convolution_backward_weights::primitive_desc> convBwdWeights_pd;
+  MKLDNNPrimitive<DType> convBwdData, convBwdWeights;
+  ConvolutionParam param_;
+  bool b_init_conv;
+  memory::dims input_tz;
+  memory::dims bias_tz;
+  memory::dims output_tz;
+  memory::dims weights_tz;
+  std::shared_ptr<memory> fwd_bottom_data_primitive,
+    fwd_weights_data_primitive, fwd_bias_data_primitive;
+  std::shared_ptr<memory> fwd_top_data_memory;
+  std::shared_ptr<memory> bwdd_top_diff_primitive, bwdd_weights_data_primitive,
+    bwdd_diff_src_primitive;
+  std::shared_ptr<memory> bwdd_bottom_diff_memory;
+  std::shared_ptr<memory> bwdw_bottom_data_primitive, bwdw_top_diff_primitive;
+  std::shared_ptr<memory> bwdw_weights_diff_memory, bwdw_bias_diff_memory;
+};  // class MKLDNNConvolutionOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_CONVOLUTION_INL_H_
diff --git a/src/operator/mkl/mkldnn_deconvolution-inl.h b/src/operator/mkl/mkldnn_deconvolution-inl.h
new file mode 100644
index 0000000000..4741c018fb
--- /dev/null
+++ b/src/operator/mkl/mkldnn_deconvolution-inl.h
@@ -0,0 +1,357 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_deconvolution-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_DECONVOLUTION_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_DECONVOLUTION_INL_H_
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "mkl_conv-common-inl.h"
+#include "mkldnn_base-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLDNNDeConvolutionOp : public Operator, public MKLDNNLayer<DType>,
+  public MKLConvCommon<xpu, DType> {
+ private:
+  static int s_id_gen;
+  int m_id;
+
+ public:
+  std::string getName() {
+    std::string name = "MKLDNNDeConvolutionOp_";
+    name = name + std::to_string(m_id);
+    return name;
+  }
+  explicit MKLDNNDeConvolutionOp(DeconvolutionParam p)
+    : MKLDNNLayer<DType>() {
+    this->param_ = p;
+    this->init_mkldnn_ = false;
+  }
+
+  virtual ~MKLDNNDeConvolutionOp() {
+  }
+  void init_properties(const mshadow::Tensor<xpu, 4, DType> &data,
+    const mshadow::Tensor<xpu, 4, DType> &out) {
+    this->stride_w_ = param_.stride[1];
+    this->stride_h_ = param_.stride[0];
+    this->width_ = data.shape_[3];
+    this->height_ = data.shape_[2];
+    this->pad_w_ = param_.pad[1];
+    this->pad_h_ = param_.pad[0];
+    this->kernel_w_ = param_.kernel[1];
+    this->kernel_h_ = param_.kernel[0];
+    this->channels_ = data.shape_[1];
+    this->num_ = data.shape_[0];
+    this->group_ = param_.num_group;
+    this->width_out_ = out.shape_[3];
+    this->height_out_ = out.shape_[2];
+    this->channel_output_ = out.shape_[1];
+  }
+  void InitDeconvolution(const OpContext &ctx) {
+    typedef typename memory::primitive_desc MemPD;
+    int32_t g = std::max(this->group_, 1);
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+
+    int32_t ow = this->width_out_;
+    int32_t oh = this->height_out_;
+    int32_t oc = this->channel_output_;
+
+    int32_t kw = this->kernel_w_;
+    int32_t kh = this->kernel_h_;
+    memory::dims convolutionStrides{ static_cast<int>(this->stride_h_),
+      static_cast<int>(this->stride_w_) };
+    memory::dims padding{ this->pad_h_, this->pad_w_ };
+
+    // ---- Initialize memory descriptors (format = any) to create the convolution descriptor
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::format mfmt_any = memory::format::any;
+    mkldnn::engine cpu_engine = mxnet::CpuEngine::Instance().get_engine();
+
+    input_tz = { n, ic, ih, iw };
+    bias_tz = { oc };
+    output_tz = { n, oc, oh, ow };
+    weights_tz = (g != 1) ?
+      memory::dims{ g, oc / g, ic / g, kh, kw } : memory::dims{ oc, ic, kh, kw };
+
+    // ---- Memory descriptors for initializing of convolution primitive descriptor
+    memory::desc init_input_md({ input_tz }, mpcsn, mfmt_any);
+    memory::desc init_bias_md({ bias_tz }, mpcsn, mfmt_any);
+    memory::desc init_output_md({ output_tz }, mpcsn, mfmt_any);
+    memory::desc init_weights_md({ weights_tz }, mpcsn, mfmt_any);
+    // ---- Create usr memory primitive descriptors
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format weights_mfmt = (g != 1) ? memory::format::goihw : memory::format::oihw;
+    std::shared_ptr<MemPD> usr_input_mpd(
+      new MemPD({ { input_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    std::shared_ptr<MemPD> usr_output_mpd(
+      new MemPD({ { output_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    std::shared_ptr<MemPD> usr_weights_mpd(
+      new MemPD({ { weights_tz }, mpcsn, weights_mfmt }, cpu_engine));
+    std::shared_ptr<MemPD> usr_bias_mpd(
+      new MemPD({ { bias_tz }, mpcsn, memory::format::x }, cpu_engine));
+
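+    // Deconvolution is mapped onto the regular convolution primitives with the data
+    // paths swapped: forward uses convolution_backward_data, backward-data uses
+    // convolution_forward, and backward-weights uses convolution_backward_weights with
+    // src and dst exchanged. The convolution_forward descriptor below also serves as
+    // the hint for creating the backward primitive descriptors.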
+    // ---- Deconv Backward Data
+    std::shared_ptr<convolution_forward::desc> deconvBwd_desc;
+    deconvBwd_desc.reset(new convolution_forward::desc(prop_kind::forward_training
+      , algorithm::convolution_direct
+      , init_output_md , init_weights_md, init_input_md
+      , convolutionStrides, padding, padding, padding_kind::zero));
+    deconvBwd_pd.reset(new convolution_forward::primitive_desc(*deconvBwd_desc, cpu_engine));
+    CHECK(deconvBwd_pd);
+    std::shared_ptr<MemPD> bwdd_prv_output_mpd(new MemPD(deconvBwd_pd->dst_primitive_desc()));
+    std::shared_ptr<MemPD> bwdd_prv_input_mpd(new MemPD(deconvBwd_pd->src_primitive_desc()));
+    std::shared_ptr<MemPD> bwdd_prv_weights_mpd(new MemPD(deconvBwd_pd->weights_primitive_desc()));
+
+    // ---  init primitive and prv_memory descriptors ---------
+    bwdd_top_diff.reset(new MKLDNNData<DType>(usr_output_mpd, bwdd_prv_input_mpd));
+    bwdd_top_diff->name = "bwdd_top_diff   @ " + this->getName();
+    bwdd_bottom_diff.reset(new MKLDNNData<DType>(usr_input_mpd, bwdd_prv_output_mpd));
+    bwdd_bottom_diff->name = "bwdd_bottom_diff      @ " + this->getName();
+    bwdd_filter_data.reset(new MKLDNNData<DType>(usr_weights_mpd, bwdd_prv_weights_mpd));
+    bwdd_filter_data->name = "bwdd_filter_data  @ " + this->getName();
+    // ---- Deconv Forward Data
+    std::shared_ptr<convolution_backward_data::desc> deconvFwd_desc;
+    deconvFwd_desc.reset(new convolution_backward_data::desc(algorithm::convolution_direct
+      , init_output_md,  init_weights_md, init_input_md
+      , convolutionStrides, padding, padding, padding_kind::zero));
+    deconvFwd_pd.reset(new convolution_backward_data::primitive_desc(
+      *deconvFwd_desc, cpu_engine, *deconvBwd_pd));
+    CHECK(deconvFwd_pd);
+    std::shared_ptr<MemPD> fwd_prv_output_mpd(new MemPD(deconvFwd_pd->diff_src_primitive_desc()));
+    std::shared_ptr<MemPD> fwd_prv_input_mpd(new MemPD(deconvFwd_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> fwd_prv_weights_mpd(new MemPD(deconvFwd_pd->weights_primitive_desc()));
+
+    // ---  init primitive and prv_memory descriptors ---------
+    fwd_bottom_data.reset(new MKLDNNData<DType>(usr_input_mpd, fwd_prv_input_mpd));
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+    fwd_top_data.reset(new MKLDNNData<DType>(usr_output_mpd, fwd_prv_output_mpd));
+    fwd_top_data->name = "fwd_top_data      @ " + this->getName();
+    fwd_filter_data.reset(new MKLDNNData<DType>(usr_weights_mpd, fwd_prv_weights_mpd));
+    fwd_filter_data->name = "fwd_filter_data  @ " + this->getName();
+
+    // ---- Deconv Backward Weights
+    std::shared_ptr<convolution_backward_weights::desc> deconvBwdWeight_desc;
+    if (!this->param_.no_bias) {
+      deconvBwdWeight_desc.reset(new convolution_backward_weights::desc(
+        algorithm::convolution_direct
+        , init_output_md, init_weights_md, init_bias_md, init_input_md
+        , convolutionStrides, padding, padding, padding_kind::zero));
+    } else {
+      deconvBwdWeight_desc.reset(new convolution_backward_weights::desc(
+        algorithm::convolution_direct
+        , init_output_md, init_weights_md, init_input_md
+        , convolutionStrides, padding, padding, padding_kind::zero));
+    }
+    deconvBwdWeight_pd.reset(new convolution_backward_weights::primitive_desc(
+      *deconvBwdWeight_desc, cpu_engine, *deconvBwd_pd));
+    CHECK(deconvBwdWeight_pd);
+
+    std::shared_ptr<MemPD> bwdf_prv_diff_dst_mpd(
+      new MemPD(deconvBwdWeight_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> bwdf_prv_src_mpd(
+      new MemPD(deconvBwdWeight_pd->src_primitive_desc()));
+    std::shared_ptr<MemPD> bwdf_prv_diff_weights_md(
+      new MemPD(deconvBwdWeight_pd->diff_weights_primitive_desc()));
+    bwdf_top_diff.reset(new MKLDNNData<DType>(usr_input_mpd, bwdf_prv_diff_dst_mpd));
+    bwdf_top_diff->name = "bwdf_top_diff      @ " + this->getName();
+
+    bwdf_bottom_data.reset(new MKLDNNData<DType>(usr_output_mpd, bwdf_prv_src_mpd));
+    bwdf_bottom_data->name = "bwdf_bottom_data   @ " + this->getName();
+
+    bwdf_filter_diff.reset(new MKLDNNData<DType>(usr_weights_mpd,
+      bwdf_prv_diff_weights_md));
+    bwdf_filter_diff->name = "bwdf_filter_diff   @ " + this->getName();
+    if (!this->param_.no_bias) {
+      std::shared_ptr<MemPD> bwdf_prv_diff_bias_mpd(
+        new MemPD(deconvBwdWeight_pd->diff_bias_primitive_desc()));
+      // Backward bias gradient setup
+      bwdb_bias_diff.reset(new MKLDNNData<DType>(usr_bias_mpd, bwdf_prv_diff_bias_mpd));
+      bwdb_bias_diff->name = "bwdb_bias_diff   @ " + this->getName();
+    }
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    size_t expected = param_.no_bias ? 2 : 3;
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(out_data.size(), 1);
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[deconv::kData], s);
+    Tensor<xpu, 4, DType> out =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_data[deconv::kOut], s);
+    Tensor<xpu, 4, DType> wmat =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[deconv::kWeight], s);
+    CHECK_EQ(data.CheckContiguous(), true);
+    CHECK_EQ(wmat.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    if (deconvFwd_pd == NULL) {
+      this->init_properties(data, out);
+      InitDeconvolution(ctx);
+    }
+    // Diff Dst => dy => data
+    // Diff Src => dx => out
+    std::shared_ptr<memory> fwd_data_primitive, fwd_weights_primitive, fwd_out_memory;
+    fwd_data_primitive = fwd_bottom_data->get_converted_prv(data.dptr_, true,
+      in_data[deconv::kData]);
+    fwd_weights_primitive = fwd_filter_data->get_converted_prv(wmat.dptr_, false,
+      in_data[deconv::kWeight]);
+    fwd_out_memory = fwd_top_data->create_output_memory(out.dptr_,
+      out_data[deconv::kOut], fwd_top_data);
+    deconvFwd.reset(new convolution_backward_data(*deconvFwd_pd
+      , *fwd_data_primitive, *fwd_weights_primitive
+      , *fwd_out_memory));
+    deconvFwd.submit();
+    if (!param_.no_bias) {
+      // add bias, broadcast bias to dim 1: channel
+      Tensor<xpu, 1, DType> bias = in_data[deconv::kBias].get<xpu, 1, DType>(s);
+      Tensor<xpu, 4, DType> out_cpu = out_data[deconv::kOut].get<xpu, 4, DType>(s);
+      out_cpu += broadcast<1>(bias, out_cpu.shape_);
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    // TODO(bing): check the BLAS Handle, be careful
+    CHECK_EQ(out_grad.size(), 1);
+    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
+    // get data
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[deconv::kData], s);
+    Tensor<xpu, 4, DType> grad =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[deconv::kOut], s);
+    Tensor<xpu, 4, DType> gdata =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[deconv::kData], s);
+    Shape<3> wmat_shape =
+      Shape3(param_.num_group,
+        data.shape_[1] / param_.num_group,
+        param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+
+    Tensor<xpu, 3, DType> wmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+        in_data[deconv::kWeight], wmat_shape, s);
+    Tensor<xpu, 3, DType> gwmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+        in_grad[deconv::kWeight], wmat_shape, s);
+    std::shared_ptr<memory> bwdf_src_primitive, bwdf_diff_dst_primitive;
+    std::shared_ptr<memory> bwdf_diff_weights_memory, bwdd_diff_bias_memory;
+    if (req[1]) {
+      bwdf_diff_dst_primitive = bwdf_top_diff->get_converted_prv(grad.dptr_, true,
+        out_grad[deconv::kOut]);
+      bwdf_src_primitive = bwdf_bottom_data->get_converted_prv(data.dptr_, false,
+        in_data[deconv::kData]);
+      Storage::Handle addtoWorkspace;
+      if (req[1] == kAddTo) {
+        // workaround until MKL-DNN supports the kAddTo mode natively
+        this->AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[deconv::kWeight].Size(),
+          &addtoWorkspace);
+      }
+      bwdf_diff_weights_memory = bwdf_filter_diff->create_output_memory(gwmat.dptr_,
+        in_grad[deconv::kWeight], bwdf_filter_diff);
+
+      if (!this->param_.no_bias) {
+        Tensor<xpu, 1, DType> gbias =
+          mkl_experimental_direct_get<xpu, 1, DType>(in_grad[deconv::kBias], s);
+        bwdd_diff_bias_memory = bwdb_bias_diff->create_output_memory(gbias.dptr_,
+          in_grad[deconv::kBias], bwdb_bias_diff);
+        deconvBwdWeight.reset(new convolution_backward_weights(*deconvBwdWeight_pd
+          , *bwdf_diff_dst_primitive, *bwdf_src_primitive, *bwdf_diff_weights_memory
+          , *bwdd_diff_bias_memory));
+      } else {
+        deconvBwdWeight.reset(new convolution_backward_weights(*deconvBwdWeight_pd
+          , *bwdf_diff_dst_primitive, *bwdf_src_primitive, *bwdf_diff_weights_memory));
+      }
+      deconvBwdWeight.submit();
+      if (req[1] == kAddTo) {
+        if (bwdf_filter_diff->conversion_needed()) {
+          bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
+        }
+        this->AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_,
+          in_grad[deconv::kWeight].Size());
+      }
+    }
+    if (req[deconv::kData] != kNullOp) {
+      std::shared_ptr<memory> grad_primitive, weights_primitive;
+      std::shared_ptr<memory> gdata_output_memory;
+      grad_primitive = bwdd_top_diff->get_converted_prv(grad.dptr_, false,
+        out_grad[deconv::kOut]);
+      weights_primitive = bwdd_filter_data->get_converted_prv(wmat.dptr_, true,
+        in_data[deconv::kWeight]);
+      gdata_output_memory = bwdd_bottom_diff->create_output_memory(gdata.dptr_,
+        in_grad[deconv::kData], bwdd_bottom_diff);
+      deconvBwd.reset(new convolution_forward(*deconvBwd_pd
+        , *grad_primitive, *weights_primitive, *gdata_output_memory));
+      deconvBwd.submit();
+    }
+  }
+
+ private:
+  bool init_mkldnn_;
+  DeconvolutionParam param_;
+  std::shared_ptr<MKLDNNData<DType> > fwd_bottom_data, fwd_top_data, fwd_filter_data;
+  std::shared_ptr<convolution_backward_data::primitive_desc> deconvFwd_pd;
+  MKLDNNPrimitive<DType> deconvFwd;
+  /* Bwd filter step */
+  std::shared_ptr<MKLDNNData<DType> > bwdf_bottom_data, bwdf_top_diff, bwdf_filter_diff,
+    bwdb_bias_diff;
+  std::shared_ptr<convolution_backward_weights::primitive_desc> deconvBwdWeight_pd;
+  MKLDNNPrimitive<DType> deconvBwdWeight;
+  std::shared_ptr<convolution_forward::primitive_desc> deconvBwd_pd;
+  MKLDNNPrimitive<DType> deconvBwd;
+  std::shared_ptr<MKLDNNData<DType> > bwdd_top_diff, bwdd_bottom_diff,
+    bwdd_filter_data;
+  memory::dims input_tz;
+  memory::dims bias_tz;
+  memory::dims output_tz;
+  memory::dims weights_tz;
+};  // class MKLDNNDeConvolutionOp
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_DECONVOLUTION_INL_H_
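Reviewer note on the deconvolution mapping above: MKL-DNN 0.x has no dedicated deconvolution primitive, so the operator expresses the deconvolution forward pass as the backward-data pass of an ordinary convolution (and, symmetrically, the deconvolution backward-data pass as a convolution forward). Below is a minimal standalone sketch of that mapping, not part of the PR; the shapes, stride, padding, and the convolution_direct algorithm are illustrative assumptions only.

``` cpp
#include <mkldnn.hpp>
using namespace mkldnn;

int main() {
  engine cpu_engine(engine::cpu, 0);
  memory::data_type f32 = memory::data_type::f32;
  memory::format any_fmt = memory::format::any;

  // Illustrative shapes: 1x8x16x16 input and output, 3x3 kernel, stride 1, pad 1.
  memory::desc in_md({1, 8, 16, 16}, f32, any_fmt);   // deconv input  (conv dst role)
  memory::desc out_md({1, 8, 16, 16}, f32, any_fmt);  // deconv output (conv src role)
  memory::desc w_md({8, 8, 3, 3}, f32, any_fmt);      // weights
  memory::dims strides{1, 1};
  memory::dims padding{1, 1};

  // Hint: a plain forward convolution that maps the deconv output back to its input.
  convolution_forward::desc hint_desc(prop_kind::forward_training,
      algorithm::convolution_direct, out_md, w_md, in_md,
      strides, padding, padding, padding_kind::zero);
  convolution_forward::primitive_desc hint_pd(hint_desc, cpu_engine);

  // The deconvolution forward pass is the backward-data pass of that convolution.
  convolution_backward_data::desc deconv_fwd_desc(algorithm::convolution_direct,
      out_md, w_md, in_md, strides, padding, padding, padding_kind::zero);
  convolution_backward_data::primitive_desc deconv_fwd_pd(
      deconv_fwd_desc, cpu_engine, hint_pd);

  (void)deconv_fwd_pd;  // attaching memories and submitting a stream is omitted here
  return 0;
}
```

This mirrors the structure InitDeconvolution() builds: deconvBwd_pd (a convolution_forward primitive descriptor) serves both as the hint for deconvFwd_pd and as the primitive descriptor for the deconvolution backward-data pass.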
diff --git a/src/operator/mkl/mkldnn_elemwise_sum-inl.h b/src/operator/mkl/mkldnn_elemwise_sum-inl.h
new file mode 100644
index 0000000000..611362ca7d
--- /dev/null
+++ b/src/operator/mkl/mkldnn_elemwise_sum-inl.h
@@ -0,0 +1,233 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_elemwise_sum-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*
+*******************************************************************************/
+
+#pragma once
+
+#include <dmlc/logging.h>
+#include <cstring>
+#include <vector>
+#include <mkldnn_types.h>
+#include "../operator_common.h"
+#include "../tensor/elemwise_binary_op.h"
+#include "../elemwise_op_common.h"
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+/**
+ * Adds n inputs element-wise and stores the result in a single output buffer.
+ * @tparam xpu
+ * @tparam DType
+ * @param attrs
+ * @param ctx
+ * @param in_data
+ * @param req
+ * @param out_data
+ */
+template<typename xpu, typename DType>
+void MKLDNNElementWiseSumCompute(const nnvm::NodeAttrs &attrs,
+                                 const OpContext &ctx,
+                                 const std::vector<TBlob> &in_data,
+                                 const std::vector<OpReqType> &req,
+                                 const std::vector<TBlob> &out_data) {
+  using namespace mxnet_op;
+  using namespace mshadow;
+  using namespace mshadow::expr;
+
+  if (req[0] == kNullOp) return;
+
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Instance().get_engine();
+
+  // TODO lfeng: MKLDNN might support other layouts (1D, 2D); leaving that as
+  // future work.
+  // Assuming all in_data TBlobs have the same shape,
+  // get the data shape info.
+  int32_t n = in_data[0].shape_[0];
+  int32_t c = in_data[0].shape_[1];
+  int32_t h = in_data[0].shape_[2];
+  int32_t w = in_data[0].shape_[3];
+
+  // if we are dealing with cpu only input/output data, we will use this
+  // descriptor format as default
+  memory::desc default_usr_desc =
+      {{n, c, h, w}, memory::data_type::f32, memory::format::nchw};
+
+  // start with the output data descriptor, which will determine the layout
+  // for the inputs (they may need to be converted)
+  std::shared_ptr<memory> output;
+
+  // this is needed for the sum primitive descriptor, initialize to be the
+  // same as default usr descriptor.
+  memory::desc output_prv_desc(default_usr_desc);
+
+  std::shared_ptr<memory::primitive_desc> output_usr_mpd(nullptr);
+  std::shared_ptr<memory::primitive_desc> output_prv_mpd(nullptr);
+  // check if output data has a valid prv buffer set up
+  // TODO lfeng: it's possible that mkl prv data exists but is not valid
+  // (head_ != HEAD_AT_PRV), this should not happen in general and could mean
+  // the previous MKLDNN operator has a bug causing the output data to have
+  // head_ flag set to HEAD_AT_CPU. We should find a way to detect this case.
+  void *output_ptr = mkl_prv_data<DType>(out_data[0]);
+  if (output_ptr != nullptr) {
+    // the output data has a valid prv buffer, we will use it directly
+    std::shared_ptr<MKLDNNData<DType>>
+        output_dnn_data = get_mkldnn_prv_descriptor<DType>(out_data[0]);
+    // get memory primitive descriptor for usr and prv
+    output_usr_mpd = output_dnn_data->usr_memory_pd();
+    output_prv_mpd = output_dnn_data->prv_memory_pd();
+
+    output_prv_desc = output_prv_mpd->desc().data;
+
+    // use the output prv memory directly
+    output = output_dnn_data->get_prv_memory();
+
+  } else {
+    // TODO lfeng: this should be rare and expensive, maybe output a warning?
+    // if output data does not have a mkl prv buffer, we assume usr
+    // layout, default is nchw
+    output_usr_mpd.reset(new memory::primitive_desc(default_usr_desc,
+                                                    cpu_engine));
+    output_prv_mpd.reset(new memory::primitive_desc(output_prv_desc,
+                                                    cpu_engine));
+    std::shared_ptr<MKLDNNData<DType>> output_dnn_data;
+    output_dnn_data.reset(new MKLDNNData<DType>(output_usr_mpd,
+                                                output_prv_mpd));
+    // create output memory primitive and update the out_data[0].Mkl_mem_ data
+    // to use output_dnn_data, in this case it means cpu data buffer will be
+    // used for output prv data (since their layouts are the same)
+    output =
+        output_dnn_data->create_output_memory(static_cast<DType *>(out_data[0].dptr_),
+                                              out_data[0],
+                                              output_dnn_data);
+  }
+
+  // Inputs - get input memory descriptors
+  std::vector<primitive::at> inputs;
+  // store an input memory primitive descriptor for each input data, this is
+  // required for creating sum primitive descriptor
+  std::vector<memory::primitive_desc> input_prv_mpd_array;
+  std::shared_ptr<MKLDNNData<DType>> input_dnn_data(nullptr);
+  // TODO lfeng: we don't really want to store these shared_ptrs, but we have
+  // to keep this memory alive.
+  std::vector<std::shared_ptr<memory>> input_memory_sp;
+  for (size_t i = 0; i < in_data.size(); ++i) {
+    std::shared_ptr<memory::primitive_desc> input_usr_mpd;
+    std::shared_ptr<memory::primitive_desc> input_prv_mpd;
+    // checking if we have mkldnn prv data
+    void *input_ptr = mkl_prv_data<DType>(in_data[i]);
+    if (input_ptr != nullptr) {
+      // input data has valid prv buffer
+      input_dnn_data = get_mkldnn_prv_descriptor<DType>(in_data[i]);
+      input_usr_mpd = input_dnn_data->usr_memory_pd();
+      input_prv_mpd = input_dnn_data->prv_memory_pd();
+      // check input prv descriptor match output prv descriptor
+      if (input_prv_mpd != output_prv_mpd) {
+        // TODO lfeng: this should be rare and expensive, maybe output a warning?
+        // The input and output prv layouts differ. We don't want to modify
+        // the input data object's prv buffer, so we create a new
+        // MKLDNNData object, perform a conversion, and copy the data to a
+        // new memory buffer; this is expensive.
+        input_prv_mpd.reset(new memory::primitive_desc(output_prv_desc,
+                                                       cpu_engine));
+        input_dnn_data.reset(new MKLDNNData<DType>(input_usr_mpd,
+                                                   input_prv_mpd));
+      }
+    } else {
+      // default usr descriptor
+      input_usr_mpd.reset(new memory::primitive_desc(default_usr_desc,
+                                                     cpu_engine));
+
+      // for prv buffer, we want to match with the output prv desc
+      input_prv_mpd.reset(new memory::primitive_desc(output_prv_desc,
+                                                     cpu_engine));
+
+      input_dnn_data.reset(new MKLDNNData<DType>(input_usr_mpd, input_prv_mpd));
+    }
+    input_prv_mpd_array.push_back(*input_prv_mpd);
+
+    // This is where any required reorder happens. Depending on how the
+    // layouts are configured, we get a prv pointer with a valid layout for
+    // the input.
+    input_memory_sp.push_back(
+        input_dnn_data->get_converted_prv(static_cast<float *>(in_data[i].dptr_),
+                                          false,
+                                          in_data[i]));
+    inputs.push_back(*input_memory_sp[i]);
+  }
+
+  // scaling factor for each input data
+  std::vector<double> scale(in_data.size(), 1.0);
+
+  // sum primitive descriptor
+  // need output memory::desc, scale per input, and memory primitive_desc for
+  // inputs
+  sum::primitive_desc sum_pd(output_prv_desc, scale, input_prv_mpd_array);
+
+  MKLDNNPrimitive<DType> elemwise_sum;
+  elemwise_sum.reset(new mkldnn::sum(sum_pd, inputs, *output));
+  elemwise_sum.submit();
+}
+
+/**
+ * Intended for adding two input buffers element-wise and storing the result
+ * in a single output buffer.
+ * @tparam xpu
+ * @param attrs
+ * @param ctx
+ * @param in_data
+ * @param req
+ * @param out_data
+ */
+template<typename xpu>
+inline void MKLDNNElementWiseAddCompute(const nnvm::NodeAttrs &attrs,
+                                 const OpContext &ctx,
+                                 const std::vector<TBlob> &in_data,
+                                 const std::vector<OpReqType> &req,
+                                 const std::vector<TBlob> &out_data) {
+  if (req[0] == kNullOp) return;
+  CHECK_EQ(in_data.size(), 2U);
+  CHECK_EQ(out_data.size(), 1U);
+  const auto& shape = in_data[0].shape_;
+  if (shape.ndim() == 4 && shape[0] > 0 && shape[1] > 0 && shape[2] > 0 &&
+      shape[3] > 0 &&
+      out_data[0].type_flag_ == mshadow::kFloat32) {
+    // The MKLDNN path requires 4D float32 tensors, none of whose dimensions
+    // may be 0.
+    MKLDNNElementWiseSumCompute<cpu, float>(attrs, ctx, in_data, req, out_data);
+  } else {
+    // fallback to the default CPU implementation
+    ElemwiseBinaryOp::Compute<cpu, mshadow::op::plus>(attrs, ctx, in_data,
+                                                      req, out_data);
+  }
+}
+}  // namespace op
+}  // namespace mxnet
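As a reference for reviewers, here is a standalone sketch of the mkldnn::sum primitive that MKLDNNElementWiseSumCompute wraps. It is not part of the PR; it assumes the MKL-DNN 0.x C++ API targeted by this change, two float32 NCHW inputs of identical shape, unit scaling factors, and no usr/prv layout conversion.

``` cpp
#include <mkldnn.hpp>
#include <vector>
using namespace mkldnn;

int main() {
  engine cpu_engine(engine::cpu, 0);
  memory::dims dims = {2, 3, 4, 4};
  memory::desc md(dims, memory::data_type::f32, memory::format::nchw);
  memory::primitive_desc mpd(md, cpu_engine);

  const size_t n = 2 * 3 * 4 * 4;
  std::vector<float> a(n, 1.f), b(n, 2.f), c(n, 0.f);
  memory in0(mpd, a.data()), in1(mpd, b.data()), out(mpd, c.data());

  // One scale and one memory primitive_desc per input, as in the operator above.
  std::vector<double> scales = {1.0, 1.0};
  std::vector<memory::primitive_desc> srcs_pd = {mpd, mpd};
  sum::primitive_desc sum_pd(md, scales, srcs_pd);

  std::vector<primitive::at> inputs;
  inputs.push_back(in0);
  inputs.push_back(in1);
  sum s(sum_pd, inputs, out);

  std::vector<primitive> net;
  net.push_back(s);
  stream(stream::kind::eager).submit(net).wait();  // c[i] == 3.f for every i
  return 0;
}
```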
diff --git a/src/operator/mkl/mkldnn_fully_connected-inl.h b/src/operator/mkl/mkldnn_fully_connected-inl.h
new file mode 100644
index 0000000000..42bb0b69f5
--- /dev/null
+++ b/src/operator/mkl/mkldnn_fully_connected-inl.h
@@ -0,0 +1,457 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_fully_connected-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*         
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_FULLY_CONNECTED_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_FULLY_CONNECTED_INL_H_
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "mkldnn_base-inl.h"
+#include "mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename Dtype>
+class MKLDNNFullyConnectedOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  explicit MKLDNNFullyConnectedOp(FullyConnectedParam p):
+    init_mkldnn_(false),
+    fwd_bottom_data(NULL),
+    fwd_top_data(NULL),
+    fwd_weights_data(NULL),
+    fwd_bias_data(NULL),
+    bwdd_weights_data(NULL),
+    bwdw_bottom_data(NULL),
+    bwdd_bottom_diff(NULL),
+    bwdd_top_diff(NULL),
+    bwdw_top_diff(NULL),
+    bwdw_weights_diff(NULL),
+    bwdw_bias_diff(NULL),
+    ipFwd_pd(NULL),
+    ipBwdData_pd(NULL),
+    ipBwdWeights_pd(NULL),
+    w_(0), h_(0) {
+    param_ = p;
+  }
+
+  ~MKLDNNFullyConnectedOp() {}
+  std::string getName() {
+    return "MKLDNNFullyConnectedOp";
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 2, Dtype> &data,
+     const mshadow::Tensor<xpu, 2, Dtype> &out) {
+     this->w_ = 1;
+     this->h_ = 1;
+     this->M_ = data.shape_[0];
+     this->channels_ = data.shape_[1];
+     this->N_ = out.shape_[1];
+  }
+    void InitInnerProductFwd(const std::vector<TBlob> &in_data) {
+      int32_t n = this->M_;
+      int32_t w = this->w_;
+      int32_t h = this->h_;
+      int32_t oc = this->N_;
+      int32_t ic = this->channels_;
+      bool has_spatial = h > 1 || w > 1;
+
+      // Initialize memory descriptors (format = any) to create inner_product descriptor
+      memory::data_type mpcsn = memory::data_type::f32;
+      memory::format mfmt = memory::format::any;
+
+      memory::dims bottom_tz =
+        (has_spatial) ? memory::dims{ n, ic, h, w } : memory::dims{ n, ic };
+      memory::dims top_tz = { n, oc };
+      memory::dims weights_tz =
+        (has_spatial) ? memory::dims{ oc, ic, h, w } : memory::dims{ oc, ic };
+      memory::dims bias_tz = { oc };
+
+      memory::desc init_bottom_md({ bottom_tz }, mpcsn, mfmt);
+      memory::desc init_top_md({ top_tz }, mpcsn, mfmt);
+      memory::desc init_weights_md({ weights_tz }, mpcsn, mfmt);
+      memory::desc init_bias_md({ bias_tz }, mpcsn, mfmt);
+
+      // Initialize inner_product primitive descriptor
+      std::shared_ptr<inner_product_forward::desc> ipFwd_desc;
+      if (!param_.no_bias) {
+        ipFwd_desc.reset(new inner_product_forward::desc(
+          prop_kind::forward_training, init_bottom_md,
+          init_weights_md, init_bias_md, init_top_md));
+      } else {
+        ipFwd_desc.reset(new inner_product_forward::desc(
+          prop_kind::forward_training, init_bottom_md,
+          init_weights_md, init_top_md));
+      }
+      mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+      ipFwd_pd.reset(new inner_product_forward::primitive_desc(*ipFwd_desc, cpu_engine));
+      CHECK(ipFwd_pd);
+
+      // Create priv memory primitive descriptors stored as class members
+      typedef typename memory::primitive_desc MemPD;
+
+      std::shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(
+        new MemPD(ipFwd_pd->src_primitive_desc()));
+      std::shared_ptr<MemPD> prv_fwd_top_data_memory_pd(
+        new MemPD(ipFwd_pd->dst_primitive_desc()));
+      std::shared_ptr<MemPD> prv_fwd_weights_data_memory_pd(
+        new MemPD(ipFwd_pd->weights_primitive_desc()));
+      std::shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(
+        new MemPD(ipFwd_pd->bias_primitive_desc()));
+      memory::format input_mfmt = has_spatial ? memory::format::nchw : memory::format::nc;
+      std::shared_ptr<MemPD> usr_bottom_data_memory_pd(
+        new MemPD({ { bottom_tz }, mpcsn, input_mfmt }, cpu_engine));
+      std::shared_ptr<MemPD> usr_bias_data_memory_pd(
+        new MemPD({ { bias_tz }, mpcsn, memory::format::x }, cpu_engine));
+      std::shared_ptr<MemPD> usr_top_data_memory_pd(
+        new MemPD({ { top_tz }, mpcsn, memory::format::nc }, cpu_engine));
+      memory::format weights_mfmt = has_spatial ? memory::format::oihw : memory::format::oi;
+      std::shared_ptr<MemPD> usr_weights_data_memory_pd(
+        new MemPD({ { weights_tz }, mpcsn, weights_mfmt }, cpu_engine));
+
+      // ---  init primitive and prv_memory descriptors ----------------------
+      fwd_bottom_data.reset(new MKLDNNData<Dtype>(
+        usr_bottom_data_memory_pd, prv_fwd_bottom_data_memory_pd));
+      fwd_top_data.reset(new MKLDNNData<Dtype>(
+        usr_top_data_memory_pd, prv_fwd_top_data_memory_pd));
+      fwd_weights_data.reset(new MKLDNNData<Dtype>(
+        usr_weights_data_memory_pd, prv_fwd_weights_data_memory_pd));
+      fwd_bias_data.reset(new MKLDNNData<Dtype>(
+        usr_bias_data_memory_pd, prv_fwd_bias_data_memory_pd));
+
+      // Names are for debugging purposes only.
+      fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+      fwd_top_data->name = "fwd_top_data      @ " + this->getName();
+      fwd_weights_data->name = "fwd_weights_data  @ " + this->getName();
+      fwd_bias_data->name = "fwd_bias_data     @ " + this->getName();
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    if (req[fullc::kOut] == kNullOp) return;
+    CHECK_EQ(req[fullc::kOut], kWriteTo);
+    size_t expected = param_.no_bias ? 2 : 3;
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(out_data.size(), 1);
+    int status;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2, Dtype> data;
+    Tensor<xpu, 2, Dtype> out;
+
+    const TShape& ishape = in_data[fullc::kData].shape_;
+    const TShape& oshape = out_data[fullc::kOut].shape_;
+
+    data = mkl_experimental_direct_get_with_shape<xpu, 2, Dtype>(
+      in_data[fullc::kData],
+      Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    out = mkl_experimental_direct_get_with_shape<xpu, 2, Dtype>(
+      out_data[fullc::kOut],
+      Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+    Tensor<xpu, 2, Dtype> wmat =
+      mkl_experimental_direct_get<xpu, 2, Dtype>(in_data[fullc::kWeight], s);
+
+
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    if (ipFwd_pd == NULL) {
+      InitInnerProductFwd(in_data);
+      fwd_bottom_data_primitive = fwd_bottom_data->get_converted_prv(data.dptr_,
+        false, in_data[fullc::kData]);
+      fwd_weights_data_primitive = fwd_weights_data->get_converted_prv(wmat.dptr_,
+        true, in_data[fullc::kWeight]);
+      fwd_top_data_memory = fwd_top_data->create_output_memory(
+        out.dptr_, out_data[fullc::kOut], fwd_top_data);
+      if (!param_.no_bias) {
+        Tensor<xpu, 1, Dtype> bias =
+          mkl_experimental_direct_get<xpu, 1, Dtype>(in_data[fullc::kBias], s);
+        fwd_bias_data_primitive = fwd_bias_data->get_converted_prv(bias.dptr_,
+          true, in_data[fullc::kBias]);
+        ipFwd.reset(new inner_product_forward(*ipFwd_pd
+          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+          , *fwd_bias_data_primitive, *fwd_top_data_memory));
+      } else {
+        ipFwd.reset(new inner_product_forward(*ipFwd_pd
+          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+          , *fwd_top_data_memory));
+      }
+    } else {
+      fwd_bottom_data->sync_converted_prv(data.dptr_,
+        false, in_data[fullc::kData]);
+      fwd_weights_data->sync_converted_prv(wmat.dptr_,
+        true, in_data[fullc::kWeight]);
+      fwd_top_data->sync_output_memory(
+        out_data[fullc::kOut], fwd_top_data);
+      if (!param_.no_bias) {
+        Tensor<xpu, 1, Dtype> bias =
+          mkl_experimental_direct_get<xpu, 1, Dtype>(in_data[fullc::kBias], s);
+        fwd_bias_data->sync_converted_prv(bias.dptr_,
+          true, in_data[fullc::kBias]);
+      }
+    }
+    ipFwd.submit();
+
+  }
+  void InitInnerProductBwd() {
+    int32_t n = this->M_;
+    int32_t w = this->w_;
+    int32_t h = this->h_;
+    int32_t oc = this->N_;
+    int32_t ic = this->channels_;
+    bool has_spatial = h > 1 || w > 1;
+
+    // Initialize memory descriptors (format = any) to create inner_product descriptor
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::format mfmt = memory::format::any;
+
+    memory::dims bottom_tz =
+      (has_spatial) ? memory::dims{ n, ic, h, w } : memory::dims{ n, ic };
+    memory::dims top_tz = { n, oc };
+    memory::dims weights_tz =
+      (has_spatial) ? memory::dims{ oc, ic, h, w } : memory::dims{ oc, ic };
+    memory::dims bias_tz = { oc };
+
+    memory::desc init_bottom_md({ bottom_tz }, mpcsn, mfmt);
+    memory::desc init_top_md({ top_tz }, mpcsn, mfmt);
+    memory::desc init_weights_md({ weights_tz }, mpcsn, mfmt);
+    memory::desc init_bias_md({ bias_tz }, mpcsn, mfmt);
+
+    // Initialize inner_product primitive descriptor
+    std::shared_ptr<inner_product_backward_data::desc> ipBwdData_desc;
+    std::shared_ptr<inner_product_backward_weights::desc> ipBwdWeights_desc;
+
+    if (!param_.no_bias)
+      ipBwdWeights_desc.reset(new inner_product_backward_weights::desc(
+        init_bottom_md, init_weights_md
+        , init_bias_md, init_top_md));
+    else
+      ipBwdWeights_desc.reset(new inner_product_backward_weights::desc(
+        init_bottom_md, init_weights_md, init_top_md));
+    
+    ipBwdData_desc.reset(new inner_product_backward_data::desc(
+      init_bottom_md, init_weights_md, init_top_md));
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    ipBwdData_pd.reset(new inner_product_backward_data::primitive_desc(*ipBwdData_desc,
+      cpu_engine, *ipFwd_pd));
+    CHECK(ipBwdData_pd);
+    ipBwdWeights_pd.reset(new inner_product_backward_weights::primitive_desc(
+      *ipBwdWeights_desc, cpu_engine, *ipFwd_pd));
+    CHECK(ipBwdWeights_pd);
+    // Create priv memory primitive descriptors stored as class members
+    typedef typename memory::primitive_desc MemPD;
+    std::shared_ptr<MemPD> prv_bwdd_bottom_diff_memory_pd(
+      new MemPD(ipBwdData_pd->diff_src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdd_top_diff_memory_pd(
+      new MemPD(ipBwdData_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdd_weights_data_memory_pd(
+      new MemPD(ipBwdData_pd->weights_primitive_desc()));
+
+    std::shared_ptr<MemPD> prv_bwdw_bottom_data_memory_pd(
+      new MemPD(ipBwdWeights_pd->src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdw_top_diff_memory_pd(
+      new MemPD(ipBwdWeights_pd->diff_dst_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdw_weights_diff_memory_pd(
+      new MemPD(ipBwdWeights_pd->diff_weights_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(
+      new MemPD(ipBwdWeights_pd->diff_bias_primitive_desc()));
+
+    // Create usr memory primitive descriptors stored as class members
+
+    memory::format input_mfmt = has_spatial ? memory::format::nchw : memory::format::nc;
+    std::shared_ptr<MemPD> usr_bottom_data_memory_pd(
+      new MemPD({ { bottom_tz }, mpcsn, input_mfmt }, cpu_engine));
+    std::shared_ptr<MemPD> usr_bias_data_memory_pd(
+      new MemPD({ { bias_tz }, mpcsn, memory::format::x }, cpu_engine));
+    std::shared_ptr<MemPD> usr_top_data_memory_pd(
+      new MemPD({ { top_tz }, mpcsn, memory::format::nc }, cpu_engine));
+    memory::format weights_mfmt = has_spatial ? memory::format::oihw : memory::format::oi;
+    std::shared_ptr<MemPD> usr_weights_data_memory_pd(
+      new MemPD({ { weights_tz }, mpcsn, weights_mfmt }, cpu_engine));
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    bwdd_bottom_diff.reset(new MKLDNNData<Dtype>(
+      usr_bottom_data_memory_pd, prv_bwdd_bottom_diff_memory_pd));
+    bwdd_bottom_diff->name = "bwdd_bottom_diff   @ " + this->getName();
+    bwdw_bottom_data.reset(new MKLDNNData<Dtype>(
+      usr_bottom_data_memory_pd, prv_bwdw_bottom_data_memory_pd));
+    bwdw_bottom_data->name = "bwdw_bottom_data   @ " + this->getName();
+
+    bwdd_top_diff.reset(new MKLDNNData<Dtype>(
+      usr_top_data_memory_pd, prv_bwdd_top_diff_memory_pd));
+    bwdd_top_diff->name = "bwdd_top_diff      @ " + this->getName();
+    bwdw_top_diff.reset(new MKLDNNData<Dtype>(
+      usr_top_data_memory_pd, prv_bwdw_top_diff_memory_pd));
+    bwdw_top_diff->name = "bwdw_top_diff      @ " + this->getName();
+
+    bwdd_weights_data.reset(new MKLDNNData<Dtype>(
+      usr_weights_data_memory_pd, prv_bwdd_weights_data_memory_pd));
+    bwdd_weights_data->name = "bwdd_weights_data  @ " + this->getName();
+    bwdw_weights_diff.reset(new MKLDNNData<Dtype>(
+      usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_pd));
+    bwdw_weights_diff->name = "bwdw_weights_diff  @ " + this->getName();
+
+    if (!param_.no_bias) {
+      bwdw_bias_diff.reset(new MKLDNNData<Dtype>(
+        usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd));
+      bwdw_bias_diff->name = "bwdw_bias_diff     @ " + this->getName();
+    }
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+
+    size_t expected = param_.no_bias ? 2 : 3;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& ishape = in_data[fullc::kData].shape_;
+    const TShape& oshape = out_grad[fullc::kOut].shape_;
+
+    Tensor<xpu, 2, Dtype> data = mkl_experimental_direct_get_with_shape<xpu, 2, Dtype>(
+      in_data[fullc::kData],
+      Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    Tensor<xpu, 2, Dtype> wmat = mkl_experimental_direct_get<xpu, 2, Dtype>(
+      in_data[fullc::kWeight], s);
+    Tensor<xpu, 2, Dtype> grad = mkl_experimental_direct_get_with_shape<xpu, 2, Dtype>(
+      out_grad[fullc::kOut],
+      Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+    //  backprop
+    CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+    // gradient of weight
+    Tensor<xpu, 2, Dtype> gwmat = mkl_experimental_direct_get<xpu, 2, Dtype>(
+      in_grad[fullc::kWeight], s);
+    Tensor<xpu, 2, Dtype> gdata = mkl_experimental_direct_get_with_shape<xpu, 2, Dtype>(
+      in_grad[fullc::kData],
+      Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    Tensor<xpu, 1, Dtype> gbias;
+    if (!param_.no_bias)
+        gbias = mkl_experimental_direct_get<xpu, 1, Dtype>(
+            in_grad[fullc::kBias], s);
+    if (!init_mkldnn_) {
+      LayerSetUp(data, grad);
+      init_mkldnn_ = true;
+    }
+    if (ipBwdData_pd == NULL) {
+      InitInnerProductBwd();
+    }
+    if (req[fullc::kData]) {
+      if (ipBwdData.aprimitive == NULL) {
+        bwdd_top_diff_primitive = bwdd_top_diff->get_converted_prv(grad.dptr_,
+          false, out_grad[fullc::kOut]);
+        bwdd_weights_data_primitive = bwdd_weights_data->get_converted_prv(wmat.dptr_,
+          false, in_data[fullc::kWeight]);
+        bwdd_bottom_diff_memory = bwdd_bottom_diff->create_output_memory(gdata.dptr_,
+          in_grad[fullc::kData], bwdd_bottom_diff);
+        ipBwdData.reset(new inner_product_backward_data(*ipBwdData_pd
+          , *bwdd_top_diff_primitive, *bwdd_weights_data_primitive
+          , *bwdd_bottom_diff_memory));
+      } else {
+        bwdd_top_diff->sync_converted_prv(grad.dptr_,
+          false, out_grad[fullc::kOut]);
+        bwdd_weights_data->sync_converted_prv(wmat.dptr_,
+          false, in_data[fullc::kWeight]);
+        bwdd_bottom_diff->sync_output_memory(
+          in_grad[fullc::kData], bwdd_bottom_diff);
+      }
+      ipBwdData.submit();
+    }
+    if (req[fullc::kWeight]) {
+      if (ipBwdWeights.aprimitive == NULL) {
+        bwdw_bottom_data_primitive = bwdw_bottom_data->get_converted_prv(data.dptr_,
+          false, in_data[fullc::kData]);
+        bwdw_top_diff_primitive = bwdw_top_diff->get_converted_prv(grad.dptr_,
+          false, out_grad[fullc::kOut]);
+        bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory(gwmat.dptr_,
+        in_grad[fullc::kWeight], bwdw_weights_diff);
+        if (!param_.no_bias) {
+          bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory(gbias.dptr_,
+            in_grad[fullc::kBias], bwdw_bias_diff);
+          ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+            , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+            , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
+        } else {
+          ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+            , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+            , *bwdw_weights_diff_memory));
+        }
+      } else {
+        bwdw_bottom_data->sync_converted_prv(data.dptr_,
+          false, in_data[fullc::kData]);
+        bwdw_top_diff->sync_converted_prv(grad.dptr_,
+          false, out_grad[fullc::kOut]);
+        bwdw_weights_diff->sync_output_memory(
+          in_grad[fullc::kWeight], bwdw_weights_diff);
+        if (!param_.no_bias)
+          bwdw_bias_diff->sync_output_memory(
+            in_grad[fullc::kBias], bwdw_bias_diff);
+      }
+      ipBwdWeights.submit();
+    }
+
+  }
+
+ private:
+  bool init_mkldnn_;
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data, fwd_weights_data,
+    fwd_bias_data, bwdd_weights_data, bwdw_bottom_data, bwdd_bottom_diff, bwdd_top_diff,
+    bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
+  std::shared_ptr<inner_product_forward::primitive_desc> ipFwd_pd;
+  std::shared_ptr<inner_product_backward_data::primitive_desc> ipBwdData_pd;
+  std::shared_ptr<inner_product_backward_weights::primitive_desc> ipBwdWeights_pd;
+  MKLDNNPrimitive<Dtype> ipFwd, ipBwdData, ipBwdWeights;
+
+  std::shared_ptr<memory> fwd_top_data_memory;
+  std::shared_ptr<primitive> fwd_bottom_data_primitive,
+    fwd_weights_data_primitive, fwd_bias_data_primitive;
+  std::shared_ptr<memory> bwdd_bottom_diff_memory
+    , bwdw_weights_diff_memory, bwdw_bias_diff_memory;
+  std::shared_ptr<primitive> bwdd_top_diff_primitive, bwdd_weights_data_primitive
+    , bwdw_top_diff_primitive, bwdw_bottom_data_primitive;
+  int32_t w_, h_;
+  int M_;
+  int channels_;
+  int N_;
+  FullyConnectedParam param_;
+};  // class MKLDNNFullyConnectedOp
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_FULLY_CONNECTED_INL_H_
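For reference, a minimal sketch of the inner_product_forward primitive behind MKLDNNFullyConnectedOp, stripped of the MKLDNNData/MKLDNNPrimitive wrappers used in the PR. It assumes the MKL-DNN 0.x C++ API, a 2D float32 input of shape (batch, channels) with no spatial dimensions, plain nc/oi layouts, and no bias; all sizes are illustrative.

``` cpp
#include <mkldnn.hpp>
#include <vector>
using namespace mkldnn;

int main() {
  engine cpu_engine(engine::cpu, 0);
  const int batch = 2, ic = 8, oc = 4;

  memory::desc src_md({batch, ic}, memory::data_type::f32, memory::format::nc);
  memory::desc wgt_md({oc, ic}, memory::data_type::f32, memory::format::oi);
  memory::desc dst_md({batch, oc}, memory::data_type::f32, memory::format::nc);

  // No-bias variant of the descriptor; the operator above adds init_bias_md
  // when param_.no_bias is false.
  inner_product_forward::desc ip_desc(prop_kind::forward_training,
                                      src_md, wgt_md, dst_md);
  inner_product_forward::primitive_desc ip_pd(ip_desc, cpu_engine);

  std::vector<float> src(batch * ic, 1.f), wgt(oc * ic, 0.5f), dst(batch * oc, 0.f);
  memory src_m(ip_pd.src_primitive_desc(), src.data());
  memory wgt_m(ip_pd.weights_primitive_desc(), wgt.data());
  memory dst_m(ip_pd.dst_primitive_desc(), dst.data());

  inner_product_forward ip(ip_pd, src_m, wgt_m, dst_m);

  std::vector<primitive> net;
  net.push_back(ip);
  stream(stream::kind::eager).submit(net).wait();  // dst[i] == ic * 0.5f
  return 0;
}
```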
diff --git a/src/operator/mkl/mkldnn_lrn-inl.h b/src/operator/mkl/mkldnn_lrn-inl.h
new file mode 100644
index 0000000000..7025ec2ad4
--- /dev/null
+++ b/src/operator/mkl/mkldnn_lrn-inl.h
@@ -0,0 +1,269 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_lrn-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_LRN_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_LRN_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename Dtype>
+class MKLDNNLRNOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  std::string getName() {
+    return "MKLDNNLRNOp";
+  }
+
+  explicit MKLDNNLRNOp(LRNParam param) :
+    MKLDNNLayer<Dtype>()
+    , fwd_bottom_data(NULL), fwd_top_data(NULL)
+    , bwd_bottom_diff(NULL), bwd_top_diff(NULL)
+    , lrnFwdInference_pd(NULL), lrnBwd_pd(NULL)
+    , alpha_(0), beta_(0), k_(0)
+    , size_(0), num_(0), width_(0), height_(0), channels_(0) {
+    lrn_algorithm = algorithm::lrn_across_channels;
+    this->param_ = param;
+    init_mkldnn_ = false;
+  }
+
+  virtual ~MKLDNNLRNOp() {
+  }
+
+ private:
+  void LayerSetup(const mshadow::Tensor<xpu, 4, Dtype> &data) {
+    size_ = param_.nsize;
+    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size";
+    alpha_ = param_.alpha;
+    beta_ = param_.beta;
+    k_ = param_.knorm;
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+  }
+  void InitLRNFwd(const std::vector<TBlob> &in_data) {
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+
+    bool bottom_data_is_prv =
+      (const_cast<Dtype*>(mkl_prv_data<Dtype>(in_data[lrn_enum::kData])) != NULL);
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+    // ---- Initialize memory descriptors -------------
+    memory::dims tz = { n, ic, ih, iw };
+    std::shared_ptr<memory::desc> bottom_md, top_md;
+    std::shared_ptr<memory::primitive_desc> usr_mpd, prv_mpd;
+    if (bottom_data_is_prv) {
+      std::shared_ptr<MKLDNNData<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(in_data[lrn_enum::kData].Mkl_mem_);
+      bottom_md.reset(new memory::desc(mem_descr->prv_memory_pd()->desc()));
+      usr_mpd = mem_descr->usr_memory_pd();
+      prv_mpd = mem_descr->prv_memory_pd();
+    } else {
+      bottom_md.reset(new memory::desc({ tz }, mpcsn, memory::format::nchw));
+      usr_mpd.reset(new memory::primitive_desc(*bottom_md, cpu_engine));
+    }
+    top_md = bottom_md;
+
+    // ---- Initialize LRN primitive descriptor -------------
+    lrn_forward::desc lrnFwdInference_desc(prop_kind::forward_scoring, lrn_algorithm, *bottom_md,
+      size_, alpha_, beta_);
+
+    lrnFwdInference_pd.reset(new lrn_forward::primitive_desc(lrnFwdInference_desc, cpu_engine));
+    CHECK(lrnFwdInference_pd);
+    lrn_forward::desc lrnFwdTraining_desc(prop_kind::forward_training, lrn_algorithm, *bottom_md,
+      size_, alpha_, beta_);
+    lrnFwdTraining_pd.reset(new lrn_forward::primitive_desc(lrnFwdTraining_desc, cpu_engine));
+    CHECK(lrnFwdTraining_pd);
+    typedef typename memory::primitive_desc MemPD;
+    std::shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(
+      new MemPD(lrnFwdTraining_pd->src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_fwd_top_data_memory_pd(
+      new MemPD(lrnFwdTraining_pd->dst_primitive_desc()));
+
+    // ---- Create usr memory primitive descriptors -------------
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format scratch_mfmt = memory::format::nchw;
+
+    std::shared_ptr<MemPD> usr_data_memory_pd(new MemPD({ { tz }, mpcsn, mfmt_nchw }, cpu_engine));
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_data_memory_pd, prv_fwd_bottom_data_memory_pd));
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+    fwd_top_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_fwd_top_data_memory_pd));
+    fwd_top_data->name = "fwd_top_data   @ " + this->getName();
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 2);
+    CHECK_EQ(param_.nsize % 2, 1) << "LRN only supports odd values for local_size";
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, Dtype> data = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      in_data[lrn_enum::kData], s);
+    Tensor<xpu, 4, Dtype> out = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      out_data[lrn_enum::kOut], s);
+    if (!init_mkldnn_) {
+      LayerSetup(data);
+      init_mkldnn_ = true;
+    }
+    if (lrnFwdInference_pd == NULL) {
+      InitLRNFwd(in_data);
+    }
+    MKLDNNPrimitive<Dtype> lrnFwd;
+    fwd_bottom_data_primitive =
+      fwd_bottom_data->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]);
+    std::shared_ptr<memory> fwd_top_data_memory = fwd_top_data->create_output_memory(
+      out.dptr_, out_data[lrn_enum::kOut], fwd_top_data);
+    if (ctx.is_train) {
+      memory::primitive_desc scratch_mpd(lrnFwdTraining_pd->workspace_primitive_desc());
+      scratch_memory.reset(new memory(scratch_mpd));
+      lrnFwd.reset(new lrn_forward(*lrnFwdTraining_pd, *fwd_bottom_data_primitive, *scratch_memory,
+        *fwd_top_data_memory));
+    } else {
+      lrnFwd.reset(new lrn_forward(*lrnFwdInference_pd, *fwd_bottom_data_primitive,
+        *fwd_top_data_memory));
+    }
+    lrnFwd.submit();
+  }
+  void InitLRNBwd(const std::vector<TBlob> &out_grad) {
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+    void * top_diff_data =
+      const_cast<Dtype*>(mkl_prv_data<Dtype>(out_grad[lrn_enum::kOut]));
+    bool top_diff_is_prv = (top_diff_data != NULL);
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+    // ---- Initialize memory descriptors -------------
+    memory::dims tz = { n, ic, ih, iw };
+    std::shared_ptr<memory::desc> bottom_diff_md, top_diff_md;
+    std::shared_ptr<memory::primitive_desc> usr_diff_mpd, prv_diff_mpd;
+    if (top_diff_is_prv) {
+      std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(out_grad[lrn_enum::kOut].Mkl_mem_);
+      top_diff_md.reset(new memory::desc(mem_descr->prv_memory_pd()->desc()));
+      usr_diff_mpd = mem_descr->usr_memory_pd();
+      prv_diff_mpd = mem_descr->prv_memory_pd();
+    } else {
+      top_diff_md.reset(new memory::desc({ tz }, mpcsn, memory::format::nchw));
+      usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
+    }
+    bottom_diff_md = top_diff_md;
+
+    // ---- Initialize LRN primitive descriptor -------------
+    lrn_backward::desc lrnBwd_desc(lrn_algorithm, *bottom_diff_md, *top_diff_md,
+      size_, alpha_, beta_);
+    lrnBwd_pd.reset(new lrn_backward::primitive_desc(lrnBwd_desc,
+      cpu_engine, *lrnFwdTraining_pd));
+
+    CHECK(lrnBwd_pd);
+    // ---- Create priv memory primitive descriptors stored as class members -------------
+    typedef typename memory::primitive_desc MemPD;
+    std::shared_ptr<MemPD> prv_bwd_bottom_diff_memory_pd(
+      new MemPD(lrnBwd_pd->diff_src_primitive_desc()));
+    std::shared_ptr<MemPD> prv_bwd_top_diff_memory_pd(
+      new MemPD(lrnBwd_pd->diff_dst_primitive_desc()));
+
+    // ---- Create usr memory primitive descriptors -------------
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format scratch_mfmt = memory::format::nchw;
+
+    std::shared_ptr<MemPD> usr_data_memory_pd(new MemPD({ { tz }, mpcsn, mfmt_nchw }, cpu_engine));
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    bwd_bottom_diff.reset(new MKLDNNData<Dtype>(usr_data_memory_pd, prv_bwd_bottom_diff_memory_pd));
+    bwd_bottom_diff->name = "bwd_bottom_diff_data   @ " + this->getName();
+    bwd_top_diff.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_bwd_top_diff_memory_pd));
+    bwd_top_diff->name = "bwd_top_diff_data   @ " + this->getName();
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 2);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, Dtype> grad = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      out_grad[lrn_enum::kOut], s);
+    Tensor<xpu, 4, Dtype> data = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      in_data[lrn_enum::kData], s);
+    Tensor<xpu, 4, Dtype> grad_in = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      in_grad[lrn_enum::kData], s);
+    if (lrnBwd_pd == NULL)
+      InitLRNBwd(out_grad);
+    MKLDNNPrimitive<Dtype> lrnBwd;
+    std::shared_ptr<memory> bwd_bottom_diff_memory;
+    std::shared_ptr<primitive> bwd_top_diff_primitive;
+    bwd_top_diff_primitive =
+      bwd_top_diff->get_converted_prv(grad.dptr_, false, out_grad[lrn_enum::kOut]);
+    bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(grad_in.dptr_,
+      in_grad[lrn_enum::kData], bwd_bottom_diff);
+    lrnBwd.reset(new lrn_backward(*lrnBwd_pd, *fwd_bottom_data_primitive,
+      *bwd_top_diff_primitive, *scratch_memory, *bwd_bottom_diff_memory));
+    lrnBwd.submit();
+  }
+
+ private:
+  LRNParam param_;
+  bool init_mkldnn_;
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data,
+    bwd_top_diff, bwd_bottom_diff;
+  std::shared_ptr<lrn_forward::primitive_desc> lrnFwdInference_pd;
+  std::shared_ptr<lrn_forward::primitive_desc> lrnFwdTraining_pd;
+  std::shared_ptr<lrn_backward::primitive_desc> lrnBwd_pd;
+  std::shared_ptr<primitive> fwd_bottom_data_primitive;
+  std::shared_ptr<memory> scratch_memory;
+  Dtype alpha_, beta_, k_;
+  int size_, num_, width_, height_, channels_;
+  algorithm  lrn_algorithm;
+};  // class MKLDNNLRNOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_LRN_INL_H_
+
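For reference, a minimal sketch of the inference-mode lrn_forward primitive that MKLDNNLRNOp builds. It is not part of the PR and assumes the MKL-DNN 0.x C++ API, a float32 NCHW input, and illustrative LRN parameters (local size 5, alpha 1e-4, beta 0.75). In training mode the operator instead uses prop_kind::forward_training, which also produces the workspace memory consumed by lrn_backward.

``` cpp
#include <mkldnn.hpp>
#include <vector>
using namespace mkldnn;

int main() {
  engine cpu_engine(engine::cpu, 0);
  memory::dims dims = {1, 8, 4, 4};
  memory::desc md(dims, memory::data_type::f32, memory::format::nchw);
  memory::primitive_desc mpd(md, cpu_engine);

  // forward_scoring needs no workspace memory.
  lrn_forward::desc lrn_desc(prop_kind::forward_scoring,
                             algorithm::lrn_across_channels, md,
                             5 /*local size*/, 1e-4 /*alpha*/, 0.75 /*beta*/);
  lrn_forward::primitive_desc lrn_pd(lrn_desc, cpu_engine);

  std::vector<float> src(1 * 8 * 4 * 4, 1.f);
  std::vector<float> dst(lrn_pd.dst_primitive_desc().get_size() / sizeof(float), 0.f);
  memory src_m(mpd, src.data());
  memory dst_m(lrn_pd.dst_primitive_desc(), dst.data());

  lrn_forward lrn(lrn_pd, src_m, dst_m);

  std::vector<primitive> net;
  net.push_back(lrn);
  stream(stream::kind::eager).submit(net).wait();
  return 0;
}
```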
diff --git a/src/operator/mkl/mkldnn_memory-inl.h b/src/operator/mkl/mkldnn_memory-inl.h
new file mode 100644
index 0000000000..d6a247ccc4
--- /dev/null
+++ b/src/operator/mkl/mkldnn_memory-inl.h
@@ -0,0 +1,206 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_memory-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_MEMORY_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_MEMORY_INL_H_
+
+#include <string>
+#include <vector>
+#include <iterator>
+#if MXNET_USE_MKLDNN == 1
+#include "mkldnn.hpp"
+#include "mkldnn_base-inl.h"
+#define CHECK_MKL_BUFFER 0
+#if CHECK_MKL_BUFFER == 1
+#include "../../operator_common.h"
+#include "../../mshadow_op.h"
+#endif
+using namespace mkldnn;
+
+namespace mxnet {
+
+template <typename Dtype>
+struct MKLDNNMemoryDescriptorBase : public PrvMemDescr,
+ public std::enable_shared_from_this<MKLDNNMemoryDescriptorBase<Dtype> > {
+    MKLDNNMemoryDescriptorBase(std::shared_ptr<memory::primitive_desc> usr_memory_pd
+        , std::shared_ptr<memory::primitive_desc> prv_memory_pd);
+
+    ~MKLDNNMemoryDescriptorBase() {
+    }
+    std::shared_ptr<MKLDNNMemoryDescriptorBase<Dtype> > get_shared_ptr() {
+      return this->shared_from_this();
+    }
+    // ---- PrvMemDescr virtual functions -----
+    void allocate() {
+      if (_prv_memory == nullptr) {
+        _prv_memory = std::shared_ptr<memory>(new memory(*_prv_memory_pd));
+        _internal_ptr = reinterpret_cast<Dtype *>(_prv_memory->get_data_handle());
+        _internal_size = prv_size();
+      }
+    }
+    std::shared_ptr<memory>  get_prv_memory(bool usage_check = false) {
+      if (_prv_memory == nullptr) {
+        // usage_check signals that the caller expected this prv memory to
+        // already exist; warn before falling back to allocation.
+        if (usage_check)
+          LOG(WARNING) << "get_prv_memory: null prv memory from " << name;
+        allocate();
+      }
+      return _prv_memory;
+    }
+    inline bool conversion_needed() const {
+      if (!_prv_memory_pd_not_null)
+        return false;
+      if (!_usr_memory_pd_not_null)
+        return false;
+      if (*_usr_memory_pd != *_prv_memory_pd)
+        return true;
+      else
+        return false;
+    }
+
+    void set_prv_memory_pd(std::shared_ptr<memory::primitive_desc> memory_pd) {
+      _prv_memory_pd = memory_pd;
+      if (_prv_memory_pd)
+        _prv_memory_pd_not_null = true;
+    }
+
+    void set_usr_memory_pd(std::shared_ptr<memory::primitive_desc> memory_pd) {
+      _usr_memory_pd = memory_pd;
+      if (_usr_memory_pd)
+        _usr_memory_pd_not_null = true;
+    }
+
+    virtual void* prv_ptr(bool allocate_when_uninit = true) {
+      return _internal_ptr;
+    }
+    virtual size_t prv_size() { return _prv_memory_pd->get_size(); }
+    virtual size_t prv_count() { return prv_size() / sizeof(Dtype); }
+
+    virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other);
+    virtual PrvDescrType get_descr_type() { return PRV_DESCR_MKLDNN; }
+
+    std::shared_ptr<memory::primitive_desc>  prv_memory_pd() const {
+        return _prv_memory_pd;
+    }
+    std::shared_ptr<memory::primitive_desc>  usr_memory_pd() const {
+        return _usr_memory_pd;
+    }
+
+    std::string name;  // for debugging purposes
+
+    void check_usr_with_prv_descriptors();
+    void set_prv_memory(std::shared_ptr<memory> memory) {
+        _prv_memory = memory;
+        if (_prv_memory != nullptr) {
+          _internal_ptr = reinterpret_cast<Dtype *>(_prv_memory->get_data_handle());
+          _internal_size = prv_size();
+        } else {
+          VLOG(1) << "Set NULL prv memory";
+        }
+    }
+
+ protected:
+    std::shared_ptr<memory::primitive_desc> _usr_memory_pd;
+    std::shared_ptr<memory::primitive_desc> _prv_memory_pd;
+    bool _usr_memory_pd_not_null;
+    bool _prv_memory_pd_not_null;
+    std::shared_ptr<memory> _prv_memory;
+    Dtype* _internal_ptr;
+    int  _internal_size;
+    std::shared_ptr<memory> _usr_memory;
+    void* _dbg_cpu_ptr;
+};
+
+template <typename Dtype>
+class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase<Dtype> {
+ public:
+    MKLDNNMemoryDescriptor(std::shared_ptr<memory::primitive_desc> usr_memory_pd
+        , std::shared_ptr<memory::primitive_desc> prv_memory_pd);
+
+    virtual void convert_from_prv(void* cpu_ptr);
+    virtual void convert_to_prv(void* cpu_ptr);
+    virtual void convert_from_extprv(std::shared_ptr<memory> extprv_memory);
+    virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other);
+    virtual bool on_to_cpu();
+
+    virtual void create_reorder_from_prv(void* cpu_ptr);
+    virtual void create_reorder_to_prv(void* cpu_ptr);
+
+    // The last get_blob_data_ptr() argument is a hack for reusing
+    // in backward a conversion done already in the forward direction.
+    std::shared_ptr<memory> get_converted_prv(Dtype* cpu_data,
+      bool set_prv_ptr, const TBlob &blob);
+    void sync_converted_prv(Dtype* cpu_data, bool set_prv_ptr, const TBlob &tblob);
+    std::shared_ptr<memory> create_output_memory(Dtype* cpu_data, const TBlob &blob,
+        std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > thisData = nullptr, bool in_place = false);
+    void sync_output_memory(const TBlob &blob,
+        std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > thisData = nullptr, bool in_place = false);
+
+    std::shared_ptr<primitive>  reorder_usr2prv() { return _reorder_usr2prv.aprimitive; }
+    std::shared_ptr<primitive>  reorder_prv2usr() { return _reorder_prv2usr.aprimitive; }
+
+ private:
+    MKLDNNPrimitive<Dtype> _reorder_usr2prv;
+    MKLDNNPrimitive<Dtype> _reorder_prv2usr;
+};
+
+template <typename Dtype>
+class MKLDNNData : public MKLDNNMemoryDescriptor<Dtype> {
+ public:
+    MKLDNNData(std::shared_ptr<memory::primitive_desc> usr_memory_pd
+        , std::shared_ptr<memory::primitive_desc> prv_memory_pd)
+        : MKLDNNMemoryDescriptor<Dtype>(usr_memory_pd, prv_memory_pd) {}
+};
+
+template <typename Dtype>
+std::shared_ptr<MKLDNNData<Dtype> >
+get_mkldnn_prv_descriptor(std::shared_ptr<MKLMemHolder> blob);
+
+template <typename Dtype>
+inline std::shared_ptr<MKLDNNData<Dtype> > get_mkldnn_prv_descriptor(const TBlob &b) {
+  return get_mkldnn_prv_descriptor<Dtype>(b.Mkl_mem_);
+}
+
+template<typename DType>
+inline std::shared_ptr<memory> mkldnn_prv_memory(const TBlob &b) {
+  std::shared_ptr<MKLMemHolder> mkl_mem = b.Mkl_mem_;
+  bool mem_valid = (mkl_mem != nullptr) && mkl_mem->head_at_prv();
+  if (mem_valid) {
+    std::shared_ptr<MKLDNNMemoryDescriptor<DType> > mem_desc
+      = get_mkldnn_prv_descriptor<DType>(mkl_mem);
+    if (mem_desc != nullptr)
+      return mem_desc->get_prv_memory(true);
+  }
+  return nullptr;
+}
+template class MKLDNNData<float>;
+template class MKLDNNData<double>;
+template class MKLDNNData<uint8_t>;
+template class MKLDNNData<int8_t>;
+template class MKLDNNData<int32_t>;
+
+}  // namespace mxnet
+
+#endif
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_MEMORY_INL_H_
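
[Reviewer note, not part of the patch] The descriptor classes above essentially cache a user-layout (nchw) and a private-layout memory primitive descriptor and issue an MKL-DNN reorder whenever the two differ. A minimal standalone sketch of that idea with the plain MKL-DNN v0.x C++ API follows; the sizes and the nChw8c private format are illustrative assumptions, not something the patch prescribes.

``` cpp
#include <vector>
#include "mkldnn.hpp"

// Sketch only: reorder a plain nchw buffer into a (hypothetical) blocked
// private layout, guarded by the same "descriptors differ" test that
// conversion_needed() performs.
void reorder_to_prv_sketch() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims data_tz = {1, 16, 32, 32};  // n, c, h, w

  memory::primitive_desc usr_pd(
      memory::desc(data_tz, memory::data_type::f32, memory::format::nchw), eng);
  memory::primitive_desc prv_pd(
      memory::desc(data_tz, memory::data_type::f32, memory::format::nChw8c), eng);

  std::vector<float> cpu_buf(1 * 16 * 32 * 32);
  memory usr_mem(usr_pd, cpu_buf.data());  // wraps the user's cpu pointer
  memory prv_mem(prv_pd);                  // library-allocated private buffer

  if (usr_pd != prv_pd) {                  // the conversion_needed() idea
    reorder usr2prv(usr_mem, prv_mem);
    stream(stream::kind::eager).submit({usr2prv}).wait();
  }
}
```
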
diff --git a/src/operator/mkl/mkldnn_memory.cc b/src/operator/mkl/mkldnn_memory.cc
new file mode 100644
index 0000000000..fa3a59608c
--- /dev/null
+++ b/src/operator/mkl/mkldnn_memory.cc
@@ -0,0 +1,294 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_memory.cc
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+
+#include <mxnet/base.h>
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "mkldnn_memory-inl.h"
+
+namespace mxnet {
+
+template <typename Dtype>
+MKLDNNMemoryDescriptorBase<Dtype>::MKLDNNMemoryDescriptorBase(
+        std::shared_ptr<memory::primitive_desc> usr_memory_pd
+        , std::shared_ptr<memory::primitive_desc> prv_memory_pd)
+                                    : name("MKLDNNMemoryDescriptorBase"),
+                                    _prv_memory(NULL), _internal_ptr(NULL), _internal_size(0),
+                                    _usr_memory(NULL), _dbg_cpu_ptr(NULL) {
+    _usr_memory_pd_not_null = false;
+    _prv_memory_pd_not_null = false;
+    set_usr_memory_pd(usr_memory_pd);
+    set_prv_memory_pd(prv_memory_pd);
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptorBase<Dtype>::check_usr_with_prv_descriptors() {
+    CHECK(_usr_memory_pd);
+    CHECK(_prv_memory_pd);
+    int32_t ndims = _usr_memory_pd->desc().data.ndims;
+    CHECK_EQ(ndims, _prv_memory_pd->desc().data.ndims)
+            << "MKLDNNMemoryDescriptorBase: Usr and Prv memory must have same dimensions number";
+    for (int32_t dim = 0; dim < ndims; ++dim) {
+        CHECK_EQ(_usr_memory_pd->desc().data.dims[dim]
+                , _prv_memory_pd->desc().data.dims[dim])
+                << "MKLDNNMemoryDescriptorBase: Usr and Prv memory must have same dimensions";
+    }
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Implementation of MKLDNNMemoryDescriptor
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Dtype>
+ MKLDNNMemoryDescriptor<Dtype>::MKLDNNMemoryDescriptor(
+                        std::shared_ptr<memory::primitive_desc> usr_memory_pd
+                        , std::shared_ptr<memory::primitive_desc> prv_memory_pd)
+        : MKLDNNMemoryDescriptorBase<Dtype>(usr_memory_pd, prv_memory_pd) {
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::create_reorder_to_prv(void* cpu_ptr) {
+    CHECK(cpu_ptr);
+    CHECK(this->_usr_memory_pd);
+    CHECK(this->_prv_memory_pd);
+
+    if (this->_usr_memory == NULL)
+        this->_usr_memory.reset(new memory(*this->_usr_memory_pd, cpu_ptr));
+    if (this->_reorder_usr2prv.aprimitive == NULL)
+        this->_reorder_usr2prv.reset(new reorder(*this->_usr_memory, *this->get_prv_memory()));
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::convert_to_prv(void* cpu_ptr) {
+    CHECK(cpu_ptr);
+    if (this->_dbg_cpu_ptr == NULL)
+      this->_dbg_cpu_ptr = cpu_ptr;
+    create_reorder_to_prv(cpu_ptr);
+    // MKL_DLOG(INFO) << "convert usr => priv @" << this->name;
+    this->_reorder_usr2prv.submit();
+}
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::convert_from_other(std::shared_ptr<PrvMemDescr> other) {
+  LOG(FATAL) << "convert_from_other is not implemented";
+}
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::create_reorder_from_prv(void* cpu_ptr) {
+    CHECK(cpu_ptr);
+    CHECK(this->_usr_memory_pd);
+    CHECK(this->_prv_memory_pd);
+
+    if (this->_usr_memory == NULL)
+        this->_usr_memory.reset(new memory(*this->_usr_memory_pd, cpu_ptr));
+    if (this->_reorder_prv2usr.aprimitive == NULL) {
+        this->_reorder_prv2usr.reset(new reorder(*this->_prv_memory, *this->_usr_memory));
+    }
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::convert_from_prv(void* cpu_ptr) {
+    CHECK(cpu_ptr);
+    if (this->_dbg_cpu_ptr == NULL)
+      this->_dbg_cpu_ptr = cpu_ptr;
+    create_reorder_from_prv(cpu_ptr);
+    // MKL_DLOG(INFO) << "convert priv => usr @" << this->name;
+    this->_reorder_prv2usr.submit();
+    // on_to_cpu();
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::convert_from_extprv(std::shared_ptr<memory> extprv_memory) {
+    MKLDNNPrimitive<Dtype> reorder_extprv2prv;
+    reorder_extprv2prv.reset(new reorder(*extprv_memory, *this->get_prv_memory()));
+    // MKL_DLOG(INFO) << "convert extprv => priv @" << this->name;
+    reorder_extprv2prv.submit();
+}
+
+
+template <typename Dtype>
+bool MKLDNNMemoryDescriptor<Dtype>::on_to_cpu() {
+    if (StreamHolder::Instance().current_stream() != NULL
+      && StreamHolder::Instance().current_stream()->ready()) {
+        StreamHolder::Instance().current_stream()->wait();
+    }
+    return true;
+}
+
+template <typename Dtype>
+bool MKLDNNMemoryDescriptorBase<Dtype>::layout_compare(std::shared_ptr<PrvMemDescr> other) {
+    CHECK_EQ(other->get_descr_type(),
+              PrvMemDescr::PRV_DESCR_MKLDNN);
+    std::shared_ptr<MKLDNNMemoryDescriptorBase<Dtype> > other_descr =
+        std::static_pointer_cast<MKLDNNMemoryDescriptorBase<Dtype> >(other);
+    return (*other_descr->prv_memory_pd() == *this->prv_memory_pd());
+}
+
+
+template <typename Dtype>
+std::shared_ptr<memory> MKLDNNMemoryDescriptor<Dtype>::get_converted_prv(Dtype* cpu_data,
+                                            bool set_prv_ptr, const TBlob &tblob) {
+  std::shared_ptr<MKLMemHolder> blob = tblob.Mkl_mem_;
+  if (this->conversion_needed()) {
+    // have private format
+    const Dtype* prv_ptr = reinterpret_cast<Dtype*>(blob->prv_data());
+    if (prv_ptr == NULL) {
+      this->convert_to_prv(const_cast<Dtype*>(cpu_data));
+      if (set_prv_ptr) {
+        blob->set_prv_descriptor(this->get_shared_ptr(), true);
+      }
+      return this->get_prv_memory(true);
+    } else {
+      std::shared_ptr<MKLDNNData<Dtype> > blob_prv_mkldnn_mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(blob);
+      if (*blob_prv_mkldnn_mem_descr->prv_memory_pd() != *this->prv_memory_pd()) {
+        // prv in blob and in this descriptor may have different layouts
+        this->convert_from_extprv(blob_prv_mkldnn_mem_descr->get_prv_memory(true));
+        if (set_prv_ptr) {
+          blob->set_prv_descriptor(this->get_shared_ptr(), true);
+        }
+        return this->get_prv_memory(true);
+      } else if (blob_prv_mkldnn_mem_descr.get() != this) {
+        // MKL_DLOG(INFO) << "layout OK ";
+      }
+      // Need:    CHECK(blob_prv_mkldnn_mem_descr->mkldnn_primitive());
+      return blob_prv_mkldnn_mem_descr->get_prv_memory(true);
+    }
+  } else {
+    const Dtype* prv_ptr = reinterpret_cast<Dtype*>(blob->prv_data());
+    if (prv_ptr != NULL) {
+      std::shared_ptr<MKLDNNData<Dtype> > blob_prv_mkldnn_mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(blob);
+      blob_prv_mkldnn_mem_descr->convert_from_prv(cpu_data);
+    }
+  }
+  std::shared_ptr<mkldnn::memory> pres;
+  memory * input_memory = new memory(*this->usr_memory_pd(), const_cast<Dtype*>(cpu_data));
+  pres.reset(input_memory);
+  return pres;
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::sync_converted_prv(Dtype* cpu_data,
+                                            bool set_prv_ptr, const TBlob &tblob) {
+  std::shared_ptr<MKLMemHolder> blob = tblob.Mkl_mem_;
+  if (this->conversion_needed()) {
+    // have private format
+    const Dtype* prv_ptr = reinterpret_cast<Dtype*>(blob->prv_data());
+    if (prv_ptr == NULL) {
+      this->convert_to_prv(const_cast<Dtype*>(cpu_data));
+      if (set_prv_ptr) {
+        blob->set_prv_descriptor(this->get_shared_ptr(), true);
+      }
+    } else {
+      std::shared_ptr<MKLDNNData<Dtype> > blob_prv_mkldnn_mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(blob);
+      if (*blob_prv_mkldnn_mem_descr->prv_memory_pd() != *this->prv_memory_pd()) {
+        // prv in blob and in this descriptor may have different layouts
+        this->convert_from_extprv(blob_prv_mkldnn_mem_descr->get_prv_memory(true));
+        if (set_prv_ptr) {
+          blob->set_prv_descriptor(this->get_shared_ptr(), true);
+        }
+      } else if (blob_prv_mkldnn_mem_descr.get() != this) {
+        // MKL_DLOG(INFO) << "layout OK ";
+      }
+      // Need:    CHECK(blob_prv_mkldnn_mem_descr->mkldnn_primitive());
+    }
+  } else {
+    const Dtype* prv_ptr = reinterpret_cast<Dtype*>(blob->prv_data());
+    if (prv_ptr != NULL) {
+      std::shared_ptr<MKLDNNData<Dtype> > blob_prv_mkldnn_mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(blob);
+      blob_prv_mkldnn_mem_descr->convert_from_prv(cpu_data);
+    }
+  }
+}
+
+
+template <typename Dtype>
+std::shared_ptr<memory> MKLDNNMemoryDescriptor<Dtype>::create_output_memory(
+    Dtype* cpu_data, const TBlob &blob,
+    std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > thisData, bool in_place) {
+    std::shared_ptr<memory> omem;
+    if (this->conversion_needed()) {
+      if (in_place) {
+        std::shared_ptr<MKLDNNData<Dtype> > blob_omem = get_mkldnn_prv_descriptor<Dtype>(blob);
+        omem = blob_omem->get_prv_memory();
+      } else {
+        omem = this->get_prv_memory();
+        blob.Mkl_mem_->set_prv_descriptor(thisData);
+      }
+    } else {
+      blob.Mkl_mem_->check_and_prv_to_cpu(cpu_data, false);
+      omem.reset(new memory(*this->usr_memory_pd(), cpu_data));
+    }
+    return omem;
+}
+
+template <typename Dtype>
+void MKLDNNMemoryDescriptor<Dtype>::sync_output_memory(const TBlob &blob,
+    std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > thisData, bool in_place) {
+    if (this->conversion_needed()) {
+      if (!in_place) {
+        blob.Mkl_mem_->set_prv_descriptor(thisData);
+      }
+    } else {
+      blob.Mkl_mem_->check_and_prv_to_cpu(nullptr, false);
+    }
+}
+
+
+
+template <typename Dtype>
+std::shared_ptr<MKLDNNData<Dtype> > get_mkldnn_prv_descriptor(
+    std::shared_ptr<MKLMemHolder> blob) {
+    std::shared_ptr<PrvMemDescr> blob_prv_mem_descriptor =
+        blob->get_prv_descriptor();
+    if (blob_prv_mem_descriptor == nullptr)
+      return nullptr;
+    CHECK_EQ(blob_prv_mem_descriptor->get_descr_type(), PrvMemDescr::PRV_DESCR_MKLDNN);
+    std::shared_ptr<MKLDNNData<Dtype> > blob_prv_mkldnn_mem_descr =
+        std::static_pointer_cast<MKLDNNData<Dtype> >(blob_prv_mem_descriptor);
+    CHECK(blob_prv_mkldnn_mem_descr != NULL);
+    return blob_prv_mkldnn_mem_descr;
+}
+
+template class MKLDNNMemoryDescriptor<double>;
+template class MKLDNNMemoryDescriptor<float>;
+template class MKLDNNMemoryDescriptor<uint8_t>;
+template class MKLDNNMemoryDescriptor<int8_t>;
+template class MKLDNNMemoryDescriptor<int32_t>;
+
+template class MKLDNNMemoryDescriptorBase<float>;
+template class MKLDNNMemoryDescriptorBase<double>;
+template class MKLDNNMemoryDescriptorBase<uint8_t>;
+template class MKLDNNMemoryDescriptorBase<int8_t>;
+template class MKLDNNMemoryDescriptorBase<int32_t>;
+
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
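
[Reviewer note, not part of the patch] For readers of the operator files that follow, this is the call pattern get_converted_prv()/create_output_memory() are meant for on the first Forward() call; later calls only re-sync and resubmit a cached primitive. The function and parameter names below are illustrative; only the MKLDNNData methods come from this patch.

``` cpp
#include <memory>
#include "mkldnn_memory-inl.h"

// Sketch of a first-time Forward() pass, assuming 'bottom'/'top' were created
// by the operator's init code (as the pooling/relu operators below do).
template <typename Dtype>
void forward_once_sketch(const mxnet::TBlob &in_blob, const mxnet::TBlob &out_blob,
                         Dtype *in_cpu, Dtype *out_cpu,
                         std::shared_ptr<mxnet::MKLDNNData<Dtype> > bottom,
                         std::shared_ptr<mxnet::MKLDNNData<Dtype> > top) {
  // Reorder (or reuse) the input into the primitive's preferred layout.
  std::shared_ptr<mkldnn::memory> src =
      bottom->get_converted_prv(in_cpu, /*set_prv_ptr=*/false, in_blob);
  // Output memory: private layout if a conversion is needed for this blob,
  // otherwise it simply wraps the plain cpu pointer.
  std::shared_ptr<mkldnn::memory> dst =
      top->create_output_memory(out_cpu, out_blob, top);
  // A concrete operator would now build its primitive once from *src/*dst,
  // cache it, and on later calls only run sync_converted_prv() /
  // sync_output_memory() before submitting the cached primitive again.
}
```
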
diff --git a/src/operator/mkl/mkldnn_pooling-inl.h b/src/operator/mkl/mkldnn_pooling-inl.h
new file mode 100644
index 0000000000..4739f5de50
--- /dev/null
+++ b/src/operator/mkl/mkldnn_pooling-inl.h
@@ -0,0 +1,355 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_pooling-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_POOLING_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_POOLING_INL_H_
+#include <vector>
+#include <string>
+#include <utility>
+
+namespace mxnet {
+namespace op {
+template<typename xpu, typename Dtype>
+class MKLDNNPoolingOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  std::string getName() {
+     std::string name = "MKLDNNPoolingOp";
+     return name;
+  }
+  explicit MKLDNNPoolingOp(PoolingParam p) : MKLDNNLayer<Dtype>()
+    , num_(0), channels_(0), width_(0), height_(0), width_out_(0), height_out_(0)
+    , kernel_w_(0), kernel_h_(0), stride_w_(0), stride_h_(0)
+    , pad_t_(0), pad_b_(0), pad_l_(0), pad_r_(0) {
+    this->param_ = p;
+    this->init_mkldnn_ = false;
+    switch (param_.pool_type) {
+    case pool_enum::kMaxPooling:
+      pooling_algorithm_ = pooling_max;
+      break;
+    case pool_enum::kAvgPooling:
+      pooling_algorithm_ = pooling_avg;
+      break;
+    default:
+      LOG(FATAL) << "Unknown pooling method.";
+    }
+  }
+  virtual ~MKLDNNPoolingOp() {
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, Dtype> &data,
+                  const mshadow::Tensor<xpu, 4, Dtype> &out) {
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+    global_pooling_ = param_.global_pool;
+    if (global_pooling_) {
+      kernel_h_ = height_;
+      kernel_w_ = width_;
+    } else {
+      kernel_h_ = param_.kernel[0];
+      kernel_w_ = param_.kernel[1];
+    }
+    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+    pad_t_ = pad_b_ = param_.pad[0];
+    pad_l_ = pad_r_ = param_.pad[1];
+
+    stride_h_ = param_.stride[0];
+    stride_w_ = param_.stride[1];
+
+    if (global_pooling_) {
+      CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+            << "With Global_pooling: true; only pad = 0 and stride = 1";
+    }
+    if (pad_t_ != 0 || pad_l_ != 0) {
+      CHECK(param_.pool_type == pool_enum::kAvgPooling
+        || param_.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+      CHECK_LT(pad_t_, kernel_h_);
+      CHECK_LT(pad_l_, kernel_w_);
+    }
+    height_out_ = out.shape_[2];
+    width_out_ = out.shape_[3];
+  }
+
+ public:
+  void InitPoolingFwd(const std::vector<TBlob> &in_data) {
+      int32_t n = this->num_;
+      int32_t c = this->channels_;
+      int32_t ih = this->height_;
+      int32_t iw = this->width_;
+      int32_t oh = this->height_out_;
+      int32_t ow = this->width_out_;
+
+      int32_t kh = this->kernel_h_;
+      int32_t kw = this->kernel_w_;
+
+      int32_t sh = this->stride_h_;
+      int32_t sw = this->stride_w_;
+
+      int32_t pt = this->pad_t_;
+      int32_t pb = this->pad_b_;
+      int32_t pl = this->pad_l_;
+      int32_t pr = this->pad_r_;
+
+     bool bottom_data_is_prv =
+       (const_cast<Dtype*>(mkl_prv_data<Dtype>(in_data[pool_enum::kData])) != NULL);
+     mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+     memory::data_type mpcsn = memory::data_type::f32;
+     memory::dims bottom_tz = { n, c, ih, iw };
+     memory::dims top_tz = { n, c, oh, ow };
+     memory::format mfmt_nchw = memory::format::nchw;
+
+     // ---- Initialize memory descriptors -------------
+     typedef typename memory::primitive_desc MemPD;
+
+     memory::format cmfmt = mfmt_nchw;
+     if (bottom_data_is_prv) {
+       std::shared_ptr<MKLDNNData<Dtype> > mem_descr
+         = get_mkldnn_prv_descriptor<Dtype>(in_data[pool_enum::kData].Mkl_mem_);
+       cmfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format);
+     }
+     std::shared_ptr<memory::desc> init_fwd_bottom_md(
+       new memory::desc({ bottom_tz }, mpcsn, cmfmt));
+     std::shared_ptr<memory::desc> init_fwd_top_md(new memory::desc({ top_tz }, mpcsn, cmfmt));
+     std::shared_ptr<MemPD> usr_bottom_data_mpd(new MemPD({ { bottom_tz }, mpcsn, mfmt_nchw },
+       cpu_engine));
+     std::shared_ptr<MemPD> usr_top_data_mpd(
+       new MemPD({ { top_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+
+     pooling_forward::desc poolingFwdInference_desc(prop_kind::forward_scoring,
+        pooling_algorithm_, *init_fwd_bottom_md, *init_fwd_top_md
+       , { sh, sw }, { kh, kw }, { pt, pl }, { pb, pr }, padding_kind::zero);
+     pooling_forward::desc poolingFwdTraining_desc(prop_kind::forward_training
+       , pooling_algorithm_, *init_fwd_bottom_md, *init_fwd_top_md
+       , { sh, sw }, { kh, kw }, { pt, pl }, { pb, pr }, padding_kind::zero);
+     poolingFwdInference_pd.reset(new pooling_forward::primitive_desc(
+       poolingFwdInference_desc, cpu_engine));
+     CHECK(poolingFwdInference_pd);
+     poolingFwdTraining_pd.reset(new pooling_forward::primitive_desc(
+       poolingFwdTraining_desc, cpu_engine));
+     CHECK(poolingFwdTraining_pd);
+
+     // ---- Initialize remaining memory descriptors -------------
+     std::shared_ptr<MemPD> prv_fwd_bottom_data_mpd;
+     std::shared_ptr<MemPD> prv_fwd_top_data_mpd;
+     if (bottom_data_is_prv) {
+       prv_fwd_bottom_data_mpd.reset(new MemPD(*init_fwd_bottom_md, cpu_engine));
+       prv_fwd_top_data_mpd.reset(new MemPD(*init_fwd_top_md, cpu_engine));
+     }
+
+     fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_mpd, prv_fwd_bottom_data_mpd));
+     fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+
+     fwd_top_data.reset(new MKLDNNData<Dtype>(usr_top_data_mpd, prv_fwd_top_data_mpd));
+     fwd_top_data->name = "fwd_top_data   @ " + this->getName();
+     // ---- Initialize pooling primitive descriptor -------------
+     if (pooling_algorithm_ != algorithm::pooling_avg) {
+       indices_pd.reset(
+         new memory::primitive_desc(poolingFwdTraining_pd->workspace_primitive_desc()));
+       indices_memory.reset(new memory(*indices_pd));
+     }
+  }
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+      using namespace mshadow;
+      using namespace mshadow::expr;
+      CHECK_EQ(in_data.size(), 1);
+      CHECK_EQ(out_data.size(), 1);
+      Stream<xpu> *s = ctx.get_stream<xpu>();
+      if (param_.kernel.ndim() >= 3) {
+        LOG(FATAL) << "Pooling kernels with ndim >= 3 are not implemented in MKLDNNPoolingOp";
+      }
+      Tensor<xpu, 4, Dtype> data = mkl_experimental_direct_get<xpu, 4, Dtype>(
+        in_data[pool_enum::kData], s);
+      Tensor<xpu, 4, Dtype> out = mkl_experimental_direct_get<xpu, 4, Dtype>(
+        out_data[pool_enum::kOut], s);
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+
+      if (poolingFwdInference_pd == NULL)
+        InitPoolingFwd(in_data);
+      // ---  init primitive and prv_memory descriptors ----------------------
+      fwd_input_primitive = fwd_bottom_data->get_converted_prv(data.dptr_, false,
+        in_data[pool_enum::kData]);
+      fwd_output_memory = fwd_top_data->create_output_memory(out.dptr_, out_data[pool_enum::kOut],
+        fwd_top_data);
+      if (ctx.is_train && pooling_algorithm_ != algorithm::pooling_avg) {
+        poolingFwd.reset(new pooling_forward(*poolingFwdTraining_pd, *fwd_input_primitive,
+          *fwd_output_memory, *indices_memory));
+      } else {
+        poolingFwd.reset(new pooling_forward(*poolingFwdInference_pd, *fwd_input_primitive,
+          *fwd_output_memory));
+      }
+    } else {
+      fwd_bottom_data->sync_converted_prv(data.dptr_, false,
+        in_data[pool_enum::kData]);
+      fwd_top_data->sync_output_memory(out_data[pool_enum::kOut],
+        fwd_top_data);
+    }
+    poolingFwd.submit();
+  }
+  void InitPoolingBwd(const std::vector<TBlob> &out_grad) {
+    int32_t n = this->num_;
+    int32_t c = this->channels_;
+    int32_t ih = this->height_;
+    int32_t iw = this->width_;
+    int32_t oh = this->height_out_;
+    int32_t ow = this->width_out_;
+
+    int32_t kh = this->kernel_h_;
+    int32_t kw = this->kernel_w_;
+
+    int32_t sh = this->stride_h_;
+    int32_t sw = this->stride_w_;
+
+    int32_t pt = this->pad_t_;
+    int32_t pb = this->pad_b_;
+
+    int32_t pr = this->pad_r_;
+    int32_t pl = this->pad_l_;
+
+    void * top_diff_data =
+      const_cast<Dtype*>(mkl_prv_data<Dtype>(out_grad[pool_enum::kOut]));
+    bool top_diff_is_prv = (top_diff_data != NULL);
+
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::dims bottom_tz = { n, c, ih, iw };
+    memory::dims top_tz = { n, c, oh, ow };
+    memory::format mfmt_nchw = memory::format::nchw;
+
+    // ---- Initialize memory descriptors -------------
+    typedef typename memory::primitive_desc MemPD;
+
+    memory::format bwd_cmfmt = mfmt_nchw;
+    if (top_diff_is_prv) {
+      std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(out_grad[pool_enum::kOut].Mkl_mem_);
+      bwd_cmfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format);
+    }
+
+    std::shared_ptr<memory::desc> init_bwd_bottom_md(
+      new memory::desc({ bottom_tz }, mpcsn, bwd_cmfmt));
+    std::shared_ptr<memory::desc> init_bwd_top_md(
+      new memory::desc({ top_tz }, mpcsn, bwd_cmfmt));
+    std::shared_ptr<MemPD> usr_bottom_data_mpd(
+      new MemPD({ { bottom_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    std::shared_ptr<MemPD> usr_top_data_mpd(
+      new MemPD({ { top_tz }, mpcsn, mfmt_nchw }, cpu_engine));
+    // ---- Initialize pooling primitive descriptor -------------
+    pooling_backward::desc poolingBwd_desc(this->pooling_algorithm_, *init_bwd_bottom_md,
+      *init_bwd_top_md
+      , { sh, sw }, { kh, kw }, { pt, pl }, { pb, pr }, padding_kind::zero);
+    poolingBwd_pd.reset(new pooling_backward::primitive_desc(poolingBwd_desc,
+      cpu_engine, *poolingFwdTraining_pd));
+    CHECK(poolingBwd_pd);
+    // ---- Initialize remaining memory descriptors -------------
+    std::shared_ptr<MemPD> prv_bwd_bottom_diff_mpd, prv_bwd_top_diff_mpd;
+    if (top_diff_is_prv) {
+      prv_bwd_bottom_diff_mpd.reset(new MemPD(*init_bwd_bottom_md, cpu_engine));
+      prv_bwd_top_diff_mpd.reset(new MemPD(*init_bwd_top_md, cpu_engine));
+    }
+    bwd_bottom_diff.reset(new MKLDNNData<Dtype>(usr_bottom_data_mpd, prv_bwd_bottom_diff_mpd));
+    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
+    bwd_top_diff.reset(new MKLDNNData<Dtype>(usr_top_data_mpd, prv_bwd_top_diff_mpd));
+    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    if (!req[0]) {
+      return;
+    }
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    if (param_.kernel.ndim() >= 3) {
+      LOG(FATAL) << "Pooling kernels with ndim >= 3 are not implemented in MKLDNNPoolingOp";
+    }
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, Dtype> grad = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      out_grad[pool_enum::kOut], s);
+    Tensor<xpu, 4, Dtype> input_grad = mkl_experimental_direct_get<xpu, 4, Dtype>(
+      in_grad[pool_enum::kData], s);
+    if (poolingBwd_pd == NULL) {
+      InitPoolingBwd(out_grad);
+      diff_dst_memory = bwd_top_diff->get_converted_prv(grad.dptr_, false, out_grad[pool_enum::kOut]);
+      diff_src_memory = bwd_bottom_diff->create_output_memory(input_grad.dptr_,
+        in_grad[pool_enum::kData], bwd_bottom_diff);
+      if (param_.pool_type != pool_enum::kAvgPooling) {
+        poolingBwd.reset(new pooling_backward(*poolingBwd_pd, *diff_dst_memory,
+          *indices_memory, *diff_src_memory));
+      } else {
+        poolingBwd.reset(new pooling_backward(*poolingBwd_pd, *diff_dst_memory,
+          *diff_src_memory));
+      }
+    } else {
+      bwd_top_diff->sync_converted_prv(grad.dptr_, false, out_grad[pool_enum::kOut]);
+      bwd_bottom_diff->sync_output_memory(
+        in_grad[pool_enum::kData], bwd_bottom_diff);
+    }
+    poolingBwd.submit();
+  }
+
+ private:
+  PoolingParam param_;
+  int32_t num_, channels_, width_, height_, width_out_, height_out_;
+  int32_t kernel_w_, kernel_h_, stride_w_, stride_h_;
+  int32_t  pad_t_, pad_b_, pad_l_, pad_r_;
+  bool global_pooling_;
+  std::shared_ptr<pooling_forward::primitive_desc> poolingFwdInference_pd;
+  std::shared_ptr<pooling_forward::primitive_desc> poolingFwdTraining_pd;
+  std::shared_ptr<pooling_backward::primitive_desc> poolingBwd_pd;
+
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data,
+    bwd_top_diff, bwd_bottom_diff;
+  std::shared_ptr<memory::primitive_desc> indices_pd;
+  std::shared_ptr<memory> indices_memory;
+  bool init_mkldnn_;
+  algorithm pooling_algorithm_;
+  MKLDNNPrimitive<Dtype> poolingFwd;
+  MKLDNNPrimitive<Dtype>  poolingBwd;
+  std::shared_ptr<memory> fwd_input_primitive, fwd_output_memory;
+  std::shared_ptr<memory> diff_dst_memory, diff_src_memory;
+};  // class MKLDNNPoolingOp
+}   // namespace op
+}   // namespace mxnet
+
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_POOLING_INL_H_
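
[Reviewer note, not part of the patch] InitPoolingFwd() above is easier to review next to the bare MKL-DNN v0.x calls it wraps. Below is a minimal max-pooling inference sketch on plain nchw data, with illustrative sizes and no padding, assuming the same argument order (strides, kernel, padding_l, padding_r):

``` cpp
#include <vector>
#include "mkldnn.hpp"

void max_pool_fwd_sketch() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  const int n = 1, c = 16, ih = 32, iw = 32;
  const int kh = 2, kw = 2, sh = 2, sw = 2;
  const int oh = ih / sh, ow = iw / sw;  // no padding in this sketch

  memory::dims src_tz = {n, c, ih, iw};
  memory::dims dst_tz = {n, c, oh, ow};
  memory::desc src_md(src_tz, memory::data_type::f32, memory::format::nchw);
  memory::desc dst_md(dst_tz, memory::data_type::f32, memory::format::nchw);

  std::vector<float> src_buf(n * c * ih * iw), dst_buf(n * c * oh * ow);
  memory src({src_md, eng}, src_buf.data());
  memory dst({dst_md, eng}, dst_buf.data());

  pooling_forward::desc fwd_desc(prop_kind::forward_scoring, algorithm::pooling_max,
      src_md, dst_md, {sh, sw}, {kh, kw}, {0, 0}, {0, 0}, padding_kind::zero);
  pooling_forward::primitive_desc fwd_pd(fwd_desc, eng);

  pooling_forward pool(fwd_pd, src, dst);  // no workspace needed for inference
  stream(stream::kind::eager).submit({pool}).wait();
}
```
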
diff --git a/src/operator/mkl/mkldnn_relu-inl.h b/src/operator/mkl/mkldnn_relu-inl.h
new file mode 100644
index 0000000000..501b9d6782
--- /dev/null
+++ b/src/operator/mkl/mkldnn_relu-inl.h
@@ -0,0 +1,301 @@
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_relu-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_
+#define MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_
+
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename Dtype>
+class MKLDNNReluOp : public Operator, public MKLDNNLayer<Dtype> {
+ public:
+  std::string getName() {
+    std::string name = "MKLDNNReluOp";
+    return name;
+  }
+  MKLDNNReluOp() : MKLDNNLayer<Dtype>()
+    , fwd_top_data(NULL), fwd_bottom_data(NULL), prv_mpd(NULL)
+    , num_(0), width_(0), height_(0), channels_(0) {
+    init_mkldnn_ = false;
+  }
+  ~MKLDNNReluOp() {
+  }
+
+ private:
+  void LayerSetup(const mshadow::Tensor<xpu, 4, Dtype> &data) {
+    this->width_ = data.shape_[3];
+    this->height_ = data.shape_[2];
+    this->channels_ = data.shape_[1];
+    this->num_ = data.shape_[0];
+  }
+  void InitReLUFwd(const std::vector<TBlob> &in_data) {
+    void * bottom_data = reinterpret_cast<void *>(mkl_prv_data<Dtype>(in_data[activation::kData]));
+    std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > bottom_prv_descriptor
+      = get_mkldnn_prv_descriptor<Dtype>(in_data[activation::kData]);
+    std::shared_ptr<memory::desc> bottom_data_md, top_data_md;
+    std::shared_ptr<memory::primitive_desc> usr_mpd(NULL);
+
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+    Dtype negative_slope = 0;
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+
+    if (bottom_data != NULL) {
+      bottom_data_md.reset(new memory::desc(bottom_prv_descriptor->prv_memory_pd()->desc()));
+      usr_mpd = bottom_prv_descriptor->usr_memory_pd();
+      prv_mpd = bottom_prv_descriptor->prv_memory_pd();
+    } else {
+      bottom_data_md.reset(new memory::desc({ { n, ic, ih, iw } }, mpcsn, memory::format::nchw));
+      usr_mpd.reset(new memory::primitive_desc(*bottom_data_md, cpu_engine));
+    }
+    top_data_md = bottom_data_md;
+
+    // ---- Initialize relu primitive descriptor -------------
+    relu_forward::desc fwd_inference_desc(prop_kind::forward_scoring,
+      *bottom_data_md, negative_slope);
+    fwd_inference_pd.reset(new relu_forward::primitive_desc(fwd_inference_desc, cpu_engine));
+    /* relu_forward::desc fwd_training_desc(prop_kind::forward_training, */
+    /*   *bottom_data_md, negative_slope); */
+    // relu_forward is being deprecated, use new eltwise_forward
+    eltwise_forward::desc fwd_training_desc(prop_kind::forward_training, eltwise_relu, *bottom_data_md, negative_slope);
+    fwd_training_pd.reset(new relu_forward::primitive_desc(fwd_training_desc, cpu_engine));
+    fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd));
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+    fwd_top_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd));
+    fwd_top_data->name = "fwd_top_data   @ " + this->getName();
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+        using namespace mshadow;
+        using namespace mshadow::expr;
+        CHECK_EQ(in_data.size(), 1);
+        CHECK_EQ(out_data.size(), 1);
+        Stream<xpu> *s = ctx.get_stream<xpu>();
+        Tensor<xpu, 4, Dtype> data;
+        Tensor<xpu, 4, Dtype> out;
+        if (in_data[activation::kData].ndim() == 2) {
+          Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
+            in_data[activation::kData].shape_[1], 1, 1);
+          data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+            in_data[activation::kData], dshape, s);
+          out = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+            out_data[activation::kOut], dshape, s);
+        } else if (in_data[activation::kData].ndim() == 3) {
+          Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
+            in_data[activation::kData].shape_[1],
+            in_data[activation::kData].shape_[2], 1);
+          data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+            in_data[activation::kData], dshape, s);
+          out = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+            out_data[activation::kOut], dshape, s);
+        } else {
+          data = mkl_experimental_direct_get<xpu, 4, Dtype>(in_data[activation::kData], s);
+          out = mkl_experimental_direct_get<xpu, 4, Dtype>(out_data[activation::kOut], s);
+        }
+
+    if (!init_mkldnn_) {
+      LayerSetup(data);
+      InitReLUFwd(in_data);
+      init_mkldnn_ = true;
+      in_place_ = (data.dptr_ == out.dptr_);
+      // ---- Initialize memory descriptors -------------
+
+      input_primitive = fwd_bottom_data->get_converted_prv(data.dptr_,
+        false, in_data[activation::kData]);
+      output_memory = fwd_top_data->create_output_memory(
+        out.dptr_, out_data[activation::kOut], fwd_top_data, in_place_);
+      if (ctx.is_train) {
+        reluFwd.reset(new relu_forward(*fwd_training_pd, *input_primitive, *output_memory));
+      } else {
+        reluFwd.reset(new relu_forward(*fwd_inference_pd, *input_primitive, *output_memory));
+      }
+    } else {
+      fwd_bottom_data->sync_converted_prv(data.dptr_,
+        false, in_data[activation::kData]);
+      fwd_top_data->sync_output_memory(
+        out_data[activation::kOut], fwd_top_data, in_place_);
+    }
+    reluFwd.submit();
+  }
+
+  void InitReLUBwd(const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data) {
+    int32_t n = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+    Dtype negative_slope = 0;
+    void * top_diff_data =
+      const_cast<Dtype*>(mkl_prv_data<Dtype>(out_grad[activation::kOut]));
+    bool top_diff_is_prv = (top_diff_data != NULL);
+    mkldnn::engine cpu_engine = CpuEngine::Instance().get_engine();
+    memory::data_type mpcsn = memory::data_type::f32;
+    // ---- Initialize memory descriptors -------------
+    std::shared_ptr<memory::desc> bottom_diff_md;
+    std::shared_ptr<memory::desc> top_diff_md;
+    std::shared_ptr<memory::desc> top_data_md;
+
+    std::shared_ptr<memory::primitive_desc> usr_diff_mpd;
+    std::shared_ptr<memory::primitive_desc> prv_diff_mpd;
+
+    std::shared_ptr<memory::desc> default_md;
+    default_md.reset(new memory::desc({ { n, ic, ih, iw } }, mpcsn, memory::format::nchw));
+    if (top_diff_is_prv) {
+      std::shared_ptr<MKLDNNMemoryDescriptor<Dtype> > mem_descr
+        = get_mkldnn_prv_descriptor<Dtype>(out_grad[activation::kOut]);
+      usr_diff_mpd = mem_descr->usr_memory_pd();
+      prv_diff_mpd = mem_descr->prv_memory_pd();
+    } else {
+      if (prv_mpd != NULL) prv_diff_mpd = prv_mpd;
+      usr_diff_mpd.reset(new memory::primitive_desc(*default_md, cpu_engine));
+    }
+    if (prv_diff_mpd != NULL)
+      top_diff_md.reset(new memory::desc(prv_diff_mpd->desc()));
+    else
+      top_diff_md.reset(new memory::desc(*default_md));
+    top_data_md = top_diff_md;
+    bottom_diff_md = top_diff_md;
+    /* relu_backward::desc reluBwd_desc(*top_diff_md, *top_data_md, negative_slope); */
+    // use eltwise instead of relu_backward
+    eltwise_backward::desc reluBwd_desc(eltwise_relu, *top_diff_md, *top_data_md, negative_slope);
+    bwd_pd.reset(new relu_backward::primitive_desc(reluBwd_desc, cpu_engine,
+      *fwd_training_pd));
+    bwd_top_diff.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_diff_mpd));
+    bwd_top_diff->name = "bwd_top_diff   @ " + this->getName();
+    bwd_bottom_diff.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_diff_mpd));
+    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + this->getName();
+    bwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_diff_mpd, prv_diff_mpd));
+    bwd_bottom_data->name = "bwd_bottom_data   @ " + this->getName();
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    if (!req[0]) {
+      return;
+    }
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK(in_data.size() == 1 && in_grad.size() == 1);
+    CHECK_EQ(req.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, Dtype> m_out_grad;
+    Tensor<xpu, 4, Dtype> m_in_grad;
+    Tensor<xpu, 4, Dtype> m_out_data;
+
+    if (out_grad[activation::kOut].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
+        out_grad[activation::kOut].shape_[1], 1, 1);
+      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_grad[activation::kOut], dshape, s);
+      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_data[activation::kOut], dshape, s);
+      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        in_grad[activation::kData], dshape, s);
+    } else if (out_grad[activation::kOut].ndim() == 3) {
+      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
+        out_grad[activation::kOut].shape_[1],
+        out_grad[activation::kOut].shape_[2], 1);
+      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_grad[activation::kOut], dshape, s);
+      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        out_data[activation::kOut], dshape, s);
+      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, Dtype>(
+        in_grad[activation::kData], dshape, s);
+    } else {
+      m_out_grad = mkl_experimental_direct_get<xpu, 4, Dtype>(out_grad[activation::kOut], s);
+      m_out_data = mkl_experimental_direct_get<xpu, 4, Dtype>(out_data[activation::kOut], s);
+      m_in_grad = mkl_experimental_direct_get<xpu, 4, Dtype>(in_grad[activation::kData], s);
+    }
+    in_place_b_ = (m_out_grad.dptr_ != m_in_grad.dptr_);
+    if (bwd_pd == nullptr) {
+      InitReLUBwd(out_grad, in_data);
+      // use the src memory from forward call
+      /* src_memory = bwd_bottom_data->get_converted_prv(m_out_data.dptr_, false, */
+      /*   out_data[activation::kOut]); */
+      diff_dst_memory = bwd_top_diff->get_converted_prv(m_out_grad.dptr_,
+        false, out_grad[activation::kOut]);
+      diff_src_memory = bwd_bottom_diff->create_output_memory(m_in_grad.dptr_,
+        in_grad[activation::kData], bwd_bottom_diff, in_place_b_);
+      reluBwd.reset(new relu_backward(*bwd_pd, *input_primitive, *diff_dst_memory,
+        *diff_src_memory));
+    } else {
+      // use the src memory from forward call
+      /* bwd_bottom_data->sync_converted_prv(false, */
+      /*   out_data[activation::kOut]); */
+      bwd_top_diff->sync_converted_prv(m_out_grad.dptr_,
+        false, out_grad[activation::kOut]);
+      bwd_bottom_diff->sync_output_memory(
+        in_grad[activation::kData], bwd_bottom_diff, in_place_b_);
+    }
+    reluBwd.submit();
+  }
+
+ private:
+  bool init_mkldnn_;
+  bool in_place_;
+  bool in_place_b_;
+
+  std::shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
+  std::shared_ptr<MKLDNNData<Dtype> > bwd_bottom_data, bwd_top_diff;
+  std::shared_ptr<MKLDNNData<Dtype> > bwd_bottom_diff;
+  std::shared_ptr<relu_forward::primitive_desc> fwd_inference_pd;
+  std::shared_ptr<relu_forward::primitive_desc> fwd_training_pd;
+  std::shared_ptr<relu_backward::primitive_desc> bwd_pd;
+  std::shared_ptr<memory::primitive_desc> prv_mpd;
+  int32_t num_, width_, height_, channels_;
+  std::shared_ptr<memory> input_primitive;
+  std::shared_ptr<memory> output_memory;
+  std::shared_ptr<memory> src_memory, diff_dst_memory, diff_src_memory;
+  MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
+};  // class MKLDNNReluOp
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKLDNN_RELU_INL_H_
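
[Reviewer note, not part of the patch] Since the file above already switches from the deprecated relu_forward descriptor to eltwise_forward, here is the corresponding standalone forward pass with the MKL-DNN v0.x eltwise API, on illustrative nchw float data:

``` cpp
#include <vector>
#include "mkldnn.hpp"

void relu_fwd_sketch() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims data_tz = {1, 16, 32, 32};
  memory::desc data_md(data_tz, memory::data_type::f32, memory::format::nchw);

  std::vector<float> src_buf(1 * 16 * 32 * 32), dst_buf(src_buf.size());
  memory src({data_md, eng}, src_buf.data());
  memory dst({data_md, eng}, dst_buf.data());

  const float negative_slope = 0.f;  // plain ReLU
  eltwise_forward::desc relu_desc(prop_kind::forward_training,
                                  algorithm::eltwise_relu, data_md, negative_slope);
  eltwise_forward::primitive_desc relu_pd(relu_desc, eng);

  eltwise_forward relu(relu_pd, src, dst);
  stream(stream::kind::eager).submit({relu}).wait();
}
```
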
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
index 98a3e076fa..3242dd0814 100644
--- a/src/operator/pooling.cc
+++ b/src/operator/pooling.cc
@@ -28,6 +28,12 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_pooling-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_MKLDNN == 1
+#include <mkl_memory.h>
+#include "./mkl/mkldnn_memory-inl.h"
+#include "./mkl/mkl_util-inl.h"
+#include "./mkl/mkldnn_pooling-inl.h"
+#endif  // MXNET_USE_MKLDNN
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_pooling-inl.h"
 #endif  // MXNET_USE_NNPACK
@@ -38,6 +44,20 @@ namespace op {
 template<>
 Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
   Operator *op = NULL;
+#if MXNET_USE_MKLDNN == 1
+    if (param.kernel.ndim() == 2
+      && (param.pool_type == pool_enum::kMaxPooling
+      || param.pool_type == pool_enum::kAvgPooling)) {
+      switch (dtype) {
+      case mshadow::kFloat32:
+        return new MKLDNNPoolingOp<cpu, float>(param);
+      default:
+        break;
+      }
+      if (EnableMkldnnWarnGenerated())
+        LOG(INFO) << "MKLDNNPoolingOp Skip MKL DNN optimization";
+    }
+#endif
 #if MXNET_USE_MKL2017 == 1
     if (param.kernel.ndim() == 2
       && ((param.pool_type == pool_enum::kMaxPooling)
@@ -45,8 +65,8 @@ Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
       switch (dtype) {
       case mshadow::kFloat32:
         return new MKLPoolingOp<cpu, float>(param);
-      case mshadow::kFloat64:
-        return new MKLPoolingOp<cpu, double>(param);
+      /*case mshadow::kFloat64:
+        return new MKLPoolingOp<cpu, double>(param);*/
       default:
         break;
       }
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index f812fe0745..e9658680eb 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -23,11 +23,22 @@
  */
 #include "./elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "../mkl/mkldnn_elemwise_sum-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_MKLDNN == 1
+MXNET_OPERATOR_REGISTER_BINARY(elemwise_add)
+.set_attr<FInferStorageType>("FInferStorageType",
+                             ElemwiseStorageType<2, 1, true, false, false>)
+.set_attr<FCompute>("FCompute<cpu>", MKLDNNElementWiseAddCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ElemwiseBinaryOp::ComputeEx<cpu, mshadow::op::plus>)
+#else
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, mshadow::op::plus)
+#endif
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add)
 .add_alias("_add").add_alias("_plus").add_alias("_Plus")
 .describe(R"code(Adds arguments element-wise.
diff --git a/src/operator/tensor/elemwise_sum.h b/src/operator/tensor/elemwise_sum.h
index 3d6d725111..0e5e1bacdd 100644
--- a/src/operator/tensor/elemwise_sum.h
+++ b/src/operator/tensor/elemwise_sum.h
@@ -32,6 +32,9 @@
 #include "../elemwise_op_common.h"
 #include "../mshadow_op.h"
 #include "../mxnet_op.h"
+#if MXNET_USE_MKLDNN == 1
+#include "mkldnn_elemwise_sum-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -107,9 +110,27 @@ void ElementWiseSumCompute(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   CHECK_EQ(outputs.size(), 1U);
+
+#if MXNET_USE_MKLDNN == 1
+  const auto& shape = inputs[0].shape_;
+  if (shape.ndim() == 4 && shape[0] > 0 && shape[1] > 0 && shape[2] > 0 &&
+      shape[3] > 0 &&
+      outputs[0].type_flag_ == mshadow::kFloat32) {
+    // The MKL-DNN kernel only supports 4-D float32 inputs, none of whose
+    // dimensions may be 0.
+    MKLDNNElementWiseSumCompute<xpu, float>(attrs, ctx, inputs, req, outputs);
+  } else {
+    // fallback to cpu implementation
+    MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      ElementWiseSumCompute_<xpu, DType>(attrs, ctx, inputs, req, outputs);
+    });
+  }
+#else
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
       ElementWiseSumCompute_<xpu, DType>(attrs, ctx, inputs, req, outputs);
   });
+#endif
 }
 
 template<typename xpu>
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 024e089832..001fefde6d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -64,7 +64,7 @@ def check_elementwise_sum_with_shape(shape, n):
 def test_elementwise_sum():
     np.random.seed(0)
     nrepeat = 2
-    maxdim = 4
+    maxdim = 5
     for repeat in range(nrepeat):
         for dim in range(1, maxdim):
             shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim))


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services