Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/25 20:47:53 UTC

[GitHub] aaronmarkham closed pull request #11180: [MXNET-503] Website landing page for MMS, PR II

URL: https://github.com/apache/incubator-mxnet/pull/11180
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/3rdparty/tvm b/3rdparty/tvm
index 8f80df0c461..6ab4da67834 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 8f80df0c46188cdf227764848d8aba0cba6e8273
+Subproject commit 6ab4da6783417d8afdeb6b0426b44959b2afc709
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e57c00b69e9..4fe900d4694 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ mxnet_option(USE_NCCL             "Use NVidia NCCL with CUDA" OFF)
 mxnet_option(USE_OPENCV           "Build with OpenCV support" ON)
 mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN            "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
-mxnet_option(USE_SSE              "Build with x86 SSE instruction support" ON)
+mxnet_option(USE_SSE              "Build with x86 SSE instruction support" ON IF NOT ARM)
 mxnet_option(USE_F16C             "Build with x86 F16C instruction support" ON) # autodetects support if ON
 mxnet_option(USE_LAPACK           "Build with lapack support" ON)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
@@ -321,14 +321,15 @@ endif()
 
 # ---[ OpenCV
 if(USE_OPENCV)
-  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
+  find_package(OpenCV COMPONENTS core highgui imgproc imgcodecs)
   if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
+    message(STATUS "OpenCV imgcodecs missing")
     find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
   endif()
   include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
   list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS})
   message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}")
-  message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
+  message(STATUS "OpenCV ${OpenCV_VERSION} found (${OpenCV_CONFIG_PATH})")
   add_definitions(-DMXNET_USE_OPENCV=1)
 else(USE_OPENCV)
   message(STATUS "OpenCV Disabled")
@@ -340,7 +341,11 @@ if(USE_OPENMP)
   find_package(OpenMP REQUIRED)
   # This should build on Windows, but there's some problem and I don't have a Windows box, so
   # could a Windows user please fix?
-  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt AND SYSTEM_ARCHITECTURE STREQUAL "x86_64" AND NOT MSVC)
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt
+     AND SYSTEM_ARCHITECTURE STREQUAL "x86_64"
+     AND NOT MSVC
+     AND NOT CMAKE_CROSSCOMPILING)
+
     # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
     set(OPENMP_STANDALONE_BUILD TRUE)
     set(LIBOMP_ENABLE_SHARED TRUE)
@@ -360,7 +365,7 @@ if(USE_OPENMP)
       set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     endif()
   endif()
-elseif(UNIX)
+elseif(UNIX AND NOT ANDROID)
   list(APPEND mxnet_LINKER_LIBS pthread)
 endif()
 
@@ -575,11 +580,6 @@ if(USE_PLUGIN_CAFFE)
   endif()
 endif()
 
-if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/nnvm/CMakeLists.txt")
-  set(nnvm_LINKER_LIBS nnvm)
-  list(APPEND mxnet_LINKER_LIBS ${nnvm_LINKER_LIBS})
-endif()
-
 if(NOT MSVC)
   # Only add c++11 flags and definitions after cuda compiling
   add_definitions(-DDMLC_USE_CXX11)
@@ -648,7 +648,7 @@ if(USE_PLUGINS_WARPCTC)
 endif()
 
 
-if(USE_OPENCV)
+if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2)
   add_executable(im2rec "tools/im2rec.cc")
   if(MSVC)
     target_link_libraries(im2rec mxnet)
@@ -659,9 +659,11 @@ if(USE_OPENCV)
     ${mxnet_LINKER_LIBS}
     ${OpenCV_LIBS}
     dmlc
-    ${nnvm_LINKER_LIBS}
     ${pslite_LINKER_LIBS}
     )
+else()
+    message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \
+    is required for im2rec; im2rec will not be available")
 endif()
 
 target_link_libraries(mxnet PUBLIC dmlc)
diff --git a/Jenkinsfile b/Jenkinsfile
index 28edda00959..56fbf3d74af 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -92,6 +92,20 @@ echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
 """
 }
 
+def collect_test_results_unix(original_file_name, new_file_name) {
+    echo 'Saving python test results for ' + new_file_name
+    // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage
+    sh 'cp ' + original_file_name + ' ' + new_file_name
+    archiveArtifacts artifacts: new_file_name
+}
+
+def collect_test_results_windows(original_file_name, new_file_name) {
+    echo 'Saving python test results for ' + new_file_name
+    // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage
+    bat 'xcopy ' + original_file_name + ' ' + new_file_name
+    archiveArtifacts artifacts: new_file_name
+} 
+
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
   def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
@@ -458,6 +472,16 @@ try {
           }
         }
       }
+    },
+    'Android / ARM64':{
+      node('mxnetlinux-cpu') {
+        ws('workspace/android64') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            docker_run('android_arm64', 'build_android_arm64', false)
+          }
+        }
+      }
     }
   } // End of stage('Build')
 
@@ -465,36 +489,55 @@ try {
     parallel 'Python2: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python2-cpu') {
-          init_git()
-          unpack_lib('cpu')
-          python2_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('cpu')
+            python2_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python3: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python3-cpu') {
-          init_git()
-          unpack_lib('cpu')
-          python3_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('cpu')
+            python3_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python2: GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python2-gpu') {
-          init_git()
-          unpack_lib('gpu', mx_lib)
-          python2_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('gpu', mx_lib)
+            python2_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml')
+          }
         }
       }
     },
     'Python3: GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python3-gpu') {
-          init_git()
-          unpack_lib('gpu', mx_lib)
-          python3_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('gpu', mx_lib)
+            python3_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml')
+          }
         }
       }
     },
@@ -502,9 +545,13 @@ try {
       node('mxnetlinux-gpu-p3') {
         ws('workspace/ut-python2-quantize-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
+            try {
+              init_git()
+              unpack_lib('gpu', mx_lib)
+              docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml')
+            }
           }
         }
       }
@@ -513,9 +560,13 @@ try {
       node('mxnetlinux-gpu-p3') {
         ws('workspace/ut-python3-quantize-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
+            try {
+              init_git()
+              unpack_lib('gpu', mx_lib)
+              docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml')
+            }
           }
         }
       }
@@ -523,36 +574,55 @@ try {
     'Python2: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python2-mkldnn-cpu') {
-          init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
-          python2_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+            python2_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python2: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python2-mkldnn-gpu') {
-          init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
-          python2_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+            python2_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml')
+          }
         }
       }
     },
     'Python3: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python3-mkldnn-cpu') {
-          init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
-          python3_ut_mkldnn('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+            python3_ut_mkldnn('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml')
+          }
         }
       }
     },
     'Python3: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python3-mkldnn-gpu') {
-          init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
-          python3_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+            python3_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml')
+          }
         }
       }
     },
@@ -560,9 +630,14 @@ try {
       node('mxnetlinux-cpu') {
         ws('workspace/build-centos7-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('centos7_cpu')
-            docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
+            try {
+              init_git()
+              unpack_lib('centos7_cpu')
+              docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
+            } finally {
+              collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml')
+              collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml')
+            }
           }
         }
       }
@@ -571,9 +646,13 @@ try {
       node('mxnetlinux-gpu') {
         ws('workspace/build-centos7-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('centos7_gpu')
-            docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
+            try {
+              init_git()
+              unpack_lib('centos7_gpu')
+              docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml')
+            }
           }
         }
       }
@@ -671,16 +750,21 @@ try {
       node('mxnetwindows-cpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-              7z x -y vc14_cpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py2
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-              C:\\mxnet\\test_cpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_cpu'
+              bat '''rmdir /s/q pkg_vc14_cpu
+                7z x -y vc14_cpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py2
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+                C:\\mxnet\\test_cpu.bat"""
+            } finally {
+              // We are unable to modify test_cpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python2_cpu.xml')
+            }
           }
         }
       }
@@ -689,16 +773,21 @@ try {
       node('mxnetwindows-cpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-              7z x -y vc14_cpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py3
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-              C:\\mxnet\\test_cpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_cpu'
+              bat '''rmdir /s/q pkg_vc14_cpu
+                7z x -y vc14_cpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+                C:\\mxnet\\test_cpu.bat"""
+            } finally {
+              // We are unable to modify test_cpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_cpu.xml')
+            }
           }
         }
       }
@@ -707,16 +796,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-            init_git_win()
-            unstash 'vc14_gpu'
-            bat '''rmdir /s/q pkg_vc14_gpu
-              7z x -y vc14_gpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py2
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-              C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu'
+              bat '''rmdir /s/q pkg_vc14_gpu
+                7z x -y vc14_gpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py2
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python2_gpu.xml')
+            }
           }
         }
       }
@@ -725,16 +819,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-          init_git_win()
-          unstash 'vc14_gpu'
-          bat '''rmdir /s/q pkg_vc14_gpu
-            7z x -y vc14_gpu.7z'''
-          bat """xcopy C:\\mxnet\\data data /E /I /Y
-            xcopy C:\\mxnet\\model model /E /I /Y
-            call activate py3
-            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-            C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu'
+              bat '''rmdir /s/q pkg_vc14_gpu
+                7z x -y vc14_gpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_gpu.xml')
+            }
           }
         }
       }
@@ -743,16 +842,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-          init_git_win()
-          unstash 'vc14_gpu_mkldnn'
-          bat '''rmdir /s/q pkg_vc14_gpu_mkldnn
-            7z x -y vc14_gpu_mkldnn.7z'''
-          bat """xcopy C:\\mxnet\\data data /E /I /Y
-            xcopy C:\\mxnet\\model model /E /I /Y
-            call activate py3
-            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python
-            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc
-            C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu_mkldnn'
+              bat '''rmdir /s/q pkg_vc14_gpu_mkldnn
+                7z x -y vc14_gpu_mkldnn.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_mkldnn_gpu.xml')
+            }
           }
         }
       }
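
The Jenkinsfile changes above all follow one pattern: run the stage's tests inside try, and archive the nose XML report in finally so results survive a failing run. A minimal Python sketch of that collect-then-archive pattern; run_tests and archive are hypothetical stand-ins for the real CI steps:

    import shutil

    def run_stage(run_tests, archive, original_name, new_name):
        # Archive the report under a stage-specific name, pass or fail.
        try:
            run_tests()
        finally:
            # cp rather than mv, mirroring collect_test_results_unix above,
            # so parallel stages never collide on the generic file name.
            shutil.copy(original_name, new_name)
            archive(new_name)
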
diff --git a/Makefile b/Makefile
index ff4446ab80c..67aaa7cf707 100644
--- a/Makefile
+++ b/Makefile
@@ -155,6 +155,7 @@ endif
 # issue covered with this
 #   -  for Ubuntu 14.04 or lower, lapack is not automatically installed with openblas
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
+#   -  for rhel7.2, installing the package `lapack-static` via yum will dismiss this warning.
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
 ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
@@ -222,7 +223,10 @@ ifeq ($(USE_GPERFTOOLS), 1)
 		ifeq (,$(FIND_LIBFILE))
 			FIND_LIBFILE=$(wildcard /usr/local/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
 			ifeq (,$(FIND_LIBFILE))
-				USE_GPERFTOOLS=0
+				FIND_LIBFILE=$(wildcard /usr/lib64/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
+				ifeq (,$(FIND_LIBFILE))
+					USE_GPERFTOOLS=0
+				endif
 			endif
 		endif
 	endif
@@ -245,7 +249,10 @@ ifneq ($(USE_GPERFTOOLS), 1)
 				ifeq (,$(FIND_LIBFILE))
 					FIND_LIBFILE=$(wildcard /usr/lib/x86_64-linux-gnu/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
 					ifeq (,$(FIND_LIBFILE))
-						USE_JEMALLOC=0
+						FIND_LIBFILE=$(wildcard /usr/lib64/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
+						ifeq (,$(FIND_LIBFILE))
+							USE_JEMALLOC=0
+						endif
 					endif
 				endif
 			endif
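
The Makefile hunks above extend the gperftools/jemalloc probes with one more candidate directory, /usr/lib64, before silently disabling the feature. A Python sketch of the same search-then-disable logic; the directory list mirrors the Makefile and should be treated as an assumption:

    import glob

    def find_lib(name, ext="so"):
        # Probe candidate locations in order; /usr/lib64 is the new addition.
        for d in ("/usr/lib", "/usr/local/lib",
                  "/usr/lib/x86_64-linux-gnu", "/usr/lib64"):
            hits = glob.glob("{}/lib{}.{}".format(d, name, ext))
            if hits:
                return hits[0]
        return None  # caller then sets USE_GPERFTOOLS / USE_JEMALLOC to 0

    print(find_lib("tcmalloc") or "gperftools disabled")
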
diff --git a/ci/build.py b/ci/build.py
index 35f8b478abf..680f4574fd8 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -40,7 +40,7 @@
 from subprocess import call, check_call
 from typing import *
 
-CCACHE_MAXSIZE = '10G'
+CCACHE_MAXSIZE = '500G'
 
 def get_platforms(path: Optional[str] = "docker"):
     """Get a list of architectures given our dockerfiles"""
@@ -124,9 +124,12 @@ def buildir() -> str:
 def default_ccache_dir() -> str:
     # Share ccache across containers
     if 'CCACHE_DIR' in os.environ:
-        ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
-        os.makedirs(ccache_dir, exist_ok=True)
-        return ccache_dir
+        try:
+            ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
+            os.makedirs(ccache_dir, exist_ok=True)
+            return ccache_dir
+        except PermissionError:
+            logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
     # In osx tmpdir is not mountable by default
     if platform.system() == 'Darwin':
         ccache_dir = "/tmp/_mxnet_ccache"
@@ -157,7 +160,9 @@ def container_run(platform: str,
                '-v', "{}:/work/ccache".format(local_ccache_dir),
                '-u', '{}:{}'.format(os.getuid(), os.getgid()),
                '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
+               '-e', 'CCACHE_TEMPDIR=/tmp/ccache',  # temp dir should be local and not shared
                '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the container as /work/ccache is mounted
+               '-e', "CCACHE_LOGFILE=/tmp/ccache.log",  # a container-scoped log, useful for ccache verification.
                tag]
     runlist.extend(command)
     cmd = ' '.join(runlist)
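
Put together, the default_ccache_dir() change above makes the CCACHE_DIR override best-effort: honor it when the directory can be created, otherwise fall back to a local temp dir. A self-contained sketch; the Darwin special case comes from the file itself, while the final tempfile fallback is an assumption standing in for the rest of the real function:

    import logging, os, platform, tempfile

    def default_ccache_dir() -> str:
        if 'CCACHE_DIR' in os.environ:
            ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
            try:
                os.makedirs(ccache_dir, exist_ok=True)
                return ccache_dir
            except PermissionError:
                logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
        # In osx the default tmpdir is not mountable by default
        if platform.system() == 'Darwin':
            return "/tmp/_mxnet_ccache"
        return os.path.join(tempfile.gettempdir(), "_mxnet_ccache")
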
diff --git a/ci/docker/Dockerfile.build.android_arm64 b/ci/docker/Dockerfile.build.android_arm64
index 4bd4fd30922..4dbc49d163a 100755
--- a/ci/docker/Dockerfile.build.android_arm64
+++ b/ci/docker/Dockerfile.build.android_arm64
@@ -31,10 +31,7 @@ MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 # extract ccache binary into latest context
 COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
 
-# The cross-compiling emulator
 RUN apt-get update && apt-get install -y \
-  qemu-user \
-  qemu-user-static \
   unzip
 
 ENV CROSS_TRIPLE=aarch64-linux-android
@@ -61,6 +58,7 @@ LABEL org.label-schema.build-date=$BUILD_DATE \
       org.label-schema.schema-version="1.0"
 
 ENV ARCH aarch64
+ENV ANDROID_NDK_REVISION 15c
 
 ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang
 ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++
diff --git a/ci/docker/Dockerfile.build.arm64 b/ci/docker/Dockerfile.build.arm64
index a1f752bbf64..2a950078b42 100755
--- a/ci/docker/Dockerfile.build.arm64
+++ b/ci/docker/Dockerfile.build.arm64
@@ -33,17 +33,19 @@ FROM mxnetci/dockcross-linux-arm64:05082018
 COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
 
 ENV ARCH aarch64
-ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
-WORKDIR /work/mxnet
+WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index c073992406f..93be54025f4 100755
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -36,11 +36,14 @@ ENV TARGET ARMV6
 
 WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    make PREFIX=$CROSS_ROOT install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 627486c0537..95e05e7cb51 100755
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -30,13 +30,20 @@ FROM dockcross/linux-armv7
 # extract ccache binary into latest context
 COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
 
-ENV ARCH armv71
-ENV CC /usr/bin/arm-linux-gnueabihf-gcc
-ENV CXX /usr/bin/arm-linux-gnueabihf-g++
+ENV ARCH armv7l
+ENV HOSTCC gcc
+ENV TARGET ARMV7
 
-RUN apt-get update && \
-    apt-get install -y libopenblas-dev:armhf && \
-    rm -rf /var/lib/apt/lists/*
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
-WORKDIR /work/build
+WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 8a8bb97aa15..098225e5af5 100755
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -37,17 +37,16 @@ FROM mxnetci/dockcross-linux-arm64:05082018
 COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
 
 ENV ARCH aarch64
-ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
 
 ENV OpenBLAS_HOME=${CROSS_ROOT}
 ENV OpenBLAS_DIR=${CROSS_ROOT}
diff --git a/ci/docker/install/android_arm64_ndk.sh b/ci/docker/install/android_arm64_ndk.sh
index 23d9ea2c0e9..04023fbdebe 100755
--- a/ci/docker/install/android_arm64_ndk.sh
+++ b/ci/docker/install/android_arm64_ndk.sh
@@ -22,7 +22,8 @@
 
 set -ex
 pushd .
-export ANDROID_NDK_REVISION=15c
+# This environment variable comes from the Dockerfile
+echo "Downloading Android NDK rev ${ANDROID_NDK_REVISION}"
 curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \
 unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \
 cd android-ndk-r${ANDROID_NDK_REVISION} && \
@@ -32,4 +33,4 @@ cd android-ndk-r${ANDROID_NDK_REVISION} && \
     --api 21 \
     --install-dir=${CROSS_ROOT} && \
 
-popd
\ No newline at end of file
+popd
diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/install/arm_openblas.sh
new file mode 100755
index 00000000000..fa2e5cae9cb
--- /dev/null
+++ b/ci/docker/install/arm_openblas.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git
+
+cd OpenBLAS
+make -j$(nproc)
+PREFIX=${CROSS_ROOT} make install
+
+cd ..
+
+rm -rf OpenBLAS
diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/ubuntu_arm.sh
new file mode 100755
index 00000000000..becb012bd18
--- /dev/null
+++ b/ci/docker/install/ubuntu_arm.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+apt update
+apt install -y \
+    unzip
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 36e23879705..6cefeea9fbc 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -31,7 +31,6 @@ clean_repo() {
     git submodule update --init --recursive
 }
 
-# wrap compiler calls with ccache
 build_ccache_wrappers() {
     set -ex
 
@@ -63,34 +62,66 @@ build_ccache_wrappers() {
     export CXX=`pwd`/cxx
 }
 
-# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
-# function here with the same suffix:
+build_wheel() {
 
-build_jetson() {
     set -ex
     pushd .
 
-    build_ccache_wrappers
+    PYTHON_DIR=${1:-/work/mxnet/python}
+    BUILD_DIR=${2:-/work/build}
 
-    cp -f make/crosscompile.jetson.mk ./config.mk
+    # build
 
-    make -j$(nproc)
+    export MXNET_LIBRARY_PATH=${BUILD_DIR}/libmxnet.so
 
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
+    cd ${PYTHON_DIR}
     python setup.py bdist_wheel --universal
 
+    # repackage
+
     # Fix pathing issues in the wheel.  We need to move libmxnet.so from the data folder to the
     # mxnet folder, then repackage the wheel.
     WHEEL=`readlink -f dist/*.whl`
     TMPDIR=`mktemp -d`
-    unzip -d $TMPDIR $WHEEL
-    rm $WHEEL
-    cd $TMPDIR
+    unzip -d ${TMPDIR} ${WHEEL}
+    rm ${WHEEL}
+    cd ${TMPDIR}
     mv *.data/data/mxnet/libmxnet.so mxnet
-    zip -r $WHEEL .
-    cp $WHEEL /work/build
-    rm -rf $TMPDIR
+    zip -r ${WHEEL} .
+    cp ${WHEEL} ${BUILD_DIR}
+    rm -rf ${TMPDIR}
+
+    popd
+}
+
+# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
+# function here with the same suffix:
+
+build_jetson() {
+    set -ex
+    pushd .
+
+    cp make/crosscompile.jetson.mk ./config.mk
+    make -j$(nproc)
+
+    build_wheel /work/mxnet/python /work/mxnet/lib
+    popd
+}
+
+report_ccache_usage() {
+    set -ex
+    pushd .
+
+    # Show global ccache summary at the end of each run.
+    ccache -s
+    if [ -e $CCACHE_LOGFILE ]
+    then
+        # Display local ccache log, excluding some overly verbose output.
+        cat $CCACHE_LOGFILE | grep -v "Config:" | grep -v "stats.lock"
+    else
+        echo "No ccache log found."
+    fi
+
     popd
 }
 
@@ -107,7 +138,7 @@ build_armv6() {
     # We do not need OpenMP, since most armv6 systems have only 1 core
 
     cmake \
-        -DCMAKE_TOOLCHAIN_FILE=$CROSS_ROOT/Toolchain.cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF \
@@ -120,11 +151,10 @@ build_armv6() {
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
-    ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+
+    ninja -v
+    report_ccache_usage
+    build_wheel
     popd
 }
 
@@ -132,21 +162,31 @@ build_armv7() {
     set -ex
     pushd .
     cd /work/build
+
+    # Lapack functionality will be included and statically linked to openblas.
+    # But USE_LAPACK needs to be set to OFF, otherwise the main CMakeLists.txt
+    # file tries to add -llapack. Lapack functionality, though, requires -lgfortran
+    # to be linked additionally.
+
     cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
+        -DCMAKE_CROSSCOMPILING=ON \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DUSE_CUDA=OFF\
-        -DUSE_OPENCV=OFF\
-        -DUSE_OPENMP=OFF\
-        -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
-        -DUSE_MKL_IF_AVAILABLE=OFF\
+        -DUSE_CUDA=OFF \
+        -DUSE_OPENCV=OFF \
+        -DUSE_OPENMP=ON \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_LAPACK=OFF \
+        -DBUILD_CPP_EXAMPLES=OFF \
+        -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
-    ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+
+    ninja -v
+    report_ccache_usage
+    build_wheel
     popd
 }
 
@@ -164,7 +204,8 @@ build_amzn_linux_cpu() {
         -DUSE_LAPACK=OFF\
         -DUSE_DIST_KVSTORE=ON\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
 }
 
@@ -173,20 +214,22 @@ build_arm64() {
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
+        -DCMAKE_BUILD_TYPE=Release\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
     cp dist/*.whl /work/build
 }
 
-build_android_arm64() {
+build_android_armv7() {
     set -ex
     cd /work/build
     cmake \
@@ -194,6 +237,7 @@ build_android_arm64() {
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF\
         -DUSE_SSE=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_LAPACK=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
@@ -201,18 +245,44 @@ build_android_arm64() {
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
     cp dist/*.whl /work/build
 }
 
+build_android_arm64() {
+    set -ex
+    cd /work/build
+# There are other ways for CMake to cross-compile for Android, such as setting the
+# variables below. But right now it doesn't work as expected; we need to find the best
+# strategy for building with CMake on Android.
+#        -DCMAKE_ANDROID_NDK=${CROSS_ROOT} \
+#        -DCMAKE_SYSTEM_VERSION=${ANDROID_NDK_REVISION} \
+#        -DCMAKE_SYSTEM_NAME=Android \
+#
+    cmake\
+        -DANDROID=ON \
+        -DUSE_CUDA=OFF\
+        -DUSE_SSE=OFF\
+        -DUSE_LAPACK=OFF\
+        -DUSE_OPENCV=OFF\
+        -DUSE_OPENMP=OFF\
+        -DUSE_SIGNAL_HANDLER=ON\
+        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
+        -DUSE_MKL_IF_AVAILABLE=OFF\
+        -G Ninja /work/mxnet
+    ninja -v
+}
+
 build_centos7_cpu() {
     set -ex
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
+
     make \
         DEV=1 \
         USE_LAPACK=1 \
@@ -220,6 +290,8 @@ build_centos7_cpu() {
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_centos7_mkldnn() {
@@ -227,6 +299,7 @@ build_centos7_mkldnn() {
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
+
     make \
         DEV=1 \
         USE_LAPACK=1 \
@@ -234,6 +307,8 @@ build_centos7_mkldnn() {
         USE_MKLDNN=1 \
         USE_BLAS=openblas \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_centos7_gpu() {
@@ -267,6 +342,8 @@ build_ubuntu_cpu_openblas() {
         USE_BLAS=openblas             \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang39() {
@@ -283,6 +360,8 @@ build_ubuntu_cpu_clang39() {
         USE_OPENMP=0                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang50() {
@@ -299,6 +378,8 @@ build_ubuntu_cpu_clang50() {
         USE_OPENMP=1                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang39_mkldnn() {
@@ -315,6 +396,8 @@ build_ubuntu_cpu_clang39_mkldnn() {
         USE_MKLDNN=1                  \
         USE_OPENMP=0                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang50_mkldnn() {
@@ -331,17 +414,23 @@ build_ubuntu_cpu_clang50_mkldnn() {
         USE_MKLDNN=1                  \
         USE_OPENMP=1                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_mkldnn() {
     set -ex
+
     build_ccache_wrappers
+
     make  \
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_gpu() {
@@ -350,7 +439,9 @@ build_ubuntu_gpu() {
 
 build_ubuntu_gpu_mkldnn() {
     set -ex
+
     build_ccache_wrappers
+
     make  \
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
@@ -360,6 +451,8 @@ build_ubuntu_gpu_mkldnn() {
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1                   \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_gpu_cuda91_cudnn7() {
@@ -406,6 +499,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    report_ccache_usage
     # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
     cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
     mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
@@ -427,6 +521,7 @@ build_ubuntu_gpu_cmake() {
         /work/mxnet
 
     ninja -v
+    report_ccache_usage
 }
 
 
@@ -448,9 +543,9 @@ unittest_ubuntu_python2_cpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/unittest
-    nosetests-2.7 --verbose tests/python/train
-    nosetests-2.7 --verbose tests/python/quantization
+    nosetests-2.7 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-2.7 --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
+    nosetests-2.7 --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu() {
@@ -460,20 +555,19 @@ unittest_ubuntu_python3_cpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/unittest
-    nosetests-3.4 --verbose tests/python/quantization
+    nosetests-3.4 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-3.4 --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu_mkldnn() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/unittest
-    nosetests-3.4 --verbose tests/python/quantization
-    nosetests-3.4 --verbose tests/python/mkl
+    nosetests-3.4 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-3.4 --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl
 }
 
 unittest_ubuntu_python2_gpu() {
@@ -483,7 +577,7 @@ unittest_ubuntu_python2_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/gpu
+    nosetests-2.7 --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 tutorialtest_ubuntu_python3_gpu() {
@@ -494,7 +588,8 @@ tutorialtest_ubuntu_python3_gpu() {
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python3
-    cd /work/mxnet/tests/tutorials && nosetests-3.4 test_tutorials.py --nologcapture
+    cd /work/mxnet/tests/tutorials
+    nosetests-3.4 --with-xunit --xunit-file nosetests_tutorials.xml test_tutorials.py --nologcapture
 }
 
 tutorialtest_ubuntu_python2_gpu() {
@@ -505,7 +600,8 @@ tutorialtest_ubuntu_python2_gpu() {
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python2
-    cd /work/mxnet/tests/tutorials && nosetests-3.4 test_tutorials.py --nologcapture
+    cd /work/mxnet/tests/tutorials
+    nosetests-3.4 --with-xunit --xunit-file nosetests_tutorials.xml test_tutorials.py --nologcapture
 }
 
 unittest_ubuntu_python3_gpu() {
@@ -515,7 +611,7 @@ unittest_ubuntu_python3_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/gpu
+    nosetests-3.4 --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 # quantization gpu currently only runs on P3 instances
@@ -527,7 +623,7 @@ unittest_ubuntu_python2_quantization_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/quantization_gpu
+    nosetests-2.7 --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
 }
 
 # quantization gpu currently only runs on P3 instances
@@ -539,7 +635,7 @@ unittest_ubuntu_python3_quantization_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/quantization_gpu
+    nosetests-3.4 --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
 }
 
 unittest_ubuntu_cpu_scala() {
@@ -585,14 +681,14 @@ unittest_ubuntu_gpu_R() {
 unittest_centos7_cpu() {
     set -ex
     cd /work/mxnet
-    python3.6 -m "nose" --with-timer --verbose tests/python/unittest
-    python3.6 -m "nose" --with-timer --verbose tests/python/train
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
 }
 
 unittest_centos7_gpu() {
     set -ex
     cd /work/mxnet
-    python3.6 -m "nose" --with-timer --verbose tests/python/gpu
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 integrationtest_ubuntu_cpu_onnx() {
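
The new build_wheel() helper above factors out a repackaging quirk: setup.py leaves libmxnet.so inside the wheel's .data payload, so the script unzips the wheel, moves the library into the mxnet package, and zips it back up. The same step sketched with Python's standard zipfile module; the paths mirror the shell version and are assumptions about the wheel layout:

    import os, shutil, tempfile, zipfile

    def repackage_wheel(wheel_path):
        tmp = tempfile.mkdtemp()
        with zipfile.ZipFile(wheel_path) as zf:
            zf.extractall(tmp)
        # Move libmxnet.so from the *.data payload into the mxnet package,
        # which is assumed to already exist in the extracted tree.
        data_dir = next(d for d in os.listdir(tmp) if d.endswith('.data'))
        shutil.move(os.path.join(tmp, data_dir, 'data', 'mxnet', 'libmxnet.so'),
                    os.path.join(tmp, 'mxnet', 'libmxnet.so'))
        with zipfile.ZipFile(wheel_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(tmp):
                for name in files:
                    full = os.path.join(root, name)
                    zf.write(full, os.path.relpath(full, tmp))
        shutil.rmtree(tmp)
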
diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py
index fa8833fb002..3f471db5e7a 100644
--- a/ci/test_docker_cache.py
+++ b/ci/test_docker_cache.py
@@ -28,6 +28,7 @@
 import logging
 import subprocess
 import sys
+from unittest.mock import MagicMock
 
 sys.path.append(os.path.dirname(__file__))
 import docker_cache
@@ -87,6 +88,8 @@ def setUp(self):
         base = os.path.split(os.path.realpath(__file__))[0]
         os.chdir(base)
 
+        docker_cache._login_dockerhub = MagicMock()  # Override login
+
         # Stop in case previous execution was dirty
         try:
             self._stop_local_docker_registry()
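
The one-line MagicMock assignment above stubs out the Docker Hub login so the cache tests never touch the network. The same pattern in isolation; the deploy namespace below is a hypothetical stand-in for the docker_cache module:

    import types
    from unittest.mock import MagicMock

    deploy = types.SimpleNamespace(_login_dockerhub=lambda: print("real login"))
    deploy._login_dockerhub = MagicMock()  # override the side effect under test
    deploy._login_dockerhub()              # now a recorded no-op
    assert deploy._login_dockerhub.called
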
diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py
index 0c000d9955f..8facde16840 100644
--- a/cpp-package/scripts/OpWrapperGenerator.py
+++ b/cpp-package/scripts/OpWrapperGenerator.py
@@ -77,6 +77,7 @@ def GetConvertEnumVariableToString(self, variable=''):
 
 class Arg:
     typeDict = {'boolean':'bool',\
+        'boolean or None':'dmlc::optional<bool>',\
         'Shape(tuple)':'Shape',\
         'Symbol':'Symbol',\
         'NDArray':'Symbol',\
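
typeDict in OpWrapperGenerator.py maps the type strings found in operator documentation to the C++ types used by the generated cpp-package wrappers; the added entry covers arguments documented as optional booleans. A two-line illustration of the lookup:

    typeDict = {'boolean': 'bool',
                'boolean or None': 'dmlc::optional<bool>'}
    print(typeDict['boolean or None'])  # -> dmlc::optional<bool>
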
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index d2d44b07848..a1e40fe6c30 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -1,5 +1,5 @@
 var versionSelect   = defaultVersion = 'v1.2.0';
-var deviceSelect    = 'Linux';
+var platformSelect    = 'Linux';
 var languageSelect  = 'Python';
 var processorSelect = 'CPU';
 var environSelect   = 'Pip';
@@ -16,10 +16,10 @@ $(document).ready(function () {
         $('li a:contains(' + versionSelect + ')').parent().siblings().removeClass('active');
         $('li a:contains(' + versionSelect + ')').parent().addClass('active');
         $('.current-version').html( versionSelect + ' <span class="caret"></span></button>' );
-        if (urlParams.get('device'))
-            deviceSelect = urlParams.get('device');
-        $('button:contains(' + deviceSelect + ')').siblings().removeClass('active');
-        $('button:contains(' + deviceSelect + ')').addClass('active');
+        if (urlParams.get('platform'))
+            platformSelect = urlParams.get('platform');
+        $('button:contains(' + platformSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + platformSelect + ')').addClass('active');
         if (urlParams.get('language'))
             languageSelect = urlParams.get('language');
         $('button:contains(' + languageSelect + ')').siblings().removeClass('active');
@@ -35,9 +35,9 @@ $(document).ready(function () {
         showContent();
         if (window.location.href.includes("/install/index.html")) {
             if (versionSelect.includes(defaultVersion)) {
-                history.pushState(null, null, '/install/index.html?device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, '/install/index.html?platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             } else {
-                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             }
         } 
     }
@@ -71,8 +71,8 @@ $(document).ready(function () {
                   history.pushState(null, null, '/install/index.html' + window.location.search.replace( 'version', 'prev' ));
               }
         }
-        else if ($(this).hasClass("Devices")) {
-            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('device'), $(this).text() ));
+        else if ($(this).hasClass("platforms")) {
+            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('platform'), $(this).text() ));
         }
         else if ($(this).hasClass("languages")) {
             history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('language'), $(this).text() ));
diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 3028342abd5..d72582cc1c7 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -102,11 +102,17 @@
        must come *after* these tags. #}
     {{ metatags }}
     {%- block htmltitle %}
-    {%- if pagename != 'index' and 'no title' not in title%}
+    <meta property="og:title"              content="{{ title }}" />
+    <meta property="og:image"              content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" />
+    <meta property="og:image:secure_url"              content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" />
+    {%- if pagename != 'index' and 'no title' not in title %}
+    <meta property="og:description"        content="{{ title }}" />
     <title>{{ title|striptags|e }}{{ titlesuffix }}</title>
     {%- elif pagename == 'index' %}
+    <meta property="og:description"        content="A flexible and efficient library for deep learning." />
     <title>MXNet: A Scalable Deep Learning Framework</title>
     {%- else %}
+    <meta property="og:description"        content="A flexible and efficient library for deep learning." />
     <title>{{ pagename.split('/')[0]|capitalize }}{{ titlesuffix }}</title>
     {%- endif %}
     {%- endblock %}
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index f29301dec7a..12a898aadc2 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -14,7 +14,7 @@ export MXNET_GPU_WORKER_NTHREADS=3
   - Values: Int ```(default=2)```
   - The maximum number of threads to use on each GPU. This parameter is used to parallelize the computation within a single GPU card.
 * MXNET_GPU_COPY_NTHREADS
-  - Values: Int ```(default=1)```
+  - Values: Int ```(default=2)```
   - The maximum number of concurrent threads that do the memory copy job on each GPU.
 * MXNET_CPU_WORKER_NTHREADS
   - Values: Int ```(default=1)```
diff --git a/docs/install/index.md b/docs/install/index.md
index 4b966b62067..a35cf941501 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -20,11 +20,11 @@ Indicate your preferred configuration. Then, follow the customized commands to i
 <!-- START - OS Menu -->
 
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active Devices">Linux</button>
-  <button type="button" class="btn btn-default opt Devices">MacOS</button>
-  <button type="button" class="btn btn-default opt Devices">Windows</button>
-  <button type="button" class="btn btn-default opt Devices">Cloud</button>
-  <button type="button" class="btn btn-default opt Devices">Devices</button>
+  <button type="button" class="btn btn-default opt active platforms">Linux</button>
+  <button type="button" class="btn btn-default opt platforms">MacOS</button>
+  <button type="button" class="btn btn-default opt platforms">Windows</button>
+  <button type="button" class="btn btn-default opt platforms">Cloud</button>
+  <button type="button" class="btn btn-default opt platforms">Devices</button>
 </div>
 
 <!-- START - Language Menu -->
@@ -513,12 +513,23 @@ $ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
 
 <div class="v1-2-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
 
 ```bash
-$ pip install mxnet-cu90
+nvcc --version
 ```
 
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+```bash
+$ pip install mxnet-cu92
+```
+
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -537,12 +548,23 @@ $ pip install mxnet-cu90mkl
 
 <div class="v1-1-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.1
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
 
 ```bash
-$ pip install mxnet-cu90==1.1.0
+nvcc --version
 ```
 
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+```bash
+$ pip install mxnet-cu91==1.1.0
+```
+
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -551,11 +573,13 @@ pip install graphviz
 
 **Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
 ```bash
-$ pip install mxnet-cu90mkl==1.1.0
+$ pip install mxnet-cu91mkl==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 
@@ -621,10 +645,10 @@ $ pip install mxnet-cu90mkl==0.12.0
 
 <div class="v0-11-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 8.0
 
 ```bash
-$ pip install mxnet-cu90==0.11.0
+$ pip install mxnet-cu80==0.11.0
 ```
 
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -635,9 +659,9 @@ pip install graphviz
 
 **Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
 ```bash
-$ pip install mxnet-cu90mkl==0.11.0
+$ pip install mxnet-cu80mkl==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
@@ -687,23 +711,45 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 
 <div class="v1-2-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.2:
 
 ```bash
-(mxnet)$ pip install mxnet-cu90
+(mxnet)$ pip install mxnet-cu92
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-2-0-->
 
 
 <div class="v1-1-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.1:
 
 ```bash
-(mxnet)$ pip install mxnet-cu90==1.1.0
+(mxnet)$ pip install mxnet-cu91==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 
@@ -714,6 +760,7 @@ Install *MXNet* with GPU support using CUDA 9.0.
 ```bash
 (mxnet)$ pip install mxnet-cu90==1.0.0
 ```
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
 
 </div> <!-- End of v1-0-0-->
 
@@ -726,33 +773,40 @@ Install *MXNet* with GPU support using CUDA 9.0.
 (mxnet)$ pip install mxnet-cu90==0.12.1
 ```
 
-For *MXNet* 0.12.0 with GPU support using CUDA 9.0.
-
-```bash
-(mxnet)$ pip install mxnet-cu90==0.12.0
-``` 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
 
 </div> <!-- End of v0-12-1-->
 
 
 <div class="v0-11-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+Install *MXNet* with GPU support using CUDA 8.0.
 
 ```bash
-(mxnet)$ pip install mxnet-cu90==0.11.0
+(mxnet)$ pip install mxnet-cu80==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.2.
 
 ```bash
-(mxnet)$ pip install mxnet-cu90 --pre
+(mxnet)$ pip install mxnet-cu92 --pre
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
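+Note that pip's `--pre` flag opts in to pre-release packages, which is how nightly builds from the master branch are published.
+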
 </div> <!-- End of master-->
 
 **Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -1572,22 +1626,44 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 
 <div class="v1-2-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90
+$ pip install mxnet-cu92
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-2-0-->
 
 <div class="v1-1-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.1.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90==1.1.0
+$ pip install mxnet-cu91==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 <div class="v1-0-0">
@@ -1618,22 +1694,33 @@ $ pip install mxnet-cu90==0.12.0
 
 <div class="v0-11-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 8.0.
 
 ```bash
-$ pip install mxnet-cu90==0.11.0
+$ pip install mxnet-cu80==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA installation or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90 --pre
+$ pip install mxnet-cu92 --pre
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of master-->
 
 Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for status on CUDA 9.1 support.
@@ -1644,7 +1731,7 @@ Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for stat
 
 We provide both options to build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), and [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/).
 
-**Option 1** 
+**Option 1**
 
 To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies:
 
@@ -1678,7 +1765,7 @@ git clone https://github.com/apache/incubator-mxnet.git --recursive
 "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11
 ```
 
-5. Create a build dir using the following command and go to the directory, for example: 
+5. Create a build dir using the following command and go to the directory, for example:
 
 ```r
 mkdir C:\build
@@ -1699,11 +1786,11 @@ NOTE: make sure the DCUDNN_INCLUDE and DCUDNN_LIBRARY pointing to the “include
 msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
 ```
 
-**Option 2** 
+**Option 2**
 
 To build and install MXNet yourself using [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/), you need the following dependencies. Install the required dependencies:
 
-1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition.
+1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it; the free Community edition is sufficient. At least Update 3 of Microsoft Visual Studio 2015 is required to build MXNet from source. Upgrade via its ```Tools -> Extensions and Updates... | Product Updates``` menu.
 2. Download and install [CMake](https://cmake.org/) if it is not already installed.
 3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
 4. Unzip the OpenCV package.
@@ -1711,10 +1798,12 @@ To build and install MXNet yourself using [Microsoft Visual Studio 2015](https:/
 6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](http://sourceforge.net/projects/openblas/files/v0.2.14/).
 7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```.
 8. Download and install [CUDA](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64) and [cuDNN](https://developer.nvidia.com/cudnn). To get access to the download link, register as an NVIDIA community user.
+9. Set the environment variable ```CUDACXX``` to point to the CUDA compiler (for example, ```C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1\bin\nvcc.exe```).
+10. Set the environment variable ```CUDNN_ROOT``` to point to the ```cuDNN``` directory that contains the ```include```, ```lib```, and ```bin``` directories (for example, ```C:\Downloads\cudnn-9.1-windows7-x64-v7\cuda```).
 
 After you have installed all of the required dependencies, build the MXNet source code:
 
-1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet).
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet) (make sure you also download the third-party submodules, e.g. ```git clone --recurse-submodules```).
 2. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build```.
 3. In Visual Studio, open the solution file,```.sln```, and compile it.
 These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
@@ -1778,7 +1867,7 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 <p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
 <br/>
 </div> <!-- End of cpu gpu -->
-</div> <!-- End of C++>
+</div> <!-- End of C++ -->
 </div> <!-- End of Windows -->
 
 
@@ -2402,4 +2491,3 @@ Will be available soon.
 # Source Download
 
 <a href="download.html">Download</a> your required version of MXNet.
-
diff --git a/docs/tutorials/gluon/hybrid.md b/docs/tutorials/gluon/hybrid.md
index 3554a15fa3b..f9f2c112f53 100644
--- a/docs/tutorials/gluon/hybrid.md
+++ b/docs/tutorials/gluon/hybrid.md
@@ -87,7 +87,7 @@ net(x)
 Hybrid execution can be activated by simply calling `.hybridize()` on the top
 level layer. The first forward call after activation will try to build a
 computation graph from `hybrid_forward` and cache it. On subsequent forward
-calls the cached graph instead of `hybrid_forward` will be invoked:
+calls the cached graph, instead of `hybrid_forward`, will be invoked:
 
 ```python
 net.hybridize()
@@ -105,23 +105,35 @@ Hybridize will speed up execution and save memory. If the top level layer is
 not a `HybridBlock`, you can still call `.hybridize()` on it and Gluon will try
 to hybridize its children layers instead.
 
+`hybridize` also accepts several options for performance tuning. For example, you
+can do
+
+```python
+net.hybridize(static_alloc=True)
+# or
+net.hybridize(static_alloc=True, static_shape=True)
+```
+
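+Here, `static_alloc=True` pre-allocates memory for the cached graph, and
+`static_shape=True` additionally assumes that input shapes will not change,
+which allows further optimization (a brief summary; the API manual linked
+below has the authoritative description).
+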
+Please refer to the [API manual](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html?highlight=hybridize#mxnet.gluon.Block.hybridize)
+for details.
+
 ## Serializing trained model for deployment
 
-Models implemented as `HybridBlock` can be easily serialized for deployment
-using other language front-ends like C, C++ and Scala. To this end, we simply
-forward the model with symbolic variables instead of NDArrays and save the
-output Symbol(s):
+Models implemented as `HybridBlock` can be easily serialized. The serialized
+model can be loaded back later or used for deployment
+with other language front-ends like C, C++ and Scala. To this end, we simply
+use `export` and `SymbolBlock.imports`:
 
 ```python
-x = mx.sym.var('data')
-y = net(x)
-print(y)
-y.save('model.json')
-net.save_params('model.params')
+net.export('model', epoch=1)
 ```
 
-If your network outputs more than one value, you can use `mx.sym.Group` to
-combine them into a grouped Symbol and then save. The saved json and params
-files can then be loaded with C, C++ and Scala interface for prediction.
+Two files `model-symbol.json` and `model-0001.params` are saved on disk.
+You can use other language bindings to load them. You can also load them back
+into Gluon with `SymbolBlock`:
+
+```python
+net2 = gluon.SymbolBlock.imports('model-symbol.json', ['data'], 'model-0001.params')
+```
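+
+The reloaded `net2` can be used like any other Gluon block. For example (a
+minimal sketch; the input shape below is a placeholder and must match what
+the network expects):
+
+```python
+import mxnet as mx
+
+out = net2(mx.nd.ones((1, 512)))
+print(out)
+```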
 
 <!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/gluon/naming.md b/docs/tutorials/gluon/naming.md
index 37b63fa08a9..3606a03dcbd 100644
--- a/docs/tutorials/gluon/naming.md
+++ b/docs/tutorials/gluon/naming.md
@@ -203,12 +203,12 @@ except Exception as e:
     Parameter 'model1_dense0_weight' is missing in file 'model.params', which contains parameters: 'model0_mydense_weight', 'model0_dense1_bias', 'model0_dense1_weight', 'model0_dense0_weight', 'model0_dense0_bias', 'model0_mydense_bias'. Please make sure source and target networks have the same prefix.
 
 
-To solve this problem, we use `save_params`/`load_params` instead of `collect_params` and `save`/`load`. `save_params` uses model structure, instead of parameter name, to match parameters.
+To solve this problem, we use `save_parameters`/`load_parameters` instead of `collect_params` and `save`/`load`. `save_parameters` uses model structure, instead of parameter name, to match parameters.
 
 
 ```python
-model0.save_params('model.params')
-model1.load_params('model.params')
+model0.save_parameters('model.params')
+model1.load_parameters('model.params')
 print(mx.nd.load('model.params').keys())
 ```
 
diff --git a/docs/tutorials/gluon/save_load_params.md b/docs/tutorials/gluon/save_load_params.md
index cd876808a86..f5f48125cc1 100644
--- a/docs/tutorials/gluon/save_load_params.md
+++ b/docs/tutorials/gluon/save_load_params.md
@@ -10,7 +10,7 @@ Parameters of any Gluon model can be saved using the `save_params` and `load_par
 
 **2. Save/load model parameters AND architecture**
 
-The Model architecture of `Hybrid` models stays static and don't change during execution. Therefore both model parameters AND architecture can be saved and loaded using `export`, `load_checkpoint` and `load` methods.
+The model architecture of `Hybrid` models stays static and doesn't change during execution. Therefore, both the model parameters AND the architecture can be saved and loaded using the `export` and `imports` methods.
 
 Let's look at the above methods in more detail. Let's start by importing the modules we'll need.
 
@@ -61,7 +61,7 @@ def build_lenet(net):
         net.add(gluon.nn.Dense(512, activation="relu"))
         # Second fully connected layer with as many neurons as the number of classes
         net.add(gluon.nn.Dense(num_outputs))
-        
+
         return net
 
 # Train a given model using MNIST data
@@ -240,18 +240,10 @@ One of the main reasons to serialize model architecture into a JSON file is to l
 
 ### From Python
 
-Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and used inside Python frontend using `mx.model.load_checkpoint` and `gluon.nn.SymbolBlock`. To demonstrate that, let's load the network we serialized above.
+Serialized Hybrid networks (saved as .json and .params files) can be loaded and used inside the Python frontend using `gluon.nn.SymbolBlock`. To demonstrate that, let's load the network we serialized above.
 
 ```python
-# Load the network architecture and parameters
-sym = mx.sym.load('lenet-symbol.json')
-# Create a Gluon Block using the loaded network architecture.
-# 'inputs' parameter specifies the name of the symbol in the computation graph
-# that should be treated as input. 'data' is the default name used for input when
-# a model architecture is saved to a file.
-deserialized_net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('data'))
-# Load the parameters
-deserialized_net.collect_params().load('lenet-0001.params', ctx=ctx)
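+# 'data' is the default name given to the network input when a model
+# architecture is saved to a file; the .params file supplies the trained weights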
+deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params")
 ```
 
 `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works.
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index a970c0a52ef..e095a83419b 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -38,7 +38,7 @@ Select API:&nbsp;
     * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
 * Practitioner Guides
     * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
-    * [Checkpointing and Model Serialization (a.k.a. saving and loading)](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/> ([Alternative](/tutorials/gluon/save_load_params.html))
+    * [Checkpointing and Model Serialization (a.k.a. saving and loading)](/tutorials/gluon/save_load_params.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/> ([Alternative](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html))
     * [Inference using an ONNX model](/tutorials/onnx/inference_on_onnx_model.html)
     * [Fine-tuning an ONNX model on Gluon](/tutorials/onnx/fine_tuning_gluon.html)
     * [Visualizing Decisions of Convolutional Neural Networks](/tutorials/vision/cnn_visualization.html)
diff --git a/example/README.md b/example/README.md
index 0dc6138c2ef..dad3773d0d8 100644
--- a/example/README.md
+++ b/example/README.md
@@ -22,10 +22,12 @@ This page contains a curated list of awesome MXNet examples, tutorials and blogs
 
 If you want to contribute to this list and the examples, please open a new pull request.
 
+
 ### Examples
 
 Example applications or scripts should be submitted in this `example` folder.
 
+
 ### Tutorials
 
 If you have a tutorial idea for the website, download the [Jupyter notebook tutorial template](https://github.com/dmlc/mxnet/tree/master/example/MXNetTutorialTemplate.ipynb).
@@ -218,8 +220,11 @@ If your tutorial depends on specific packages, simply add them to this provision
 
 * Dmitrii Tsybulevskii, 1st place of the [Yelp Restaurant Photo Classification](https://www.kaggle.com/c/yelp-restaurant-photo-classification). Link to [the Kaggle interview](http://blog.kaggle.com/2016/04/28/yelp-restaurant-photo-classification-winners-interview-1st-place-dmitrii-tsybulevskii/).
 
-## <a name="tools-with-mxnet"></a>Tools with MXnet
+## <a name="tools-with-mxnet"></a>Tools with MXNet
+* [Keras-MXNet](https://github.com/awslabs/keras-apache-mxnet) - Keras 2 with an MXNet Backend
+* [MinPy](https://github.com/dmlc/minpy) - Pure NumPy practice with third-party operator integration and MXNet as a backend for GPU computing
+* [MXBoard](https://github.com/awslabs/mxboard) - Model visualizations using TensorBoard
+* [MXNet Model Server](model-server/mms.md) - A flexible and easy-to-use tool for serving deep learning models
+* [MXNet-face](https://github.com/tornadomeet/mxnet-face) - Using MXNet for face-related algorithms by [tornadomeet](https://github.com/tornadomeet), where a single model of only 20MB achieves 97.13% (+/-0.88%) accuracy on LFW
+* [ONNX-MXNet](https://mxnet.incubator.apache.org/api/python/contrib/onnx.html) - Implements the ONNX model format support within MXNet
 * [TensorFuse](https://github.com/dementrock/tensorfuse) - Common interface for Theano, CGT, TensorFlow, and mxnet (experimental) by [dementrock](https://github.com/dementrock)
-* [MXnet-face](https://github.com/tornadomeet/mxnet-face) - Using mxnet for face-related algorithm by [tornadomeet](https://github.com/tornadomeet) where the single model get 97.13%+-0.88% accuracy on LFW, and with only 20MB size.
-* [MinPy](https://github.com/dmlc/minpy) - Pure numpy practice with third party operator Integration and MXnet as backend for GPU computing
-* [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) - a flexible and easy to use tool for serving Deep Learning models
diff --git a/example/gluon/dcgan.py b/example/gluon/dcgan.py
index 3233f430eea..8ac9c522cf5 100644
--- a/example/gluon/dcgan.py
+++ b/example/gluon/dcgan.py
@@ -229,8 +229,8 @@ def transformer(data, label):
     logging.info('time: %f' % (time.time() - tic))
 
     if check_point:
-        netG.save_params(os.path.join(outf,'generator_epoch_%d.params' %epoch))
-        netD.save_params(os.path.join(outf,'discriminator_epoch_%d.params' % epoch))
+        netG.save_parameters(os.path.join(outf,'generator_epoch_%d.params' %epoch))
+        netD.save_parameters(os.path.join(outf,'discriminator_epoch_%d.params' % epoch))
 
-netG.save_params(os.path.join(outf, 'generator.params'))
-netD.save_params(os.path.join(outf, 'discriminator.params'))
+netG.save_parameters(os.path.join(outf, 'generator.params'))
+netD.save_parameters(os.path.join(outf, 'discriminator.params'))
diff --git a/example/gluon/embedding_learning/train.py b/example/gluon/embedding_learning/train.py
index 46f76b55614..b8a5bf2716c 100644
--- a/example/gluon/embedding_learning/train.py
+++ b/example/gluon/embedding_learning/train.py
@@ -246,7 +246,7 @@ def train(epochs, ctx):
         if val_accs[0] > best_val:
             best_val = val_accs[0]
             logging.info('Saving %s.' % opt.save_model_prefix)
-            net.save_params('%s.params' % opt.save_model_prefix)
+            net.save_parameters('%s.params' % opt.save_model_prefix)
     return best_val
 
 
diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py
index 6e2f1d6a78d..b21e943f17f 100644
--- a/example/gluon/image_classification.py
+++ b/example/gluon/image_classification.py
@@ -122,7 +122,7 @@ def get_model(model, ctx, opt):
 
     net = models.get_model(model, **kwargs)
     if opt.resume:
-        net.load_params(opt.resume)
+        net.load_parameters(opt.resume)
     elif not opt.use_pretrained:
         if model in ['alexnet']:
             net.initialize(mx.init.Normal())
@@ -176,12 +176,12 @@ def update_learning_rate(lr, trainer, epoch, ratio, steps):
 def save_checkpoint(epoch, top1, best_acc):
     if opt.save_frequency and (epoch + 1) % opt.save_frequency == 0:
         fname = os.path.join(opt.prefix, '%s_%d_acc_%.4f.params' % (opt.model, epoch, top1))
-        net.save_params(fname)
+        net.save_parameters(fname)
         logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
     if top1 > best_acc[0]:
         best_acc[0] = top1
         fname = os.path.join(opt.prefix, '%s_best.params' % (opt.model))
-        net.save_params(fname)
+        net.save_parameters(fname)
         logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
 
 def train(opt, ctx):
@@ -267,7 +267,7 @@ def main():
                 optimizer = 'sgd',
                 optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
                 initializer = mx.init.Xavier(magnitude=2))
-        mod.save_params('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
+        mod.save_parameters('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
     else:
         if opt.mode == 'hybrid':
             net.hybridize()
diff --git a/example/gluon/mnist.py b/example/gluon/mnist.py
index 198d7ca5ab2..6aea3abc504 100644
--- a/example/gluon/mnist.py
+++ b/example/gluon/mnist.py
@@ -117,7 +117,7 @@ def train(epochs, ctx):
         name, val_acc = test(ctx)
         print('[Epoch %d] Validation: %s=%f'%(epoch, name, val_acc))
 
-    net.save_params('mnist.params')
+    net.save_parameters('mnist.params')
 
 
 if __name__ == '__main__':
diff --git a/example/gluon/style_transfer/main.py b/example/gluon/style_transfer/main.py
index cab8211bc9c..dde992ae700 100644
--- a/example/gluon/style_transfer/main.py
+++ b/example/gluon/style_transfer/main.py
@@ -55,7 +55,7 @@ def train(args):
     style_model.initialize(init=mx.initializer.MSRAPrelu(), ctx=ctx)
     if args.resume is not None:
         print('Resuming, initializing using weight from {}.'.format(args.resume))
-        style_model.load_params(args.resume, ctx=ctx)
+        style_model.load_parameters(args.resume, ctx=ctx)
     print('style_model:',style_model)
     # optimizer and loss
     trainer = gluon.Trainer(style_model.collect_params(), 'adam',
@@ -121,14 +121,14 @@ def train(args):
                     str(count) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
                     args.content_weight) + "_" + str(args.style_weight) + ".params"
                 save_model_path = os.path.join(args.save_model_dir, save_model_filename)
-                style_model.save_params(save_model_path)
+                style_model.save_parameters(save_model_path)
                 print("\nCheckpoint, trained model saved at", save_model_path)
 
     # save model
     save_model_filename = "Final_epoch_" + str(args.epochs) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
         args.content_weight) + "_" + str(args.style_weight) + ".params"
     save_model_path = os.path.join(args.save_model_dir, save_model_filename)
-    style_model.save_params(save_model_path)
+    style_model.save_parameters(save_model_path)
     print("\nDone, trained model saved at", save_model_path)
 
 
@@ -143,7 +143,7 @@ def evaluate(args):
     style_image = utils.preprocess_batch(style_image)
     # model
     style_model = net.Net(ngf=args.ngf)
-    style_model.load_params(args.model, ctx=ctx)
+    style_model.load_parameters(args.model, ctx=ctx)
     # forward
     style_model.set_target(style_image)
     output = style_model(content_image)
diff --git a/example/gluon/super_resolution.py b/example/gluon/super_resolution.py
index 38c3bec8949..0f2f21f3c0a 100644
--- a/example/gluon/super_resolution.py
+++ b/example/gluon/super_resolution.py
@@ -168,13 +168,13 @@ def train(epoch, ctx):
         print('training mse at epoch %d: %s=%f'%(i, name, acc))
         test(ctx)
 
-    net.save_params('superres.params')
+    net.save_parameters('superres.params')
 
 def resolve(ctx):
     from PIL import Image
     if isinstance(ctx, list):
         ctx = [ctx[0]]
-    net.load_params('superres.params', ctx=ctx)
+    net.load_parameters('superres.params', ctx=ctx)
     img = Image.open(opt.resolve_img).convert('YCbCr')
     y, cb, cr = img.split()
     data = mx.nd.expand_dims(mx.nd.expand_dims(mx.nd.array(y), axis=0), axis=0)
diff --git a/example/gluon/tree_lstm/main.py b/example/gluon/tree_lstm/main.py
index d2fe464638a..ad5d59f7a47 100644
--- a/example/gluon/tree_lstm/main.py
+++ b/example/gluon/tree_lstm/main.py
@@ -138,7 +138,7 @@ def test(ctx, data_iter, best, mode='validation', num_iter=-1):
         if test_r >= best:
             best = test_r
             logging.info('New optimum found: {}. Checkpointing.'.format(best))
-            net.save_params('childsum_tree_lstm_{}.params'.format(num_iter))
+            net.save_parameters('childsum_tree_lstm_{}.params'.format(num_iter))
             test(ctx, test_iter, -1, 'test')
         return best
 
diff --git a/example/gluon/word_language_model/train.py b/example/gluon/word_language_model/train.py
index 9e152636bb0..7f0a916b79b 100644
--- a/example/gluon/word_language_model/train.py
+++ b/example/gluon/word_language_model/train.py
@@ -185,7 +185,7 @@ def train():
         if val_L < best_val:
             best_val = val_L
             test_L = eval(test_data)
-            model.save_params(args.save)
+            model.save_parameters(args.save)
             print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
         else:
             args.lr = args.lr*0.25
@@ -193,6 +193,6 @@ def train():
 
 if __name__ == '__main__':
     train()
-    model.load_params(args.save, context)
+    model.load_parameters(args.save, context)
     test_L = eval(test_data)
     print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
diff --git a/example/image-classification/predict-cpp/CMakeLists.txt b/example/image-classification/predict-cpp/CMakeLists.txt
index a2f52b9df3a..c42d1917b76 100644
--- a/example/image-classification/predict-cpp/CMakeLists.txt
+++ b/example/image-classification/predict-cpp/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Check OpenCV
-if(NOT USE_OPENCV OR NOT OpenCV_FOUND)
+if(NOT USE_OPENCV OR NOT OpenCV_FOUND OR OpenCV_VERSION_MAJOR LESS 3)
   message(WARNING "\
-OpenCV should be enabled and found to build image classification example, skipping...")
+OpenCV version >= 3 should be enabled and found to build the image classification example; skipping...")
   return()
 endif()
 
diff --git a/example/model-server/mms.md b/example/model-server/mms.md
new file mode 100644
index 00000000000..821996380c7
--- /dev/null
+++ b/example/model-server/mms.md
@@ -0,0 +1,115 @@
+# Model Server for Apache MXNet (incubating)
+
+[Model Server for Apache MXNet (incubating)](https://github.com/awslabs/mxnet-model-server), otherwise known as MXNet Model Server (MMS), is an open source project aimed at providing a simple yet scalable solution for model inference. It is a set of command line tools for packaging model archives and serving them. The tools are written in Python, and have been extended to support containers for easy deployment and scaling. MMS also supports basic logging and advanced metrics with Amazon CloudWatch integration.
+
+
+## Multi-Framework Model Support with ONNX
+
+MMS supports both *symbolic* MXNet and *imperative* Gluon models. While the name implies that MMS is just for MXNet, it is in fact much more flexible, as it can support models in the [ONNX](https://onnx.ai) format. This means that models created and trained in PyTorch, Caffe2, or other ONNX-supporting frameworks can be served with MMS.
+
+To find out more about MXNet's support for ONNX models and using ONNX with MMS, refer to the following resources:
+
+* [MXNet-ONNX Docs](https://mxnet.incubator.apache.org/api/python/contrib/onnx.html)
+* [Export an ONNX Model to Serve with MMS](https://github.com/awslabs/mxnet-model-server/blob/master/docs/export_from_onnx.md)
+
+
+## Getting Started
+
+To install MMS with ONNX support, make sure you have Python installed. Then, for Ubuntu, run:
+
+```bash
+sudo apt-get install protobuf-compiler libprotoc-dev
+pip install mxnet-model-server
+```
+
+Or, for Mac, run:
+
+```bash
+conda install -c conda-forge protobuf
+pip install mxnet-model-server
+```
+
+
+## Serving a Model
+
+To serve a model, you must first create or download a model archive. Visit the [model zoo](https://github.com/awslabs/mxnet-model-server/blob/master/docs/model_zoo.md) to browse the models. MMS options can be explored as follows:
+
+```bash
+mxnet-model-server --help
+```
+
+Here is an easy example of serving an object classification model. You can supply any model archive URI; the archive will be downloaded first and then served:
+
+```bash
+mxnet-model-server \
+  --models squeezenet=https://s3.amazonaws.com/model-server/models/squeezenet_v1.1/squeezenet_v1.1.model
+```
+
+
+### Test Inference on a Model
+
+Assuming you have run the previous `mxnet-model-server` command to start serving the object classification model, you can now upload an image to its `predict` REST API endpoint. The following will download a picture of a kitten, then upload it to the prediction endpoint.
+
+```bash
+curl -O https://s3.amazonaws.com/model-server/inputs/kitten.jpg
+curl -X POST http://127.0.0.1:8080/squeezenet/predict -F "data=@kitten.jpg"
+```
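+
+Equivalently, you can call the endpoint from Python (a minimal sketch, assuming the third-party `requests` package is installed):
+
+```python
+import requests
+
+# POST the downloaded image as multipart form data, mirroring the curl command above
+with open('kitten.jpg', 'rb') as f:
+    response = requests.post('http://127.0.0.1:8080/squeezenet/predict',
+                              files={'data': f})
+print(response.json())
+```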
+
+The predict endpoint will return a prediction response in JSON. It will look something like the following result:
+
+```
+{
+  "prediction": [
+    [
+      {
+        "class": "n02124075 Egyptian cat",
+        "probability": 0.9408261179924011
+      },
+...
+```
+
+For more examples of serving models, visit the following resources:
+
+* [Quickstart: Model Serving](https://github.com/awslabs/mxnet-model-server/blob/master/README.md#serve-a-model)
+* [Running the Model Server](https://github.com/awslabs/mxnet-model-server/blob/master/docs/server.md)
+
+
+## Create a Model Archive
+
+Creating a model archive involves gathering the required model artifacts and then using the `mxnet-model-export` command-line interface. The process for creating archives is likely to evolve as the project adds features, so we recommend that you review the following resources to get the latest instructions:
+
+* [Quickstart: Export a Model](https://github.com/awslabs/mxnet-model-server/blob/master/README.md#export-a-model)
+* [Model Artifacts](https://github.com/awslabs/mxnet-model-server/blob/master/docs/export_model_file_tour.md)
+* [Loading and Serving Gluon Models](https://github.com/awslabs/mxnet-model-server/tree/master/examples/gluon_alexnet)
+* [Creating an MMS Model Archive from an ONNX Model](https://github.com/awslabs/mxnet-model-server/blob/master/docs/export_from_onnx.md)
+* [Create an ONNX model (that will run with MMS) from PyTorch](https://github.com/onnx/onnx-mxnet/blob/master/README.md#quick-start)
+
+
+## Using Containers
+
+Using Docker or other container services with MMS is a great way to scale your inference applications. You can use Docker to pull the latest version:
+
+```bash
+docker pull awsdeeplearningteam/mms_gpu
+```
+
+It is recommended that you review the following resources for more information:
+
+* [MMS Docker Hub](https://hub.docker.com/u/awsdeeplearningteam/)
+* [Using MMS with Docker Quickstart](https://github.com/awslabs/mxnet-model-server/blob/master/docker/README.md)
+* [MMS on Fargate](https://github.com/awslabs/mxnet-model-server/blob/master/docs/mms_on_fargate.md)
+* [Optimized Container Configurations for MMS](https://github.com/awslabs/mxnet-model-server/blob/master/docs/optimized_config.md)
+* [Orchestrating, monitoring, and scaling with MMS, Amazon Elastic Container Service, AWS Fargate, and Amazon CloudWatch](https://aws.amazon.com/blogs/machine-learning/apache-mxnet-model-server-adds-optimized-container-images-for-model-serving-at-scale/)
+
+
+## Community & Contributions
+
+The MMS project is open to contributions from the community. If you like the idea of a flexible, scalable, multi-framework serving solution for your models and would like to provide feedback, suggest features, or even jump in and contribute code or examples, please visit the [project page on GitHub](https://github.com/awslabs/mxnet-model-server). You can create an issue there, or join the discussion on the forum.
+
+* [MXNet Forum - MMS Discussions](https://discuss.mxnet.io/c/mxnet-model-server)
+
+
+## Further Reading
+
+* [GitHub](https://github.com/awslabs/mxnet-model-server)
+* [MMS Docs](https://github.com/awslabs/mxnet-model-server/tree/master/docs)
diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py
index 045ce62489a..85474b663fa 100644
--- a/example/quantization/imagenet_gen_qsym.py
+++ b/example/quantization/imagenet_gen_qsym.py
@@ -53,6 +53,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model')
+    parser.add_argument('--ctx', type=str, default='gpu')
     parser.add_argument('--model', type=str, choices=['imagenet1k-resnet-152', 'imagenet1k-inception-bn'],
                         help='currently only supports imagenet1k-resnet-152 or imagenet1k-inception-bn')
     parser.add_argument('--batch-size', type=int, default=32)
@@ -91,8 +92,18 @@ def save_params(fname, arg_params, aux_params, logger=None):
                              ' thresholds. This mode is expected to produce the best inference accuracy of all three'
                              ' kinds of quantized models if the calibration dataset is representative enough of the'
                              ' inference dataset.')
+    parser.add_argument('--quantized-dtype', type=str, default='int8', 
+                        choices=['int8', 'uint8'],
+                        help='quantization destination data type for input data')
     args = parser.parse_args()
 
+    if args.ctx == 'gpu':
+        ctx = mx.gpu(0)
+    elif args.ctx == 'cpu':
+        ctx = mx.cpu(0)
+    else:
+        raise ValueError('ctx %s is not supported in this script' % args.ctx)
+
     logging.basicConfig()
     logger = logging.getLogger('logger')
     logger.setLevel(logging.INFO)
@@ -129,17 +140,26 @@ def save_params(fname, arg_params, aux_params, logger=None):
     excluded_sym_names = []
     if args.model == 'imagenet1k-resnet-152':
         rgb_mean = '0,0,0'
-        calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
-                                                                 or name.find('sc') != -1
-                                                                 or name.find('fc') != -1)
+        if args.ctx == 'gpu':
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('sc') != -1
+                                                                     or name.find('fc') != -1)
+        else:
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('sc') != -1)
+            excluded_sym_names += ['flatten0', 'fc1']
         if exclude_first_conv:
-            excluded_sym_names = ['conv0']
+            excluded_sym_names += ['conv0']
     elif args.model == 'imagenet1k-inception-bn':
         rgb_mean = '123.68,116.779,103.939'
-        calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
-                                                                 or name.find('fc') != -1)
+        if args.ctx == 'gpu':
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('fc') != -1)
+        else:
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1)
+            excluded_sym_names += ['flatten', 'fc1']
         if exclude_first_conv:
-            excluded_sym_names = ['conv_1']
+            excluded_sym_names += ['conv_1']
     else:
         raise ValueError('model %s is not supported in this script' % args.model)
 
@@ -156,8 +176,9 @@ def save_params(fname, arg_params, aux_params, logger=None):
     if calib_mode == 'none':
         logger.info('Quantizing FP32 model %s' % args.model)
         qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                       excluded_sym_names=excluded_sym_names,
-                                                       calib_mode=calib_mode, logger=logger)
+                                                       ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                       calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
+                                                       logger=logger)
         sym_name = '%s-symbol.json' % (prefix + '-quantized')
         save_symbol(sym_name, qsym, logger)
     else:
@@ -176,10 +197,11 @@ def save_params(fname, arg_params, aux_params, logger=None):
                                      **mean_args)
 
         cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                        ctx=mx.gpu(0), excluded_sym_names=excluded_sym_names,
+                                                        ctx=ctx, excluded_sym_names=excluded_sym_names,
                                                         calib_mode=calib_mode, calib_data=data,
                                                         num_calib_examples=num_calib_batches * batch_size,
-                                                        calib_layer=calib_layer, logger=logger)
+                                                        calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+                                                        logger=logger)
         if calib_mode == 'entropy':
             suffix = '-quantized-%dbatches-entropy' % num_calib_batches
         elif calib_mode == 'naive':
diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py
index fe3f2661c65..85649530aa0 100644
--- a/example/quantization/imagenet_inference.py
+++ b/example/quantization/imagenet_inference.py
@@ -99,6 +99,7 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Score a model on a dataset')
+    parser.add_argument('--ctx', type=str, default='gpu')
     parser.add_argument('--symbol-file', type=str, required=True, help='symbol file path')
     parser.add_argument('--param-file', type=str, required=True, help='param file path')
     parser.add_argument('--batch-size', type=int, default=32)
@@ -122,6 +123,13 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
     args = parser.parse_args()
 
+    if args.ctx == 'gpu':
+        ctx = mx.gpu(0)
+    elif args.ctx == 'cpu':
+        ctx = mx.cpu(0)
+    else:
+        raise ValueError('ctx %s is not supported in this script' % args.ctx)
+    
     logging.basicConfig()
     logger = logging.getLogger('logger')
     logger.setLevel(logging.INFO)
@@ -172,5 +180,5 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
     num_inference_images = args.num_inference_batches * batch_size
     logger.info('Running model %s for inference' % symbol_file)
-    score(sym, arg_params, aux_params, data, [mx.gpu(0)], label_name,
+    score(sym, arg_params, aux_params, data, [ctx], label_name,
           max_num_examples=num_inference_images, logger=logger)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 6b7cf4407ed..55c26bc980b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -987,6 +987,11 @@ MXNET_DLL int MXCreateCachedOpEx(SymbolHandle handle,
                                  int num_flags,
                                  const char** keys,
                                  const char** vals,
+                                 int num_inputs,
+                                 const char** input_names,
+                                 int num_params,
+                                 const char** param_names,
+                                 NDArrayHandle* params,
                                  CachedOpHandle *out);
 /*!
  * \brief free cached operator
@@ -1431,13 +1436,15 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
  * \param excluded_symbols array of symbols to be excluded from being quantized
  * \param num_offline number of parameters that are quantized offline
  * \param offline_params array of c strings representing the names of params quantized offline
+ * \param quantized_dtype the quantized destination type for input data.
  */
 MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
                                SymbolHandle *ret_sym_handle,
                                const mx_uint num_excluded_symbols,
                                const SymbolHandle *excluded_symbols,
                                const mx_uint num_offline,
-                               const char **offline_params);
+                               const char **offline_params,
+                               const char *quantized_dtype);
 
 /*!
  * \brief Set calibration table to node attributes in the sym
diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h
index 7ea60df3302..758ce851321 100644
--- a/include/mxnet/imperative.h
+++ b/include/mxnet/imperative.h
@@ -35,6 +35,23 @@
 #include "./ndarray.h"
 
 namespace mxnet {
+/*! \brief CachedOp Parameters */
+struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
+  uint32_t inline_limit;
+  uint32_t forward_bulk_size;
+  uint32_t backward_bulk_size;
+  DMLC_DECLARE_PARAMETER(CachedOpConfig) {
+    DMLC_DECLARE_FIELD(inline_limit)
+    .set_default(2)
+    .describe("Maximum number of operators that can be inlined.");
+    DMLC_DECLARE_FIELD(forward_bulk_size)
+    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
+    .describe("Segment size of bulk execution during forward pass.");
+    DMLC_DECLARE_FIELD(backward_bulk_size)
+    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
+    .describe("Segment size of bulk execution during backward pass.");
+  }
+};
 /*! \brief runtime functions for NDArray */
 class Imperative {
  public:
@@ -77,6 +94,67 @@ class Imperative {
              && info.out_grads.size() == 1;
     }
   };
+  class CachedOp {
+   public:
+    CachedOp(
+        const nnvm::Symbol& sym,
+        const std::vector<std::pair<std::string, std::string> >& flags,
+        const std::vector<std::string> arg_names,
+        const std::unordered_map<std::string, std::vector<NDArray> >& params);
+    uint32_t num_inputs() {
+      return fwd_graph_.indexed_graph().input_nodes().size();
+    }
+    uint32_t num_outputs() {
+      return fwd_graph_.outputs.size();
+    }
+    uint32_t num_backward_inputs() {
+      return bwd_ograd_dep_.size() + bwd_in_dep_.size() + bwd_out_dep_.size();
+    }
+    std::vector<bool>& save_inputs() {
+      return save_inputs_;
+    }
+    std::vector<bool>& save_outputs() {
+      return save_outputs_;
+    }
+    const std::unordered_set<uint32_t>& mutable_input_nodes() {
+      return fwd_graph_.indexed_graph().mutable_input_nodes();
+    }
+    nnvm::Graph GetForwardGraph(const bool recording,
+                                const std::vector<NDArray*>& inputs);
+    nnvm::Graph GetBackwardGraph(const OpStatePtr& state,
+                                 const std::vector<OpReqType>& reqs,
+                                 const std::vector<NDArray*>& inputs);
+    std::vector<nnvm::NodeEntry> Gradient(const nnvm::NodePtr& node,
+                                          const std::vector<nnvm::NodeEntry>& ograds);
+    void Forward(const std::shared_ptr<CachedOp>& op_ptr,
+                 const std::vector<NDArray*>& args,
+                 const std::vector<NDArray*>& outputs);
+    void Backward(const bool retain_graph,
+                  const OpStatePtr& state,
+                  const std::vector<NDArray*>& inputs,
+                  const std::vector<OpReqType>& reqs,
+                  const std::vector<NDArray*>& outputs);
+
+   private:
+    struct CachedOpState {
+      std::vector<NDArray> buff;
+      std::vector<OpStatePtr> states;
+    };
+    std::mutex mutex_;
+    CachedOpConfig config_;
+    nnvm::Graph fwd_graph_;
+    nnvm::Graph grad_graph_;
+    nnvm::Graph full_graph_;
+    std::unordered_map<Context, std::vector<NDArray> > params_;
+    bool inlining_;
+    std::vector<nnvm::NodeEntry> ograd_entries_;
+    std::vector<bool> curr_grad_req_;
+    std::vector<uint32_t> bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_;
+    std::vector<uint32_t> fwd_args_idx_;
+    std::vector<uint32_t> fwd_params_idx_;
+    std::vector<uint32_t> bwd_input_eid_;
+    std::vector<bool> save_inputs_, save_outputs_;
+  };
   /*! \brief whether operator recording is on. */
   bool is_training() const {
     return is_train_;
@@ -144,6 +222,15 @@ class Imperative {
       uint32_t num_inputs, uint32_t num_outputs,
       std::vector<bool> *p_save_inputs,
       std::vector<bool> *p_save_outputs);
+  void RunGraph(
+      const bool retain_graph,
+      const nnvm::IndexedGraph& idx,
+      const std::vector<NDArray*> arrays,
+      size_t node_start, size_t node_end,
+      std::vector<OpReqType>&& array_reqs,
+      std::vector<uint32_t>&& ref_count,
+      std::vector<OpStatePtr> *p_states,
+      const DispatchModeVector& dispatch_modes);
   /*! \brief indicate whether is training. */
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local bool is_train_;
@@ -160,5 +247,7 @@ class Imperative {
   int backward_bulk_size_{0};
 };
 
+using CachedOpPtr = std::shared_ptr<Imperative::CachedOp>;
+
 }  // namespace mxnet
 #endif  // MXNET_IMPERATIVE_H_
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index ae96fd87b0d..e243eb71c47 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -155,14 +155,6 @@ class NDArray {
     return byte_offset_ > 0 || shape() != ptr_->storage_shape;
   }
 
-  /* \brief Check whether the two arrays are the same array */
-  inline bool IsSame(const NDArray& other) {
-    return ptr_ == other.ptr_ &&
-        shape_ == other.shape_ &&
-        byte_offset_ == other.byte_offset_ &&
-        dtype_ == other.dtype_;
-  }
-
   /*!
    * \return the shape of current NDArray.
    */
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index f4694efad29..3969d8445be 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -126,36 +126,25 @@ class OpStatePtr {
   template<typename T, typename... Args>
   static OpStatePtr Create(Args&&... args) {
     OpStatePtr ret;
-    auto state = new T(std::forward<Args>(args)...);
-    auto var = Engine::Get()->NewVariable();
-    ret.ptr_.reset(
-      new OpState(var, state),
-      [](OpState* p) {
-        Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), p->var);
-        delete reinterpret_cast<T*>(p->state);
-        delete p;
-      });
+    ret.ptr_ = std::make_shared<OpState>();
+    ret.ptr_->var_ = Engine::Get()->NewVariable();
+    ret.ptr_->state_.construct<T>(std::forward<Args>(args)...);
 
     return ret;
   }
   /* \brief Get engine variable associated with this state */
   engine::VarHandle get_var() const {
-    return ptr_->var;
+    return ptr_->var_;
   }
   /* \brief Get state of type T */
   template<typename T>
   T& get_state() const {
-    return *reinterpret_cast<T*>(ptr_->state);
+    return dmlc::get<T>(ptr_->state_);
   }
   /* \brief clear state */
   void reset() {
     ptr_.reset();
   }
-  /* \brief checks whether the managed object is managed only by the current
-            OpStatePtr instance */
-  bool unique() const {
-    return ptr_.unique();
-  }
   /* \brief Whether state is empty */
   explicit operator bool() const {
     return ptr_ ? true : false;
@@ -164,12 +153,16 @@ class OpStatePtr {
  private:
   /* \brief state structure */
   struct OpState {
-    engine::VarHandle var;
-    void* state;
-
-    OpState(engine::VarHandle var_, void* state_) : var(var_), state(state_) {}
+    OpState() {}
     OpState(const OpState& other) = delete;
     OpState& operator=(const OpState& other) = delete;
+
+    ~OpState() {
+      Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), var_);
+    }
+
+    engine::VarHandle var_;
+    dmlc::any state_;
   };
   /* \brief shared pointer to state */
   std::shared_ptr<OpState> ptr_;
diff --git a/python/README.md b/python/README.md
index c1aaa580afc..1ab7aa4464a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -13,9 +13,9 @@ For running unit tests, you will need the [nose PyPi package](https://pypi.pytho
 pip install --upgrade nose
 ```
 
-Once ```nose``` is installed, run the following from MXNet root directory:
+Once ```nose``` is installed, run the following from the MXNet root directory (make sure the installation path of ```nosetests``` is included in your ```$PATH``` environment variable):
 ```
 nosetests tests/python/unittest
 nosetests tests/python/train
 
-```
\ No newline at end of file
+```
diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py
index f324545a235..d2cae0c45aa 100644
--- a/python/mxnet/_ctypes/ndarray.py
+++ b/python/mxnet/_ctypes/ndarray.py
@@ -105,14 +105,28 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
 class CachedOp(object):
     """Cached operator handle."""
     __slots__ = ["handle"]
-    def __init__(self, sym, flags=()):
+    def __init__(self, sym, flags=(), inputs=None, params=None):
         self.handle = CachedOpHandle()
+        param_names = []
+        param_arrays = []
+        if inputs is None:
+            assert params is None, "When inputs is None params must also be None."
+            inputs = sym.list_inputs()
+        elif params is not None:
+            for name, arrs in params.items():
+                param_arrays.extend(arrs)
+                param_names.extend([name] * len(arrs))
 
         check_call(_LIB.MXCreateCachedOpEx(
             sym.handle,
             len(flags),
             c_str_array([key for key, _ in flags]),
             c_str_array([str(val) for _, val in flags]),
+            len(inputs),
+            c_str_array(inputs),
+            len(param_names),
+            c_str_array(param_names),
+            c_handle_array(param_arrays),
             ctypes.byref(self.handle)))
 
     def __del__(self):
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index c9c58a9c9ba..1314b97028a 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -72,7 +72,8 @@ def _quantize_params(qsym, params):
     return quantized_params
 
 
-def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
+def _quantize_symbol(sym, excluded_symbols=None, offline_params=None,
+                     quantized_dtype='int8'):
     """Given a symbol object representing a neural network of data type FP32,
     quantize it into a INT8 network.
 
@@ -86,6 +87,8 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
         Names of the parameters that users want to quantize offline. It's always recommended to
         quantize parameters offline so that quantizing parameters during the inference can be
         avoided.
+    quantized_dtype: str
+        The quantized destination type for input data.
     """
     num_excluded_symbols = 0
     excluded_handles = []
@@ -108,7 +111,8 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
                                      mx_uint(num_excluded_symbols),
                                      c_array(SymbolHandle, excluded_handles),
                                      mx_uint(num_offline),
-                                     c_array(ctypes.c_char_p, offline)))
+                                     c_array(ctypes.c_char_p, offline),
+                                     c_str(quantized_dtype)))
     return Symbol(out)
 
 
@@ -401,7 +405,8 @@ def _load_params(params, logger=logging):
 def quantize_model(sym, arg_params, aux_params,
                    data_names=('data',), label_names=('softmax_label',),
                    ctx=cpu(), excluded_sym_names=None, calib_mode='entropy',
-                   calib_data=None, num_calib_examples=None, calib_layer=None, logger=logging):
+                   calib_data=None, num_calib_examples=None, calib_layer=None,
+                   quantized_dtype='int8', logger=logging):
     """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
@@ -451,6 +456,9 @@ def quantize_model(sym, arg_params, aux_params,
         calibrate this layer. If yes, the statistics of the layer's output will be collected;
         otherwise, no information of the layer's output will be collected. If not provided,
         all the layers' outputs that need requantization will be collected.
+    quantized_dtype : str
+        The quantized destination type for input data. Currently support 'int8'
+        and 'uint8', default value is 'int8'.
     logger : Object
         A logging object for printing information during the process of quantization.
 
@@ -473,8 +481,13 @@ def quantize_model(sym, arg_params, aux_params,
             idx = nodes.list_outputs().index(sym_name + '_output')
             excluded_syms.append(nodes[idx])
     logger.info('Quantizing symbol')
+
+    if quantized_dtype != 'int8' and quantized_dtype != 'uint8':
+        raise ValueError('unknown quantized_dtype %s received,'
+                         ' expected `int8` or `uint8`' % quantized_dtype)
     qsym = _quantize_symbol(sym, excluded_symbols=excluded_syms,
-                            offline_params=list(arg_params.keys()))
+                            offline_params=list(arg_params.keys()),
+                            quantized_dtype=quantized_dtype)
 
     logger.info('Quantizing parameters')
     qarg_params = _quantize_params(qsym, arg_params)
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index 7406a5d6c75..3b97c0578ca 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-# pylint: disable= arguments-differ
+# pylint: disable= arguments-differ, too-many-lines
 """Base container class for all neural network models."""
 __all__ = ['Block', 'HybridBlock', 'SymbolBlock']
 
@@ -149,7 +149,8 @@ def forward(self, x):
 
 
     Child :py:class:`Block` assigned this way will be registered and :py:meth:`collect_params`
-    will collect their Parameters recursively.
+    will collect their Parameters recursively. You can also manually register
+    child blocks with :py:meth:`register_child`.
 
     Parameters
     ----------
@@ -268,12 +269,12 @@ def collect_params(self, select=None):
         children's Parameters (default), and can also return a selected :py:class:`ParameterDict`
         that matches some given regular expressions.
 
-        For example, collect the specified parameter in ['conv1_weight', 'conv1_bias', 'fc_weight',
+        For example, collect the specified parameters in ['conv1_weight', 'conv1_bias', 'fc_weight',
         'fc_bias']::
 
             model.collect_params('conv1_weight|conv1_bias|fc_weight|fc_bias')
 
-        or collect all paramters which their name ends with 'weight' or 'bias', this can be done
+        or collect all parameters whose names end with 'weight' or 'bias', this can be done
         using regular expressions::
 
             model.collect_params('.*weight|.*bias')
@@ -307,9 +308,23 @@ def _collect_params_with_prefix(self, prefix=''):
             ret.update(child._collect_params_with_prefix(prefix + name))
         return ret
 
-    def save_params(self, filename):
+    def save_parameters(self, filename):
         """Save parameters to file.
+        This function is to be used to save parameters of a Gluon model. Note that
+        the saved parameters are not meant to be loaded in a different language binding for now.
+        Saving parameters using `.save_parameters()` differs from
+        `.collect_params().save()` and `.save_params()`, which are deprecated ways
+        to save the parameters of a model and should be avoided.
+
+        If your model is hybridizable and you want to export a serialized version of
+        the structure of the model as well as its parameters, please refer to
+        :py:meth:`HybridBlock.export`. Such a model can then be loaded back in any
+        language binding, or even in Gluon using a :py:class:`SymbolBlock`.
+        Refer to this tutorial for a complete overview of saving/loading models with
+        MXNet: https://mxnet.incubator.apache.org/tutorials/gluon/save_load_params.html
 
+        Parameters
+        ----------
         filename : str
             Path to file.
         """
@@ -317,14 +332,35 @@ def save_params(self, filename):
         arg_dict = {key : val._reduce() for key, val in params.items()}
         ndarray.save(filename, arg_dict)
 
-    def load_params(self, filename, ctx=None, allow_missing=False,
-                    ignore_extra=False):
+    def save_params(self, filename):
+        """[Deprecated] Please use save_parameters.
+
+        Save parameters to file.
+
+        Parameters
+        ----------
+        filename : str
+            Path to file.
+        """
+        warnings.warn("save_params is deprecated. Please use save_parameters.")
+        try:
+            self.collect_params().save(filename, strip_prefix=self.prefix)
+        except ValueError as e:
+            raise ValueError('%s\nsave_params is deprecated. Using '
+                             'save_parameters may resolve this error.' % str(e))
+
+    def load_parameters(self, filename, ctx=None, allow_missing=False,
+                        ignore_extra=False):
         """Load parameters from file.
+        This function is to be used to load parameters of a Gluon model that were
+        saved using the `.save_parameters()` function. Any other use is undefined behaviour.
+        Refer to this tutorial for a complete overview of saving/loading models with
+        MXNet: https://mxnet.incubator.apache.org/tutorials/gluon/save_load_params.html
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         ctx : Context or list of Context, default cpu()
-            Context(s) initialize loaded parameters on.
+            Context(s) to initialize loaded parameters on.
         allow_missing : bool, default False
             Whether to silently skip loading parameters not represented in the file.
         ignore_extra : bool, default False
@@ -358,6 +394,25 @@ def load_params(self, filename, ctx=None, allow_missing=False,
             if name in params:
                 params[name]._load_init(loaded[name], ctx)
 
+    def load_params(self, filename, ctx=None, allow_missing=False,
+                    ignore_extra=False):
+        """[Deprecated] Please use load_parameters.
+
+        Load parameters from file.
+
+        Parameters
+        ----------
+        filename : str
+            Path to parameter file.
+        ctx : Context or list of Context, default cpu()
+            Context(s) to initialize loaded parameters on.
+        allow_missing : bool, default False
+            Whether to silently skip loading parameters not represented in the file.
+        ignore_extra : bool, default False
+            Whether to silently ignore parameters from the file that are not
+            present in this Block.
+        """
+        warnings.warn("load_params is deprecated. Please use load_parameters.")
+        self.load_parameters(filename, ctx, allow_missing, ignore_extra)
+
     def register_child(self, block, name=None):
         """Registers block as a child of self. :py:class:`Block` s assigned to self as
         attributes will be registered automatically."""
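
A minimal round-trip sketch of the `save_parameters`/`load_parameters` pair
introduced above (nothing beyond the documented API; the file name is
arbitrary):

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(10)
    net.initialize()
    net(mx.nd.zeros((2, 5)))             # run once so shapes are inferred
    net.save_parameters('dense.params')

    # Load into a fresh instance of the same architecture.
    net2 = nn.Dense(10)
    net2.load_parameters('dense.params', ctx=mx.cpu())

Unlike the deprecated `save_params`, this format is keyed by the block's
attribute hierarchy rather than by parameter-name prefixes, which is what
makes the old path fragile across prefix changes.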
@@ -447,16 +502,8 @@ def hybridize(self, active=True, **kwargs):
         ----------
         active : bool, default True
             Whether to turn hybrid on or off.
-        static_alloc : bool, default False
-            Statically allocate memory to improve speed. Memory usage may increase.
-        static_shape : bool, default False
-            Optimize for invariant input shapes between iterations. Must also
-            set static_alloc to True. Change of input shapes is still allowed
-            but slower.
-        forward_bulk_size : int, default 15
-            Segment size of bulk execution during forward pass.
-        backward_bulk_size : int, default 15
-            Segment size of bulk execution during backward pass.
+        **kwargs : string
+            Additional flags for the hybridized operator.
         """
         for cld in self._children.values():
             cld.hybridize(active, **kwargs)
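
Since the per-flag documentation was folded into **kwargs, a short sketch of
passing the previously listed flags through (they are forwarded to every
child block and ultimately to the cached operator):

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(10)
    net.initialize()
    # static_alloc / static_shape are the flags documented before this change.
    net.hybridize(static_alloc=True, static_shape=True)
    net(mx.nd.zeros((2, 5)))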
@@ -601,9 +648,31 @@ def _summary_hook(block, _, outputs):
 class HybridBlock(Block):
     """`HybridBlock` supports forwarding with both Symbol and NDArray.
 
+    `HybridBlock` is similar to `Block`, with a few differences::
+
+        import mxnet as mx
+        from mxnet.gluon import HybridBlock, nn
+
+        class Model(HybridBlock):
+            def __init__(self, **kwargs):
+                super(Model, self).__init__(**kwargs)
+                # use name_scope to give child Blocks appropriate names.
+                with self.name_scope():
+                    self.dense0 = nn.Dense(20)
+                    self.dense1 = nn.Dense(20)
+
+            def hybrid_forward(self, F, x):
+                x = F.relu(self.dense0(x))
+                return F.relu(self.dense1(x))
+
+        model = Model()
+        model.initialize(ctx=mx.cpu(0))
+        model.hybridize()
+        model(mx.nd.zeros((10, 10), ctx=mx.cpu(0)))
+
     Forward computation in :py:class:`HybridBlock` must be static to work with :py:class:`Symbol` s,
     i.e. you cannot call :py:meth:`NDArray.asnumpy`, :py:attr:`NDArray.shape`,
-    :py:attr:`NDArray.dtype`, etc on tensors.
+    :py:attr:`NDArray.dtype`, `NDArray` indexing (`x[i]`), etc., on tensors.
     Also, you cannot use branching or loop logic that depends on non-constant
     expressions like random numbers or intermediate results, since they change
     the graph structure for each iteration.
@@ -613,9 +682,12 @@ class HybridBlock(Block):
     representing the forward computation and cache it. On subsequent forwards,
     the cached graph will be used instead of :py:meth:`hybrid_forward`.
 
-    Refer `Hybrid tutorial <http://mxnet.io/tutorials/gluon/hybrid.html>`_ to see
-    the end-to-end usage.
+    Please see the references below for a detailed tutorial.
 
+    References
+    ----------
+        `Hybrid - Faster training and easy deployment
+        <http://mxnet.io/tutorials/gluon/hybrid.html>`_
     """
     def __init__(self, prefix=None, params=None):
         super(HybridBlock, self).__init__(prefix=prefix, params=params)
@@ -624,7 +696,7 @@ def __init__(self, prefix=None, params=None):
         self._out_format = None
         self._in_format = None
         self._active = False
-        self._flags = []
+        self._flags = {}
 
     def __setattr__(self, name, value):
         """Registers parameters."""
@@ -651,43 +723,39 @@ def _get_graph(self, *args):
         return self._cached_graph
 
     def _build_cache(self, *args):
-        data, out = self._get_graph(*args)
-        data_names = {data.name : i for i, data in enumerate(data)}
-        params = self.collect_params()
-        input_names = out.list_inputs()
+        inputs, out = self._get_graph(*args)
+        input_names = [i.name for i in inputs]
 
+        params = self.collect_params()
         param_names = set(params.keys())
-        expected_names = set(input_names)
+        expected_names = set(out.list_inputs())
         for name in expected_names:
-            assert name in param_names or name in data_names, \
+            assert name in param_names or name in input_names, \
                 "Unknown input to HybridBlock: %s"%name
 
-        used_data_names = [i for i in data_names if i in expected_names]
-        if len(used_data_names) != len(data_names):
-            unused = ', '.join(['%d-th'%i for name, i in data_names.items()
+        used_input_names = [i for i in input_names if i in expected_names]
+        if len(used_input_names) != len(input_names):
+            unused = ', '.join(['%d-th'%i for i, name in enumerate(input_names)
                                 if name not in expected_names])
             warnings.warn("The %s input to HybridBlock is not used by any "
                           "computation. Is this intended?"%unused, stacklevel=4)
 
-        used_param_names = [i for i in param_names if i in expected_names]
+        used_param_names = set(i for i in param_names if i in expected_names)
         if len(used_param_names) != len(param_names):
-            unused = ', '.join(list(param_names - set(used_param_names)))
+            unused = ', '.join(list(param_names - used_param_names))
             warnings.warn("Parameter %s is not used by any computation. "
                           "Is this intended?"%unused, stacklevel=4)
 
-        data_indices = []
-        param_indices = []
-        self._cached_op_args = []
-        for i, name in enumerate(input_names):
-            if name in data_names:
-                data_indices.append(i)
-                self._cached_op_args.append((True, data_names[name]))
-            else:
-                param_indices.append(i)
-                self._cached_op_args.append((False, params[name]))
-        flags = [('data_indices', data_indices), ('param_indices', param_indices)] + \
-                self._flags
-        self._cached_op = ndarray.CachedOp(out, flags)
+        used_params = {k: params[k] for k in used_param_names}
+        try:
+            param_dict = {k: v.list_data() for k, v in used_params.items()}
+        except DeferredInitializationError:
+            self._deferred_infer_shape(*args)
+            for i in used_params.values():
+                i._finish_deferred_init()
+            param_dict = {k: v.list_data() for k, v in used_params.items()}
+
+        self._cached_op = ndarray.CachedOp(out, self._flags, input_names, param_dict)
 
     def _deferred_infer_shape(self, *args):
         try:
@@ -703,19 +771,7 @@ def _call_cached_op(self, *args):
 
         args, fmt = _flatten(args, "input")
         assert fmt == self._in_format, "Invalid input format"
-        try:
-            cargs = [args[i] if is_arg else i.data()
-                     for is_arg, i in self._cached_op_args]
-        except DeferredInitializationError:
-            self._deferred_infer_shape(*args)
-            cargs = []
-            for is_arg, i in self._cached_op_args:
-                if is_arg:
-                    cargs.append(args[i])
-                else:
-                    i._finish_deferred_init()
-                    cargs.append(i.data())
-        out = self._cached_op(*cargs)
+        out = self._cached_op(*args)
         if isinstance(out, NDArray):
             out = [out]
         return _regroup(out, self._out_format)[0]
@@ -736,7 +792,7 @@ def register_child(self, block, name=None):
 
     def hybridize(self, active=True, **kwargs):
         self._active = active
-        self._flags = list(kwargs.items())
+        self._flags = kwargs.items()
         self._clear_cached_op()
         if active and self._forward_hooks or self._forward_pre_hooks:
             warnings.warn('"{}" is being hybridized while still having forward hook/pre-hook. '
@@ -771,8 +827,8 @@ def infer_type(self, *args):
         self._infer_attrs('infer_type', 'dtype', *args)
 
     def export(self, path, epoch=0):
-        """Export HybridBlock to json format that can be loaded by `mxnet.mod.Module`
-        or the C++ interface.
+        """Export HybridBlock to json format that can be loaded by
+        `SymbolBlock.imports`, `mxnet.mod.Module` or the C++ interface.
 
         .. note:: When there is only one input, it will be named `data`. When there
                   are multiple inputs, they will be named `data0`, `data1`, etc.
@@ -886,6 +942,50 @@ class SymbolBlock(HybridBlock):
     >>> x = mx.nd.random.normal(shape=(16, 3, 224, 224))
     >>> print(feat_model(x))
     """
+    @staticmethod
+    def imports(symbol_file, input_names, param_file=None, ctx=None):
+        """Import model previously saved by `HybridBlock.export` or
+        `Module.save_checkpoint` as a SymbolBlock for use in Gluon.
+
+        Parameters
+        ----------
+        symbol_file : str
+            Path to symbol file.
+        input_names : list of str
+            List of input variable names
+        param_file : str, optional
+            Path to parameter file.
+        ctx : Context, default None
+            The context to initialize SymbolBlock on.
+
+        Returns
+        -------
+        SymbolBlock
+            SymbolBlock loaded from symbol and parameter files.
+
+        Examples
+        --------
+        >>> net1 = gluon.model_zoo.vision.resnet18_v1(
+        ...     prefix='resnet', pretrained=True)
+        >>> net1.hybridize()
+        >>> x = mx.nd.random.normal(shape=(1, 3, 32, 32))
+        >>> out1 = net1(x)
+        >>> net1.export('net1', epoch=1)
+        >>>
+        >>> net2 = gluon.SymbolBlock.imports(
+        ...     'net1-symbol.json', ['data'], 'net1-0001.params')
+        >>> out2 = net2(x)
+        """
+        sym = symbol.load(symbol_file)
+        if isinstance(input_names, str):
+            input_names = [input_names]
+        inputs = [symbol.var(i) for i in input_names]
+        ret = SymbolBlock(sym, inputs)
+        if param_file is not None:
+            ret.collect_params().load(param_file, ctx=ctx)
+        return ret
+
     def __init__(self, outputs, inputs, params=None):
         super(SymbolBlock, self).__init__(prefix=None, params=None)
         self._prefix = ''
diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py
index 7ec1c32d5e3..2e35a404b00 100644
--- a/python/mxnet/gluon/data/vision/transforms.py
+++ b/python/mxnet/gluon/data/vision/transforms.py
@@ -196,7 +196,7 @@ class RandomResizedCrop(Block):
         - **out**: output tensor with (H x W x C) shape.
     """
     def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0),
-                 interpolation=2):
+                 interpolation=1):
         super(RandomResizedCrop, self).__init__()
         if isinstance(size, numeric_types):
             size = (size, size)
@@ -233,7 +233,7 @@ class CenterCrop(Block):
     >>> transformer(image)
     <NDArray 500x1000x3 @cpu(0)>
     """
-    def __init__(self, size, interpolation=2):
+    def __init__(self, size, interpolation=1):
         super(CenterCrop, self).__init__()
         if isinstance(size, numeric_types):
             size = (size, size)
@@ -250,6 +250,9 @@ class Resize(Block):
     ----------
     size : int or tuple of (W, H)
         Size of output image.
+    keep_ratio : bool
+        Whether to resize the short edge or both edges to `size`,
+        if size is given as an integer.
     interpolation : int
         Interpolation method for resizing. By default uses bilinear
         interpolation. See OpenCV's resize function for available choices.
@@ -268,14 +271,28 @@ class Resize(Block):
     >>> transformer(image)
     <NDArray 500x1000x3 @cpu(0)>
     """
-    def __init__(self, size, interpolation=2):
+    def __init__(self, size, keep_ratio=False, interpolation=1):
         super(Resize, self).__init__()
-        if isinstance(size, numeric_types):
-            size = (size, size)
-        self._args = tuple(size) + (interpolation,)
+        self._keep = keep_ratio
+        self._size = size
+        self._interpolation = interpolation
 
     def forward(self, x):
-        return image.imresize(x, *self._args)
+        if isinstance(self._size, numeric_types):
+            if not self._keep:
+                wsize = self._size
+                hsize = self._size
+            else:
+                h, w, _ = x.shape
+                if h > w:
+                    wsize = self._size
+                    hsize = int(h * wsize / w)
+                else:
+                    hsize = self._size
+                    wsize = int(w * hsize / h)
+        else:
+            wsize, hsize = self._size
+        return image.imresize(x, wsize, hsize, self._interpolation)
 
 
 class RandomFlipLeftRight(HybridBlock):
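
A short numeric sketch of the new `keep_ratio` behaviour, following the
arithmetic in `forward` above (shapes are H x W x C):

    import mxnet as mx
    from mxnet.gluon.data.vision import transforms

    img = mx.nd.zeros((200, 400, 3), dtype='uint8')   # H=200, W=400

    transforms.Resize(100)(img).shape
    # (100, 100, 3): both edges forced to `size`

    transforms.Resize(100, keep_ratio=True)(img).shape
    # (100, 200, 3): the short edge (H) becomes 100, W scales by the same ratio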
diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py
index 55499470460..fdb006258c2 100644
--- a/python/mxnet/gluon/model_zoo/vision/alexnet.py
+++ b/python/mxnet/gluon/model_zoo/vision/alexnet.py
@@ -83,5 +83,5 @@ def alexnet(pretrained=False, ctx=cpu(),
     net = AlexNet(**kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('alexnet', root=root), ctx=ctx)
+        net.load_parameters(get_model_file('alexnet', root=root), ctx=ctx)
     return net
diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py
index 835336739a6..b03f5ce8d52 100644
--- a/python/mxnet/gluon/model_zoo/vision/densenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/densenet.py
@@ -141,7 +141,7 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(),
     net = DenseNet(num_init_features, growth_rate, block_config, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('densenet%d'%(num_layers), root=root), ctx=ctx)
+        net.load_parameters(get_model_file('densenet%d'%(num_layers), root=root), ctx=ctx)
     return net
 
 def densenet121(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py
index 6d75050b83f..7c54691f1b5 100644
--- a/python/mxnet/gluon/model_zoo/vision/inception.py
+++ b/python/mxnet/gluon/model_zoo/vision/inception.py
@@ -216,5 +216,5 @@ def inception_v3(pretrained=False, ctx=cpu(),
     net = Inception3(**kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('inceptionv3', root=root), ctx=ctx)
+        net.load_parameters(get_model_file('inceptionv3', root=root), ctx=ctx)
     return net
diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py
index 5b4c9a8e615..1a2c9b94619 100644
--- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py
@@ -213,7 +213,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(),
         version_suffix = '{0:.2f}'.format(multiplier)
         if version_suffix in ('1.00', '0.50'):
             version_suffix = version_suffix[:-1]
-        net.load_params(
+        net.load_parameters(
             get_model_file('mobilenet%s' % version_suffix, root=root), ctx=ctx)
     return net
 
@@ -245,7 +245,7 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(),
         version_suffix = '{0:.2f}'.format(multiplier)
         if version_suffix in ('1.00', '0.50'):
             version_suffix = version_suffix[:-1]
-        net.load_params(
+        net.load_parameters(
             get_model_file('mobilenetv2_%s' % version_suffix, root=root), ctx=ctx)
     return net
 
diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py
index 5ee67b510a8..da279b89583 100644
--- a/python/mxnet/gluon/model_zoo/vision/resnet.py
+++ b/python/mxnet/gluon/model_zoo/vision/resnet.py
@@ -386,8 +386,8 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(),
     net = resnet_class(block_class, layers, channels, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('resnet%d_v%d'%(num_layers, version),
-                                       root=root), ctx=ctx)
+        net.load_parameters(get_model_file('resnet%d_v%d'%(num_layers, version),
+                                           root=root), ctx=ctx)
     return net
 
 def resnet18_v1(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py
index 09f62a52074..aaff4c36dfa 100644
--- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py
@@ -132,7 +132,7 @@ def get_squeezenet(version, pretrained=False, ctx=cpu(),
     net = SqueezeNet(version, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('squeezenet%s'%version, root=root), ctx=ctx)
+        net.load_parameters(get_model_file('squeezenet%s'%version, root=root), ctx=ctx)
     return net
 
 def squeezenet1_0(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py
index dbae5385898..a3b1685b413 100644
--- a/python/mxnet/gluon/model_zoo/vision/vgg.py
+++ b/python/mxnet/gluon/model_zoo/vision/vgg.py
@@ -114,8 +114,8 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(),
     if pretrained:
         from ..model_store import get_model_file
         batch_norm_suffix = '_bn' if kwargs.get('batch_norm') else ''
-        net.load_params(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix),
-                                       root=root), ctx=ctx)
+        net.load_parameters(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix),
+                                           root=root), ctx=ctx)
     return net
 
 def vgg11(**kwargs):
diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py
index 2fbf7d8786d..24f30270ad6 100644
--- a/python/mxnet/gluon/nn/conv_layers.py
+++ b/python/mxnet/gluon/nn/conv_layers.py
@@ -675,7 +675,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0),
 class _Pooling(HybridBlock):
     """Abstract class for different pooling layers."""
     def __init__(self, pool_size, strides, padding, ceil_mode, global_pool,
-                 pool_type, **kwargs):
+                 pool_type, count_include_pad=None, **kwargs):
         super(_Pooling, self).__init__(**kwargs)
         if strides is None:
             strides = pool_size
@@ -687,6 +687,8 @@ def __init__(self, pool_size, strides, padding, ceil_mode, global_pool,
             'kernel': pool_size, 'stride': strides, 'pad': padding,
             'global_pool': global_pool, 'pool_type': pool_type,
             'pooling_convention': 'full' if ceil_mode else 'valid'}
+        if count_include_pad is not None:
+            self._kwargs['count_include_pad'] = count_include_pad
 
     def _alias(self):
         return 'pool'
@@ -863,6 +865,8 @@ class AvgPool1D(_Pooling):
         respectively. padding is applied on 'W' dimension.
     ceil_mode : bool, default False
         When `True`, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', padding elements are excluded when computing the average value.
 
 
     Inputs:
@@ -879,13 +883,13 @@ class AvgPool1D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
-                 ceil_mode=False, **kwargs):
+                 ceil_mode=False, count_include_pad=True, **kwargs):
         assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)
         assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints"
         super(AvgPool1D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class AvgPool2D(_Pooling):
@@ -907,6 +911,8 @@ class AvgPool2D(_Pooling):
         dimensions respectively. padding is applied on 'H' and 'W' dimension.
     ceil_mode : bool, default False
         When True, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', padding elements are excluded when computing the average value.
 
 
     Inputs:
@@ -926,13 +932,13 @@ class AvgPool2D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=(2, 2), strides=None, padding=0,
-                 ceil_mode=False, layout='NCHW', **kwargs):
+                 ceil_mode=False, layout='NCHW', count_include_pad=True, **kwargs):
         assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*2
         assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints"
         super(AvgPool2D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class AvgPool3D(_Pooling):
@@ -955,6 +961,8 @@ class AvgPool3D(_Pooling):
         dimension.
     ceil_mode : bool, default False
         When True, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', padding elements are excluded when computing the average value.
 
 
     Inputs:
@@ -975,13 +983,13 @@ class AvgPool3D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
-                 ceil_mode=False, layout='NCDHW', **kwargs):
+                 ceil_mode=False, layout='NCDHW', count_include_pad=True, **kwargs):
         assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*3
         assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints"
         super(AvgPool3D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class GlobalMaxPool1D(_Pooling):
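
A hedged numeric sketch of what `count_include_pad` changes, using a padded
2x2 input of ones (each corner window holds one real element and three
padding elements):

    import mxnet as mx
    from mxnet.gluon import nn

    x = mx.nd.ones((1, 1, 2, 2))
    incl = nn.AvgPool2D(pool_size=2, strides=1, padding=1)   # default: True
    excl = nn.AvgPool2D(pool_size=2, strides=1, padding=1,
                        count_include_pad=False)
    incl(x)  # corner outputs: (1+0+0+0)/4 = 0.25
    excl(x)  # corner outputs: 1/1 = 1.0, pads excluded from the divisor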
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index c0d89fbd4cc..73fca6050ac 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -391,6 +391,8 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
     def reset_ctx(self, ctx):
         """Re-assign Parameter to other contexts.
 
+        Parameters
+        ----------
         ctx : Context or list of Context, default ``context.current_context()``.
             Assign Parameter to given context. If ctx is a list of Context, a
             copy will be made for each context.
@@ -587,8 +589,8 @@ def __init__(self, **kwargs):
                 super(Block, self).__init__(**kwargs)
                 self.const = self.params.get_constant('const', [[1,2],[3,4]])
 
-    Parameter
-    ---------
+    Parameters
+    ----------
     name : str
         Name of the parameter.
     value : array-like
@@ -739,7 +741,7 @@ def get_constant(self, name, value=None):
         found, :py:func:`get` will create a new :py:class:`Constant` with key-word
         arguments and insert it to self.
 
-        Constants
+        Parameters
         ----------
         name : str
             Name of the desired Constant. It will be prepended with this dictionary's
@@ -814,6 +816,8 @@ def zero_grad(self):
     def reset_ctx(self, ctx):
         """Re-assign all Parameters to other contexts.
 
+        Parameters
+        ----------
         ctx : Context or list of Context, default :py:meth:`context.current_context()`.
             Assign Parameter to given context. If ctx is a list of Context, a
             copy will be made for each context.
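
A one-line sketch of the documented behaviour (assuming `net` is an already
initialized Block and a GPU context is available):

    import mxnet as mx

    # Re-assign every parameter of a block to GPU 0; passing a list of
    # contexts instead keeps a copy of each parameter per context.
    net.collect_params().reset_ctx(mx.gpu(0))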
@@ -846,6 +850,8 @@ def setattr(self, name, value):
     def save(self, filename, strip_prefix=''):
         """Save parameters to file.
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         strip_prefix : str, default ''
@@ -870,6 +876,8 @@ def load(self, filename, ctx=None, allow_missing=False,
              ignore_extra=False, restore_prefix=''):
         """Load parameters from file.
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         ctx : Context or list of Context
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index 1297c3da9a7..ef9026d4507 100755
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -161,7 +161,7 @@ def _legacy_init(self, name, arr):
         Parameters
         ----------
         name : str
-            Name of corrosponding NDArray.
+            Name of corresponding NDArray.
 
         arr : NDArray
             NDArray to be initialized.
@@ -424,12 +424,14 @@ def _init_weight(self, _, arr):
 
 @register
 class Constant(Initializer):
-    """Initializes the weights to a scalar value.
+    """Initializes the weights to a given value.
+    The value passed in can be a scalar or an NDArray that matches the shape
+    of the parameter to be set.
 
     Parameters
     ----------
-    value : float
-        Fill value.
+    value : float or NDArray
+        Value to set.
     """
     def __init__(self, value):
         super(Constant, self).__init__(value=value)
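
A brief sketch of the widened `value` argument, assuming NDArray values pass
through the initializer as the docstring now states (the array must match
the shape of the parameter being set, here the (2, 2) Dense weight):

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(2, in_units=2, use_bias=False)
    net.initialize(mx.init.Constant(mx.nd.array([[1., 2.], [3., 4.]])))
    net.weight.data()   # [[1. 2.] [3. 4.]]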
@@ -651,7 +653,7 @@ def _init_weight(self, _, arr):
 
 @register
 class LSTMBias(Initializer):
-    """Initialize all bias of an LSTMCell to 0.0 except for
+    """Initialize all biases of an LSTMCell to 0.0 except for
     the forget gate whose bias is set to custom value.
 
     Parameters
diff --git a/python/mxnet/rnn/io.py b/python/mxnet/rnn/io.py
index 8eba9d21e39..a8890c9e7e7 100644
--- a/python/mxnet/rnn/io.py
+++ b/python/mxnet/rnn/io.py
@@ -27,7 +27,8 @@
 from ..io import DataIter, DataBatch, DataDesc
 from .. import ndarray
 
-def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n', start_label=0):
+def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
+                     start_label=0, unknown_token=None):
     """Encode sentences and (optionally) build a mapping
     from string tokens to integer indices. Unknown keys
     will be added to vocabulary.
@@ -46,6 +47,9 @@ def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
         of sentence by default.
     start_label : int
         lowest index.
+    unknown_token : str
+        Symbol used to represent unknown tokens.
+        If not specified, an unknown token raises an error when the
+        vocabulary is fixed (i.e. when `vocab` is given).
 
     Returns
     -------
@@ -65,9 +69,11 @@ def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
         coded = []
         for word in sent:
             if word not in vocab:
-                assert new_vocab, "Unknown token %s"%word
+                assert (new_vocab or unknown_token), "Unknown token %s"%word
                 if idx == invalid_label:
                     idx += 1
+                if unknown_token:
+                    word = unknown_token
                 vocab[word] = idx
                 idx += 1
             coded.append(vocab[word])
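
A hedged sketch of the `unknown_token` path, assuming the `mx.rnn` re-export
of `encode_sentences`: build the vocabulary first, then encode new text
against the fixed vocabulary, mapping unseen words to a sentinel
(`start_label` is bumped past the existing indices so the sentinel gets a
fresh slot):

    import mxnet as mx

    coded, vocab = mx.rnn.encode_sentences(
        [['the', 'cat'], ['the', 'dog']], invalid_label=-1, start_label=0)

    # 'bird' is unseen; with unknown_token set, it maps to '<unk>' instead
    # of tripping the assertion above.
    coded2, vocab = mx.rnn.encode_sentences(
        [['the', 'bird']], vocab=vocab, invalid_label=-1,
        start_label=len(vocab), unknown_token='<unk>')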
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 686802d3c48..19fe0749598 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1644,16 +1644,20 @@ def discard_stderr():
     with discard_stderr():
         ...
     """
+    with open(os.devnull, 'w') as bit_bucket:
+        try:
+            stderr_fileno = sys.stderr.fileno()
+            old_stderr = os.dup(stderr_fileno)
+            try:
+                os.dup2(bit_bucket.fileno(), stderr_fileno)
+                yield
+            finally:
+                os.dup2(old_stderr, stderr_fileno)
+        except AttributeError:
+            # On some systems, stderr is not backed by a file descriptor
+            # but by a virtual pipeline that cannot be duplicated
+            yield
 
-    try:
-        stderr_fileno = sys.stderr.fileno()
-        old_stderr = os.dup(stderr_fileno)
-        bit_bucket = open(os.devnull, 'w')
-        os.dup2(bit_bucket.fileno(), stderr_fileno)
-        yield
-    finally:
-        os.dup2(old_stderr, stderr_fileno)
-        bit_bucket.close()
 
 class DummyIter(mx.io.DataIter):
     """A dummy iterator that always returns the same batch of data
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
index 7289df19712..87c9bc72be0 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
@@ -224,13 +224,24 @@ class FeedForward private(
     var i = 0
     while (data.hasNext && i != numBatch) {
       val batch = data.next()
-      i += 1
-      ExecutorManager.loadData(batch, dataArrays)
-      predExec.forward(isTrain = false)
-      val padded = batch.pad
-      val realSize = batchSize - padded
-      for ((list, nd) <- outputs zip predExec.outputs) {
-        list += nd.slice(0, realSize).copy()
+      try {
+        i += 1
+        ExecutorManager.loadData(batch, dataArrays)
+        predExec.forward(isTrain = false)
+        val padded = batch.pad
+        val realSize = batchSize - padded
+        for ((list, nd) <- outputs zip predExec.outputs) {
+          // The slice is assigned to a val so that dispose can be called after the copy.
+          // The one-liner nd.slice().copy() would leak the slice's memory.
+          val ndSliced = nd.slice(0, realSize)
+          try {
+            list += ndSliced.copy()
+          } finally {
+            ndSliced.dispose()
+          }
+        }
+      } finally {
+        batch.dispose()
       }
     }
     // TODO(Yizhi): we can use Symbol.concat to do the same thing. Can it be more efficient?
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
index 51089382097..70c64877887 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
@@ -28,7 +28,8 @@ import scala.collection.immutable.ListMap
 /**
  * NDArrayIter object in mxnet. Taking NDArray to get dataiter.
  *
- * @param data NDArrayIter supports single or multiple data and label.
+ * @param data Specify the data as well as the name.
+ *             NDArrayIter supports single or multiple data and label.
  * @param label Same as data, but is not fed to the model during testing.
  * @param dataBatchSize Batch Size
  * @param shuffle Whether to shuffle the data
@@ -38,15 +39,35 @@ import scala.collection.immutable.ListMap
  * the size of data does not match batch_size. Roll over is intended
  * for training and can cause problems if used for prediction.
  */
-class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = IndexedSeq.empty,
-                  private val dataBatchSize: Int = 1, shuffle: Boolean = false,
-                  lastBatchHandle: String = "pad",
-                  dataName: String = "data", labelName: String = "label") extends DataIter {
-  private val logger = LoggerFactory.getLogger(classOf[NDArrayIter])
+class NDArrayIter(data: IndexedSeq[(String, NDArray)],
+                  label: IndexedSeq[(String, NDArray)],
+                  private val dataBatchSize: Int, shuffle: Boolean,
+                  lastBatchHandle: String) extends DataIter {
+
+  /**
+   * @param data Specify the data. Data names will be data_0, data_1, ..., etc.
+   * @param label Same as data, but is not fed to the model during testing.
+   *              Label names will be label_0, label_1, ..., etc.
+   * @param dataBatchSize Batch Size
+   * @param shuffle Whether to shuffle the data
+   * @param lastBatchHandle "pad", "discard" or "roll_over". How to handle the last batch
+   *
+   * This iterator will pad, discard or roll over the last batch if
+   * the size of data does not match batch_size. Roll over is intended
+   * for training and can cause problems if used for prediction.
+   */
+  def this(data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = IndexedSeq.empty,
+           dataBatchSize: Int = 1, shuffle: Boolean = false,
+           lastBatchHandle: String = "pad",
+           dataName: String = "data", labelName: String = "label") {
+    this(IO.initData(data, allowEmpty = false, dataName),
+      IO.initData(label, allowEmpty = true, labelName),
+      dataBatchSize, shuffle, lastBatchHandle)
+  }
 
+  private val logger = LoggerFactory.getLogger(classOf[NDArrayIter])
 
-  private val (_dataList: IndexedSeq[NDArray],
-  _labelList: IndexedSeq[NDArray]) = {
+  val (initData: IndexedSeq[(String, NDArray)], initLabel: IndexedSeq[(String, NDArray)]) = {
     // data should not be null and size > 0
     require(data != null && data.size > 0,
       "data should not be null and data.size should not be zero")
@@ -55,17 +76,17 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
       "label should not be null. Use IndexedSeq.empty if there are no labels")
 
     // shuffle is not supported currently
-    require(shuffle == false, "shuffle is not supported currently")
+    require(!shuffle, "shuffle is not supported currently")
 
     // discard final part if lastBatchHandle equals discard
     if (lastBatchHandle.equals("discard")) {
-      val dataSize = data(0).shape(0)
+      val dataSize = data(0)._2.shape(0)
       require(dataBatchSize <= dataSize,
         "batch_size need to be smaller than data size when not padding.")
       val keepSize = dataSize - dataSize % dataBatchSize
-      val dataList = data.map(ndArray => {ndArray.slice(0, keepSize)})
+      val dataList = data.map { case (name, ndArray) => (name, ndArray.slice(0, keepSize)) }
       if (!label.isEmpty) {
-        val labelList = label.map(ndArray => {ndArray.slice(0, keepSize)})
+        val labelList = label.map { case (name, ndArray) => (name, ndArray.slice(0, keepSize)) }
         (dataList, labelList)
       } else {
         (dataList, label)
@@ -75,13 +96,9 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
     }
   }
 
-
-  val initData: IndexedSeq[(String, NDArray)] = IO.initData(_dataList, false, dataName)
-  val initLabel: IndexedSeq[(String, NDArray)] = IO.initData(_labelList, true, labelName)
-  val numData = _dataList(0).shape(0)
-  val numSource = initData.size
-  var cursor = -dataBatchSize
-
+  val numData = initData(0)._2.shape(0)
+  val numSource: MXUint = initData.size
+  private var cursor = -dataBatchSize
 
   private val (_provideData: ListMap[String, Shape],
                _provideLabel: ListMap[String, Shape]) = {
@@ -112,8 +129,8 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * reset the iterator
    */
   override def reset(): Unit = {
-    if (lastBatchHandle.equals("roll_over") && cursor>numData) {
-      cursor = -dataBatchSize + (cursor%numData)%dataBatchSize
+    if (lastBatchHandle.equals("roll_over") && cursor > numData) {
+      cursor = -dataBatchSize + (cursor % numData) % dataBatchSize
     } else {
       cursor = -dataBatchSize
     }
@@ -154,16 +171,16 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
     newArray
   }
 
-  private def _getData(data: IndexedSeq[NDArray]): IndexedSeq[NDArray] = {
+  private def _getData(data: IndexedSeq[(String, NDArray)]): IndexedSeq[NDArray] = {
     require(cursor < numData, "DataIter needs reset.")
     if (data == null) {
       null
     } else {
       if (cursor + dataBatchSize <= numData) {
-        data.map(ndArray => {ndArray.slice(cursor, cursor + dataBatchSize)}).toIndexedSeq
+        data.map { case (_, ndArray) => ndArray.slice(cursor, cursor + dataBatchSize) }
       } else {
         // padding
-        data.map(_padData).toIndexedSeq
+        data.map { case (_, ndArray) => _padData(ndArray) }
       }
     }
   }
@@ -173,7 +190,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return the data of current batch
    */
   override def getData(): IndexedSeq[NDArray] = {
-    _getData(_dataList)
+    _getData(initData)
   }
 
   /**
@@ -181,7 +198,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return the label of current batch
    */
   override def getLabel(): IndexedSeq[NDArray] = {
-    _getData(_labelList)
+    _getData(initLabel)
   }
 
   /**
@@ -189,7 +206,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return
    */
   override def getIndex(): IndexedSeq[Long] = {
-    (cursor.toLong to (cursor + dataBatchSize).toLong).toIndexedSeq
+    cursor.toLong to (cursor + dataBatchSize).toLong
   }
 
   /**
@@ -213,3 +230,66 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
 
   override def batchSize: Int = dataBatchSize
 }
+
+object NDArrayIter {
+
+  /**
+   * Builder class for NDArrayIter.
+   */
+  class Builder() {
+    private var data: IndexedSeq[(String, NDArray)] = IndexedSeq.empty
+    private var label: IndexedSeq[(String, NDArray)] = IndexedSeq.empty
+    private var dataBatchSize: Int = 1
+    private var lastBatchHandle: String = "pad"
+
+    /**
+     * Add one data input with its name.
+     * @param name Data name.
+     * @param data Data nd-array.
+     * @return The builder object itself.
+     */
+    def addData(name: String, data: NDArray): Builder = {
+      this.data = this.data ++ IndexedSeq((name, data))
+      this
+    }
+
+    /**
+     * Add one label input with its name.
+     * @param name Label name.
+     * @param label Label nd-array.
+     * @return The builder object itself.
+     */
+    def addLabel(name: String, label: NDArray): Builder = {
+      this.label = this.label ++ IndexedSeq((name, label))
+      this
+    }
+
+    /**
+     * Set the batch size of the iterator.
+     * @param batchSize batch size.
+     * @return The builder object itself.
+     */
+    def setBatchSize(batchSize: Int): Builder = {
+      this.dataBatchSize = batchSize
+      this
+    }
+
+    /**
+     * How to handle the last batch.
+     * @param lastBatchHandle Can be "pad", "discard" or "roll_over".
+     * @return The builder object itself.
+     */
+    def setLastBatchHandle(lastBatchHandle: String): Builder = {
+      this.lastBatchHandle = lastBatchHandle
+      this
+    }
+
+    /**
+     * Build the NDArrayIter object.
+     * @return the built object.
+     */
+    def build(): NDArrayIter = {
+      new NDArrayIter(data, label, dataBatchSize, false, lastBatchHandle)
+    }
+  }
+}
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
index 0f4b7c0e7a3..1b922b3c05b 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
@@ -24,7 +24,7 @@ import scala.sys.process._
 
 class IOSuite extends FunSuite with BeforeAndAfterAll {
 
-  private var tu = new TestUtil
+  private val tu = new TestUtil
 
   test("test MNISTIter & MNISTPack") {
     // get data
@@ -258,7 +258,11 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     assert(batchCount === nBatch0)
 
     // test discard
-    val dataIter1 = new NDArrayIter(data, label, 128, false, "discard")
+    val dataIter1 = new NDArrayIter.Builder()
+      .addData("data0", data(0)).addData("data1", data(1))
+      .addLabel("label", label(0))
+      .setBatchSize(128)
+      .setLastBatchHandle("discard").build()
     val nBatch1 = 7
     batchCount = 0
     while(dataIter1.hasNext) {
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index 43ff1f78fe1..f2b806094af 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -36,24 +36,6 @@
       </properties>
     </profile>
   </profiles>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.scalatest</groupId>
-        <artifactId>scalatest-maven-plugin</artifactId>
-        <configuration>
-          <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
-            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
-          </argLine>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.scalastyle</groupId>
-        <artifactId>scalastyle-maven-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
index 74bc1dbb71f..72bbbe0fed0 100644
--- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
+++ b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
@@ -46,24 +46,26 @@ class MXNetGeneralSuite extends SharedSparkContext {
       "/dataset/mxnet-spark-test/train.txt" + " -P " + testDataDir + " -q") !
   }
 
-  override def beforeAll(): Unit = {
-    val tempDirFile = Files.createTempDirectory(s"mxnet-spark-test-${System.currentTimeMillis()}").
-      toFile
-    testDataDir = tempDirFile.getPath
-    tempDirFile.deleteOnExit()
-    downloadTestData()
-  }
-
+//  override def beforeAll(): Unit = {
+//  val tempDirFile = Files.createTempDirectory(s"mxnet-spark-test-${System.currentTimeMillis()}").
+//      toFile
+//    testDataDir = tempDirFile.getPath
+//    tempDirFile.deleteOnExit()
+//    downloadTestData()
+//  }
 
-  test("run spark with MLP") {
-    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
-    val model = buildMlp().fit(trainData)
-    assert(model != null)
-  }
+  test("Dummy test on Spark") {
 
-  test("run spark with LeNet") {
-    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
-    val model = buildLeNet().fit(trainData)
-    assert(model != null)
   }
+//  test("run spark with MLP") {
+//    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
+//    val model = buildMlp().fit(trainData)
+//    assert(model != null)
+//  }
+//
+//  test("run spark with LeNet") {
+//    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
+//    val model = buildLeNet().fit(trainData)
+//    assert(model != null)
+//  }
 }
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 34bd4b20aa5..9aabe04656e 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -36,7 +36,6 @@
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
 #include "../imperative/imperative_utils.h"
-#include "../imperative/cached_op.h"
 
 using namespace mxnet;
 
@@ -161,8 +160,12 @@ int MXCreateCachedOp(SymbolHandle handle,
   std::vector<std::string> input_names;
   input_names.reserve(inputs.size());
   for (const auto& i : inputs) input_names.push_back(i->attrs.name);
-  *out = new CachedOpPtr(new CachedOp(
-      *sym, std::vector<std::pair<std::string, std::string> >()));
+  *out = new std::shared_ptr<Imperative::CachedOp>(
+      new Imperative::CachedOp(
+        *sym,
+        std::vector<std::pair<std::string, std::string> >(),
+        input_names,
+        std::unordered_map<std::string, std::vector<NDArray> >()));
   API_END();
 }
 
@@ -170,6 +173,11 @@ int MXCreateCachedOpEx(SymbolHandle handle,
                        int num_flags,
                        const char** keys,
                        const char** vals,
+                       int num_args,
+                       const char** arg_names,
+                       int num_params,
+                       const char** param_names,
+                       NDArrayHandle* params,
                        CachedOpHandle *out) {
   nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
 
@@ -178,7 +186,17 @@ int MXCreateCachedOpEx(SymbolHandle handle,
   for (int i = 0; i < num_flags; ++i) {
     flags.push_back({keys[i], vals[i]});
   }
-  *out = new CachedOpPtr(new CachedOp(*sym, flags));
+  std::vector<std::string> args;
+  for (int i = 0; i < num_args; ++i) {
+    args.push_back(arg_names[i]);
+  }
+  std::unordered_map<std::string, std::vector<NDArray> > param_dict;
+  for (int i = 0; i < num_params; ++i) {
+    param_dict[param_names[i]].emplace_back(
+        *reinterpret_cast<NDArray*>(params[i]));
+  }
+  *out = new std::shared_ptr<Imperative::CachedOp>(
+      new Imperative::CachedOp(*sym, flags, args, param_dict));
   API_END();
 }
 
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 4666b6adf0c..e5e9b522890 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -577,7 +577,8 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
                      const mx_uint num_excluded_symbols,
                      const SymbolHandle *excluded_symbols,
                      const mx_uint num_offline,
-                     const char **offline_params) {
+                     const char **offline_params,
+                     const char *quantized_dtype) {
   nnvm::Symbol *s = new nnvm::Symbol();
   API_BEGIN();
   nnvm::Symbol *sym = static_cast<nnvm::Symbol*>(sym_handle);
@@ -594,7 +595,9 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
   for (size_t i = 0; i < num_offline; ++i) {
     offline.emplace(offline_params[i]);
   }
+  std::string quantized_type(quantized_dtype);
   g.attrs["offline_params"] = std::make_shared<nnvm::any>(std::move(offline));
+  g.attrs["quantized_dtype"] = std::make_shared<nnvm::any>(std::move(quantized_type));
   g = ApplyPass(std::move(g), "QuantizeGraph");
   s->outputs = g.outputs;
   *ret_sym_handle = s;
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index e70cc197c0c..dc0436e02a8 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -278,8 +278,6 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
 }
 
 void ThreadedEngine::Push(OprHandle op, Context exec_ctx, int priority, bool profiling) {
-  BulkFlush();
-
   ThreadedOpr* threaded_opr = ThreadedOpr::CastFromBase(op);
   OprBlock* opr_block = OprBlock::New();
   opr_block->opr = threaded_opr;
@@ -325,6 +323,7 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx,
         << device_count_;
   }
 #endif
+  BulkFlush();
   ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name, wait);
   opr->temporary = true;
   const bool profiling = profiler_->IsProfiling(profiler::Profiler::kImperative);
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index 2f77380baf8..97f258c1061 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -76,6 +76,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     if (is_worker_) return;
     gpu_worker_nthreads_ = common::GetNumThreadsPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+    gpu_copy_nthreads_ = dmlc::GetEnv("MXNET_GPU_COPY_NTHREADS", 2);
     // create CPU task
     int cpu_priority_nthreads = dmlc::GetEnv("MXNET_CPU_PRIORITY_NTHREADS", 4);
     cpu_priority_worker_.reset(new ThreadWorkerBlock<kPriorityQueue>());
@@ -128,8 +129,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
         const FnProperty prop = opr_block->opr->prop;
         const bool is_copy = (prop == FnProperty::kCopyFromGPU ||
                               prop == FnProperty::kCopyToGPU);
-        const size_t nthread = gpu_worker_nthreads_;
         if (is_copy) {
+          const size_t nthread = gpu_copy_nthreads_;
           auto ptr = gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
             // Signify to kernel that GPU is being used, so reserve cores as necessary
             OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -150,6 +151,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
             }
           }
         } else {
+          const size_t nthread = gpu_worker_nthreads_;
           auto ptr = gpu_normal_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
             // Signify to kernel that GPU is being used, so reserve cores as necessary
             OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -194,6 +196,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   size_t cpu_worker_nthreads_;
   /*! \brief number of concurrent thread each gpu worker uses */
   size_t gpu_worker_nthreads_;
+  /*! \brief number of concurrent thread each gpu copy worker uses */
+  size_t gpu_copy_nthreads_;
   // cpu worker
   common::LazyAllocArray<ThreadWorkerBlock<kWorkerQueue> > cpu_normal_workers_;
   // cpu priority worker
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 72919d90c62..697e4869a04 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -134,10 +134,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     return state_.get_var();
   }
 
-  OpStatePtr state() const override {
-    return state_;
-  }
-
   explicit StatefulComputeExecutor(const OpStatePtr& state,
                                    const FStatefulCompute& fcompute,
                                    ExecType exec_type,
@@ -146,6 +142,7 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
         state_(state), fcompute_(fcompute), exec_type_(exec_type) {}
 
  private:
+  friend Graph AttachOpExecs(Graph g);
   OpStatePtr state_;
   FStatefulCompute fcompute_;
   ExecType exec_type_;
@@ -173,16 +170,13 @@ class StatefulComputeExExecutor : public OpExecutor {
     return state_.get_var();
   }
 
-  OpStatePtr state() const override {
-    return state_;
-  }
-
   explicit StatefulComputeExExecutor(const OpStatePtr& state,
                                      const FStatefulComputeEx& fcompute,
                                      ExecType exec_type)
       : state_(state), fcompute_(fcompute), exec_type_(exec_type) {}
 
  private:
+  friend Graph AttachOpExecs(Graph g);
   OpStatePtr state_;
   FStatefulComputeEx fcompute_;
   ExecType exec_type_;
@@ -247,15 +241,16 @@ class FComputeExExecutor : public OpExecutor {
   ExecType exec_type_;
 };
 
-void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) {
+// pass to attach operator executors
+Graph AttachOpExecs(Graph g) {
   using nnvm::DTypeVector;
   using nnvm::ShapeVector;
   using nnvm::FMutateInputs;
 
-  static auto& fcreate_op_state = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& fmutate_inputs = nnvm::Op::GetAttr<FMutateInputs>("FMutateInputs");
-  static auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
-  static auto& is_layer_backward = nnvm::Op::GetAttr<bool>("TIsLayerOpBackward");
+  auto& fcreate_op_state = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
+  auto& fmutate_inputs = nnvm::Op::GetAttr<FMutateInputs>("FMutateInputs");
+  auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
+  auto& is_layer_backward = nnvm::Op::GetAttr<bool>("TIsLayerOpBackward");
 
   const auto& vdtype = g.GetAttr<DTypeVector>("dtype");
   const auto& vshape = g.GetAttr<ShapeVector>("shape");
@@ -264,88 +259,82 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) {
 
   // get the graph
   const auto& idx = g.indexed_graph();
-  OpExecVector& ret = *p_ret;
+  std::vector<std::shared_ptr<OpExecutor> > ret(idx.num_nodes());
 
   // initialize the nodes
-  const auto& inode = idx[i];
-  if (inode.source->is_variable()) return;
-  const nnvm::Op *op = inode.source->op();
-  ExecType exec_type = ExecType::kSync;
-  std::vector<uint32_t> mutate_index;
-  if (fmutate_inputs.count(op)) {
-    mutate_index = fmutate_inputs[op](inode.source->attrs);
-  }
-  if (fexec_type.count(op)) {
-    exec_type = fexec_type[op](inode.source->attrs);
-  }
-  CHECK(dispatch_modes[i] != DispatchMode::kUndefined);
-  if (fcreate_op_state.count(op)) {
-    std::vector<TShape> ishape;
-    std::vector<int> itype;
-    for (const auto& e : inode.inputs) {
-      ishape.emplace_back(vshape[idx.entry_id(e)]);
-      itype.emplace_back(vdtype[idx.entry_id(e)]);
-    }
-
-    OpStatePtr state = fcreate_op_state[op](
-        inode.source->attrs, vctx[i], ishape, itype);
-    FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
-        op, "FStatefulComputeEx", vctx[i]);
-    // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
-    if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<StatefulComputeExExecutor>(state, fcompute_ex, exec_type);
-    } else {
-      FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
-          op, "FStatefulCompute", vctx[i]);
-      CHECK(fcompute != nullptr)
-          << "One of FStatefulCompute and FStatefulComputeEx must be registered "
-          << "for stateful operator " << op->name;
-      ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
-                                                         exec_type, mutate_index);
+  for (size_t i = 0; i < idx.num_nodes(); ++i) {
+    const auto& inode = idx[i];
+    if (inode.source->is_variable()) continue;
+    const nnvm::Op *op = inode.source->op();
+    ExecType exec_type = ExecType::kSync;
+    std::vector<uint32_t> mutate_index;
+    if (fmutate_inputs.count(op)) {
+      mutate_index = fmutate_inputs[op](inode.source->attrs);
     }
-  } else if (is_layer_backward.get(op, false)) {
-    CHECK_GE(inode.control_deps.size(), 1);
-    uint32_t fwd_id = inode.control_deps[0];
-    CHECK(vctx[fwd_id] == vctx[i]);
-    CHECK(ret[fwd_id] != nullptr);
-    FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
-        op, "FStatefulComputeEx", vctx[i]);
-    // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
-    if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<StatefulComputeExExecutor>(
-          ret[fwd_id].get()->state(), fcompute_ex, exec_type);
-    } else {
-      FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
-          op, "FStatefulCompute", vctx[i]);
-      CHECK(fcompute != nullptr)
-          << "One of FStatefulCompute and FStatefulComputeEx must be registered "
-          << "for stateful operator " << op->name;
-      ret[i] = std::make_shared<StatefulComputeExecutor>(
-          ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index);
+    if (fexec_type.count(op)) {
+      exec_type = fexec_type[op](inode.source->attrs);
     }
-  } else {
-    FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
-    FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
-    if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<FComputeExExecutor>(
-          inode.source->attrs, fcomp_ex, exec_type);
-    } else if (fcompute != nullptr) {
-      ret[i] = std::make_shared<FComputeExecutor>(
-          inode.source->attrs, fcompute, exec_type, mutate_index);
+    CHECK(dispatch_modes[i] != DispatchMode::kUndefined);
+    if (fcreate_op_state.count(op)) {
+      std::vector<TShape> ishape;
+      std::vector<int> itype;
+      for (const auto& e : inode.inputs) {
+        ishape.emplace_back(vshape[idx.entry_id(e)]);
+        itype.emplace_back(vdtype[idx.entry_id(e)]);
+      }
+
+      OpStatePtr state = fcreate_op_state[op](
+          inode.source->attrs, vctx[i], ishape, itype);
+      FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
+          op, "FStatefulComputeEx", vctx[i]);
+      // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
+      if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(state, fcompute_ex, exec_type);
+      } else {
+        FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
+            op, "FStatefulCompute", vctx[i]);
+        CHECK(fcompute != nullptr)
+            << "One of FStatefulCompute and FStatefulComputeEx must be registered "
+            << "for stateful operator " << op->name;
+        ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
+                                                           exec_type, mutate_index);
+      }
+    } else if (is_layer_backward.get(op, false)) {
+      CHECK_GE(inode.control_deps.size(), 1);
+      uint32_t fwd_id = inode.control_deps[0];
+      CHECK(vctx[fwd_id] == vctx[i]);
+      CHECK(ret[fwd_id] != nullptr);
+      FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
+          op, "FStatefulComputeEx", vctx[i]);
+      // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
+      if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(
+            dynamic_cast<StatefulComputeExExecutor*>(ret[fwd_id].get())->state_,
+            fcompute_ex, exec_type);
+      } else {
+        FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
+            op, "FStatefulCompute", vctx[i]);
+        CHECK(fcompute != nullptr)
+            << "One of FStatefulCompute and FStatefulComputeEx must be registered "
+            << "for stateful operator " << op->name;
+        ret[i] = std::make_shared<StatefulComputeExecutor>(
+            dynamic_cast<StatefulComputeExecutor*>(ret[fwd_id].get())->state_,
+            fcompute, exec_type, mutate_index);
+      }
     } else {
-      LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
+      FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
+      FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
+      if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<FComputeExExecutor>(
+            inode.source->attrs, fcomp_ex, exec_type);
+      } else if (fcompute != nullptr) {
+        ret[i] = std::make_shared<FComputeExecutor>(
+            inode.source->attrs, fcompute, exec_type, mutate_index);
+      } else {
+        LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
+      }
     }
   }
-}
-
-
-// pass to attach operator executors
-Graph AttachOpExecs(Graph g) {
-  const auto& idx = g.indexed_graph();
-  OpExecVector ret(idx.num_nodes());
-  for (size_t i = 0; i < idx.num_nodes(); ++i) {
-    CreateOpExecs(g, &ret, i);
-  }
   g.attrs["op_execs"] = std::make_shared<nnvm::any>(ret);
   return g;
 }
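The rewrite above folds the per-node CreateOpExecs helper back into a single
AttachOpExecs loop that builds the executor vector and stores it as the
"op_execs" graph attribute. A minimal sketch of that pass shape, using
standard-library stand-ins for nnvm::Graph and OpExecutor (the names here
are illustrative, not MXNet's):

    #include <memory>
    #include <utility>
    #include <vector>

    struct OpExecutor { virtual ~OpExecutor() = default; };  // toy executor

    struct Graph {
      size_t num_nodes = 0;
      std::shared_ptr<std::vector<std::shared_ptr<OpExecutor>>> op_execs;
    };

    // Pass shape: take the graph by value, create one executor slot per
    // node, attach the vector as an attribute, and return the graph.
    Graph AttachExecsSketch(Graph g) {
      auto ret = std::make_shared<
          std::vector<std::shared_ptr<OpExecutor>>>(g.num_nodes);
      for (size_t i = 0; i < g.num_nodes; ++i)
        (*ret)[i] = std::make_shared<OpExecutor>();  // real pass picks by op
      g.op_execs = std::move(ret);
      return g;
    }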
diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc
index 56122cda6ff..681866296e1 100644
--- a/src/executor/attach_op_resource_pass.cc
+++ b/src/executor/attach_op_resource_pass.cc
@@ -30,15 +30,12 @@
 namespace mxnet {
 namespace exec {
 
-void AttachOpResources(
-    const Graph& g,
-    const OpExecVector& op_execs,
-    size_t start_nid,
-    size_t end_nid) {
+Graph AttachOpResources(Graph g) {
   static auto& fresource =
       nnvm::Op::GetAttr<FResourceRequest>("FResourceRequest");
   static auto& fresource_ex =
       nnvm::Op::GetAttr<FResourceRequestEx>("FResourceRequestEx");
+  auto& op_execs = nnvm::get<OpExecVector>(*g.attrs.at("op_execs"));
   const auto& vctx = g.GetAttr<ContextVector>("context");
   const auto& vdispatch = g.GetAttr<DispatchModeVector>("dispatch_mode");
   const auto& dev_masks = g.GetAttr<DevMaskVector>("dev_mask");
@@ -46,7 +43,7 @@ void AttachOpResources(
   // Use global resource pool for each executor for now.
   std::map<Context, Resource> cached_temp;
   // Resource allocation
-  for (uint32_t nid = start_nid; nid < end_nid; ++nid) {
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     const auto& inode = idx[nid];
     if (inode.source->is_variable()) continue;
     const Context &ctx = vctx[nid];
@@ -87,12 +84,7 @@ void AttachOpResources(
       requested.push_back(ResourceManager::Get()->Request(ctx, ResourceRequest::kTempSpace));
     }
   }
+  return g;
 }
-
-void AttachOpResources(const Graph& g) {
-  const auto& op_execs = g.GetAttr<OpExecVector>("op_execs");
-  AttachOpResources(g, op_execs, 0, g.indexed_graph().num_nodes());
-}
-
 }  // namespace exec
 }  // namespace mxnet
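AttachOpResources now pulls "op_execs" out of the graph's attributes and
walks every node itself, requesting one shared temp-space resource per
context through the cached_temp map. A toy version of that per-context
caching, with Resource and Context reduced to standard-library stand-ins:

    #include <map>
    #include <string>
    #include <vector>

    struct Resource { int id; };
    using Context = std::string;  // stand-in for mxnet::Context

    Resource RequestTempSpace(const Context&) {
      static int next_id = 0;
      return Resource{next_id++};  // pretend resource-manager allocation
    }

    // One temp space per context, shared by all executors on that context.
    std::vector<Resource> AttachTempSpaces(
        const std::vector<Context>& node_ctx) {
      std::map<Context, Resource> cached_temp;
      std::vector<Resource> requested;
      for (const auto& ctx : node_ctx) {
        auto it = cached_temp.find(ctx);
        if (it == cached_temp.end())
          it = cached_temp.emplace(ctx, RequestTempSpace(ctx)).first;
        requested.push_back(it->second);
      }
      return requested;
    }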
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index 26a24911894..99b1b162eae 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -82,10 +82,6 @@ class OpExecutor {
   virtual engine::VarHandle var() const {
     return nullptr;
   }
-  /*! \return return operator state */
-  virtual OpStatePtr state() const {
-    return OpStatePtr();
-  }
 };
 
 /*!
@@ -106,14 +102,6 @@ using ContextVector = std::vector<Context>;
  */
 using DevMaskVector = std::vector<int>;
 
-/*!
- * \brief create OpExecutor for a node in graph
- *
- * \param g input graph
- * \param p_ret OpExecVector for input and output
- * \param i the id of the node
- */
-void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i);
 /*!
  * \brief Attach OpExecutor to the graph attributes.
  *
@@ -127,20 +115,12 @@ Graph AttachOpExecs(Graph g);
  * \brief Attach Resource to the OpExecVector of the graph.
  *
  * \param g input graph; it must contain the op_execs attribute.
- */
-void AttachOpResources(const Graph& g);
-/*!
- * \brief Attach Resource to the OpExecVector
  *
- * \param g input graph
- * \param op_execs OpExecutor vector
- * \param start_nid starting node id
- * \param end_nid end node id
+ * \return graph with attribute "op_execs" of type OpExecVector,
+ *  whose executors have had their requested resources attached.
  */
-void AttachOpResources(const Graph& g,
-                       const OpExecVector& op_execs,
-                       size_t start_nid,
-                       size_t end_nid);
+Graph AttachOpResources(Graph g);
+
 /*!
  * \brief Discover opportunities for inplace addto operators,
  *  i.e. z = plus(z, source_op), and encourage them to become z += source_op.
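For readers skimming the header: the addto pass documented above rewrites
z = plus(z, source_op) into an accumulation so no fresh output buffer is
materialized. A toy illustration of the accumulate-in-place semantics it
targets (plain C++, not MXNet code):

    #include <vector>

    // z += src, elementwise: the effect of the inplace-addto rewrite,
    // replacing plus(z, src) -> new buffer -> copy with direct accumulation.
    void AddTo(std::vector<float>* z, const std::vector<float>& src) {
      for (size_t i = 0; i < src.size(); ++i) (*z)[i] += src[i];
    }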
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 831b5f90023..e28867d5488 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -912,7 +912,7 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol,
   }
 
   g = AttachOpExecs(g);
-  AttachOpResources(g);
+  g = AttachOpResources(g);
   graph_ = std::move(g);
 
   if (shared_exec != nullptr) {
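With AttachOpResources returning the graph, both passes now share the
Graph -> Graph signature and compose functionally. A hedged sketch of the
generic chaining pattern (ApplyPasses is illustrative, not an MXNet API):

    #include <functional>
    #include <utility>
    #include <vector>

    // Thread a graph value through a sequence of Graph -> Graph passes.
    template <typename GraphT>
    GraphT ApplyPasses(
        GraphT g,
        const std::vector<std::function<GraphT(GraphT)>>& passes) {
      for (const auto& pass : passes) g = pass(std::move(g));
      return g;
    }

    // Conceptual usage: g = ApplyPasses(g, {AttachOpExecs, AttachOpResources});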
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index b40605bd25e..140b5a5d81e 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -19,78 +19,16 @@
 #include <unordered_set>
 #include <iostream>
 #include "./imperative_utils.h"
-#include "./cached_op.h"
-#include "../executor/exec_pass.h"
-#include "../profiler/profiler.h"
-
 
 namespace mxnet {
 
 DMLC_REGISTER_PARAMETER(CachedOpConfig);
 
-struct CachedOp::GraphInfo {
-  nnvm::Graph fwd_graph;
-  nnvm::Graph full_graph;
-  std::vector<OpReqType> bwd_output_reqs;
-  std::vector<uint32_t> bwd_input_eid;
-};
-
-struct CachedOp::DynamicRuntime {
-  GraphInfo info;
-  std::vector<NDArray> buff;
-  std::vector<OpStatePtr> op_states;
-};
-
-struct CachedOp::CachedOpState {
-  CachedOpState(const Context& context_,
-                const nnvm::Graph& fwd_graph_,
-                const nnvm::Graph& full_graph_) {
-    context = context_;
-    info.fwd_graph = fwd_graph_;
-    info.full_graph = full_graph_;
-
-    size_t max_nodes = info.full_graph.indexed_graph().num_nodes();
-    size_t max_entries = info.full_graph.indexed_graph().num_node_entries();
-    info.fwd_graph.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(info.fwd_graph.indexed_graph().num_nodes(), context));
-    info.full_graph.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(max_nodes, context));
-
-    buff.resize(max_entries);
-    arrays.resize(max_entries);
-    array_reqs.resize(max_entries);
-    dynamic_entries.resize(max_entries, false);
-    op_states.resize(max_nodes);
-    execs.resize(max_nodes);
-    opr_segs.resize(max_nodes);
-  }
-
-  std::mutex mutex;
-  Context context;
-  GraphInfo info;
-
-  bool recording = false;
-  bool fwd_alloc = false;
-  bool bwd_alloc = false;
-  bool fwd_exec_init = false;
-  bool bwd_exec_init = false;
-
-  std::vector<NDArray> buff;
-  std::vector<NDArray*> arrays;
-  std::vector<OpReqType> array_reqs;
-
-  std::vector<OpStatePtr> op_states;
-  std::vector<std::shared_ptr<exec::OpExecutor> > execs;
-  std::vector<imperative::EngineOprSeg> opr_segs;
-
-  std::vector<bool> dynamic_entries;
-  std::multimap<size_t, NDArray> fwd_reuse_pool;
-  std::multimap<size_t, NDArray> bwd_reuse_pool;
-};
-
-CachedOp::CachedOp(
+Imperative::CachedOp::CachedOp(
     const nnvm::Symbol& sym,
-    const std::vector<std::pair<std::string, std::string> >& flags) {
+    const std::vector<std::pair<std::string, std::string> >& flags,
+    const std::vector<std::string> arg_names,
+    const std::unordered_map<std::string, std::vector<NDArray> >& params) {
   using namespace nnvm;
   using namespace imperative;
   static const std::vector<const Op*> zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")};
@@ -130,22 +68,34 @@ CachedOp::CachedOp(
     fwd_graph_.attrs["forward_ref_count"] =
         std::make_shared<dmlc::any>(std::move(ref_count));
 
-    inlining_ = !config_.static_alloc &&
-        (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit;
+    inlining_ = (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit;
   }
 
   // Set params
   {
     const auto& idx = fwd_graph_.indexed_graph();
-    if (config_.data_indices.ndim() || config_.param_indices.ndim()) {
-      CHECK_EQ(config_.data_indices.ndim() + config_.param_indices.ndim(),
-               idx.input_nodes().size());
-    } else {
-      std::vector<uint32_t> tmp;
-      for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
-        tmp.push_back(i);
+    std::unordered_map<std::string, size_t> arg_name_to_id;
+    for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
+      const auto& name = idx[idx.input_nodes()[i]].source->attrs.name;
+      auto iter = params.find(name);
+      if (iter == params.end()) {
+        arg_name_to_id[name] = i;
+        continue;
+      }
+      fwd_params_idx_.push_back(i);
+      for (const auto& param : iter->second) {
+        params_[param.ctx()].emplace_back(param);
       }
-      config_.data_indices.assign(tmp.begin(), tmp.end());
+    }
+
+    CHECK_EQ(arg_name_to_id.size(), arg_names.size())
+        << "CachedOp expects " << arg_name_to_id.size()
+        << " inputs, given " << arg_names.size();
+
+    for (const auto& name : arg_names) {
+      auto iter = arg_name_to_id.find(name);
+      CHECK(iter != arg_name_to_id.end()) << "Unexpected input name " << name;
+      fwd_args_idx_.push_back(iter->second);
     }
   }
 
@@ -157,14 +107,9 @@ CachedOp::CachedOp(
     }
 
     std::vector<NodeEntry> xs;
-    const auto& idx = fwd_graph_.indexed_graph();
-    for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
-      auto nid = idx.input_nodes()[i];
-      if (idx.mutable_input_nodes().count(nid)) continue;
-      fwd_input_to_grad_output_[i] = xs.size();
-      xs.emplace_back(NodeEntry{idx[nid].weak_ref.lock(), 0, 0});
-    }
-
+    std::vector<NodePtr> args = sym.ListInputs(Symbol::kReadOnlyArgs);
+    xs.reserve(args.size());
+    for (const auto& i : args) xs.emplace_back(NodeEntry{i, 0, 0});
     CHECK_GT(xs.size(), 0)
         << "There are no inputs in computation graph that require gradients.";
 
@@ -180,7 +125,7 @@ CachedOp::CachedOp(
     size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries();
 
     full_graph_.outputs = fwd_graph_.outputs;
-    bwd_output_reqs_ = std::vector<OpReqType>(grad_graph_.outputs.size(), kWriteTo);
+    curr_grad_req_ = std::vector<bool>(grad_graph_.outputs.size(), true);
     for (const auto& i : grad_graph_.outputs) full_graph_.outputs.emplace_back(i);
     const auto& idx = full_graph_.indexed_graph();
 
@@ -224,10 +169,7 @@ CachedOp::CachedOp(
   }
 }
 
-CachedOp::~CachedOp() {
-}
-
-std::vector<nnvm::NodeEntry> CachedOp::Gradient(
+std::vector<nnvm::NodeEntry> Imperative::CachedOp::Gradient(
     const nnvm::NodePtr& node,
     const std::vector<nnvm::NodeEntry>& ograds) {
   using namespace nnvm;
@@ -264,15 +206,13 @@ std::vector<nnvm::NodeEntry> CachedOp::Gradient(
   return ret;
 }
 
-
-bool CachedOp::SetForwardGraph(
-    GraphInfo* info,
-    const bool recording,
-    const std::vector<NDArray*>& inputs) {
+nnvm::Graph Imperative::CachedOp::GetForwardGraph(
+    const bool recording, const std::vector<NDArray*>& inputs) {
   using namespace nnvm;
   using namespace imperative;
+  std::lock_guard<std::mutex> lock(mutex_);
   CHECK_EQ(inputs.size(), num_inputs());
-  nnvm::Graph& g = info->fwd_graph;
+  nnvm::Graph& g = fwd_graph_;
 
   ShapeVector shape_inputs;
   DTypeVector dtype_inputs;
@@ -297,22 +237,18 @@ bool CachedOp::SetForwardGraph(
     g.attrs.erase("forward_mem_plan");
     g.attrs.erase("full_mem_plan");
   } else if (g.attrs.count(recording ? "full_mem_plan" : "forward_mem_plan")) {
-    return true;
+    return g;
   }
 
   const auto& idx = g.indexed_graph();
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
+  for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
   CHECK_EQ(stypes.size(), storage.size());
   for (size_t i = 0; i < stypes.size(); i++) {
-    if (stypes[i] != kDefaultStorage) storage[i] = exec::kDynamicStorageID;
-  }
-  for (const auto i : idx.input_nodes()) {
-    storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
-  }
-  for (size_t i = 0; i < idx.outputs().size(); ++i) {
-    storage[idx.entry_id(idx.outputs()[i])] = exec::kExternalStorageID;
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
   }
 
   auto mem_plan = PlanMemory(
@@ -321,50 +257,51 @@ bool CachedOp::SetForwardGraph(
   g.attrs[recording ? "full_mem_plan" : "forward_mem_plan"] =
       std::make_shared<dmlc::any>(std::move(mem_plan));
 
-  return false;
+  return g;
 }
 
-bool CachedOp::SetBackwardGraph(
-    GraphInfo* info,
+nnvm::Graph Imperative::CachedOp::GetBackwardGraph(
+    const OpStatePtr& op_state,
     const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& inputs,
-    bool detect_inplace_addto) {
+    const std::vector<NDArray*>& inputs) {
   using namespace nnvm;
   using namespace imperative;
   std::lock_guard<std::mutex> lock(mutex_);
-  Context default_ctx = inputs[0]->ctx();
-  nnvm::Graph& g = info->full_graph;
-
-  if (info->bwd_output_reqs != reqs) {
-    info->bwd_output_reqs = reqs;
-    info->bwd_input_eid.clear();
+  nnvm::Graph& g = full_graph_;
+  auto& state = op_state.get_state<CachedOpState>();
+  bool req_match = true;
+  for (size_t i = 0; i < reqs.size(); ++i) {
+    if (curr_grad_req_[i] != (reqs[i] != kNullOp)) {
+      curr_grad_req_[i] = reqs[i] != kNullOp;
+      req_match = false;
+    }
+  }
+  if (!req_match) {
     g = nnvm::Graph();
     g.outputs = fwd_graph_.outputs;
     for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) {
-      if (info->bwd_output_reqs[i] == kNullOp) continue;
-      g.outputs.emplace_back(grad_graph_.outputs[i]);
+      if (curr_grad_req_[i]) g.outputs.emplace_back(grad_graph_.outputs[i]);
     }
-    g.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(g.indexed_graph().num_nodes(), default_ctx));
+    bwd_input_eid_.clear();
   }
 
   const auto& idx = g.indexed_graph();
 
-  if (info->bwd_input_eid.size() != inputs.size()) {
-    info->bwd_input_eid.clear();
+  if (bwd_input_eid_.size() != inputs.size()) {
+    bwd_input_eid_.clear();
     for (const auto& i : bwd_ograd_dep_) {
       auto eid = idx.entry_id(ograd_entries_[i]);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
     for (const auto& i : bwd_in_dep_) {
       auto eid = idx.entry_id(idx.input_nodes()[i], 0);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
     for (const auto& i : bwd_out_dep_) {
       auto eid = idx.entry_id(idx.outputs()[i]);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
-    CHECK_EQ(inputs.size(), info->bwd_input_eid.size());
+    CHECK_EQ(inputs.size(), bwd_input_eid_.size());
   }
 
   size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes();
@@ -375,22 +312,25 @@ bool CachedOp::SetBackwardGraph(
     for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) {
       for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)];
     }
-    for (size_t i = 0; i < inputs.size(); ++i) ++ref_count[info->bwd_input_eid[i]];
+    for (size_t i = 0; i < inputs.size(); ++i) ++ref_count[bwd_input_eid_[i]];
     for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)];
     g.attrs["backward_ref_count"] = std::make_shared<dmlc::any>(std::move(ref_count));
   }
 
-  auto shapes = info->fwd_graph.GetAttr<ShapeVector>("shape");
-  shapes.resize(idx.num_node_entries(), TShape());
-  auto dtypes = info->fwd_graph.GetAttr<DTypeVector>("dtype");
-  dtypes.resize(idx.num_node_entries(), -1);
-  auto stypes = info->fwd_graph.GetAttr<StorageTypeVector>("storage_type");
-  stypes.resize(idx.num_node_entries(), -1);
+  ShapeVector shapes(idx.num_node_entries(), TShape());
+  DTypeVector dtypes(idx.num_node_entries(), -1);
+  StorageTypeVector stypes(idx.num_node_entries(), -1);
+
+  for (size_t i = 0; i < num_forward_entries; ++i) {
+    shapes[i] = state.buff[i].shape();
+    dtypes[i] = state.buff[i].dtype();
+    stypes[i] = state.buff[i].storage_type();
+  }
 
   for (size_t i = 0; i < inputs.size(); ++i) {
-    shapes[info->bwd_input_eid[i]] = inputs[i]->shape();
-    dtypes[info->bwd_input_eid[i]] = inputs[i]->dtype();
-    stypes[info->bwd_input_eid[i]] = inputs[i]->storage_type();
+    shapes[bwd_input_eid_[i]] = inputs[i]->shape();
+    dtypes[bwd_input_eid_[i]] = inputs[i]->dtype();
+    stypes[bwd_input_eid_[i]] = inputs[i]->storage_type();
   }
 
   std::pair<uint32_t, uint32_t> node_range, entry_range;
@@ -402,353 +342,79 @@ bool CachedOp::SetBackwardGraph(
                               node_range, entry_range);
   match &= CheckAndInferType(&g, std::move(dtypes), false,
                              node_range, entry_range);
-  exec::DevMaskVector dev_mask(idx.num_nodes(), default_ctx.dev_mask());
+  exec::DevMaskVector dev_mask(idx.num_nodes(), inputs[0]->ctx().dev_mask());
   match &= CheckAndInferStorageType(&g, std::move(dev_mask), std::move(stypes),
                                     false, node_range, entry_range);
 
   if (!match) {
     g.attrs.erase("backward_mem_plan");
   } else if (g.attrs.count("backward_mem_plan")) {
-    return true;
+    return g;
   }
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
-  const auto& bwd_stypes = g.GetAttr<StorageTypeVector>("storage_type");
-  for (size_t i = 0; i < bwd_stypes.size(); i++) {
-    if (bwd_stypes[i] != kDefaultStorage) storage[i] = exec::kDynamicStorageID;
-  }
   for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID;
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID;
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >("backward_ref_count"),
-      {num_forward_nodes, idx.num_nodes()},
-      {num_forward_entries, idx.num_node_entries()},
-      detect_inplace_addto);
+      {num_forward_nodes, idx.num_nodes()}, {num_forward_entries, idx.num_node_entries()});
   g.attrs["backward_mem_plan"] = std::make_shared<dmlc::any>(std::move(mem_plan));
 
-  return false;
-}
-
-OpStatePtr CachedOp::GetCachedOpState(
-    const Context& ctx) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  for (const auto& i : cached_op_states_[ctx]) {
-    // only create one state per device when not using static memory
-    if (!config_.static_alloc || i.unique()) {
-      return i;
-    }
-  }
-  auto state_ptr = OpStatePtr::Create<CachedOpState>(ctx, fwd_graph_, full_graph_);
-
-  cached_op_states_[ctx].push_back(state_ptr);
-  return state_ptr;
-}
-
-void CachedOp::StaticAllocMemory(
-    const OpStatePtr& state_ptr,
-    bool recording,
-    bool keep_fwd) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& default_ctx = state.context;
-  nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  const auto& vstorage_inplace = g.GetAttr<std::vector<int> >("storage_inplace_index");
-  const auto& mem_plan = g.GetAttr<MemoryPlanVector>(
-      keep_fwd ? "backward_mem_plan" : (recording ? "full_mem_plan" : "forward_mem_plan"));
-  std::vector<int> addto_entry;
-  if (g.attrs.count("addto_entry")) {
-    addto_entry = g.GetAttr<std::vector<int> >("addto_entry");
-  }
-  size_t start_eid =
-      keep_fwd ? state.info.fwd_graph.indexed_graph().num_node_entries() : 0;
-  size_t end_eid = idx.num_node_entries();
-
-  if (!keep_fwd) state.fwd_alloc = false;
-  state.bwd_alloc = false;
-  for (size_t i = start_eid; i < state.buff.size(); ++i) {
-    state.buff[i] = NDArray();
-    state.arrays[i] = &state.buff[i];
-    state.array_reqs[i] = kNullOp;
-    state.dynamic_entries[i] = false;
-  }
-
-  for (auto i : idx.input_nodes()) {
-    auto eid = idx.entry_id(i, 0);
-    if (eid >= start_eid) state.dynamic_entries[eid] = true;
-  }
-  for (auto i : idx.outputs()) {
-    auto eid = idx.entry_id(i);
-    if (eid >= start_eid) state.dynamic_entries[eid] = true;
-  }
-
-  for (size_t i = start_eid; i < end_eid; ++i) {
-    if (addto_entry.size() && addto_entry[i]) {
-      state.array_reqs[i] = kAddTo;
-    } else if (vstorage_inplace[i] >= 0) {
-      state.array_reqs[i] = kWriteInplace;
-    } else if (vstorage_inplace[i] == -2) {
-      // -2 indicate that the entry is never referenced.
-      state.array_reqs[i] = kNullOp;
-    } else {
-      state.array_reqs[i] = kWriteTo;
-    }
-  }
-
-  auto& reuse_pool = keep_fwd ? state.bwd_reuse_pool : state.fwd_reuse_pool;
-  reuse_pool = imperative::AllocateMemory(
-      g, idx, default_ctx, start_eid, end_eid, mem_plan,
-      state.arrays, &state.array_reqs, std::move(reuse_pool));
-
-  state.recording = recording;
-  if (keep_fwd) {
-    state.bwd_alloc = true;
-  } else {
-    state.fwd_alloc = true;
-  }
+  return g;
 }
 
-void CachedOp::StaticInitExec(
-    const OpStatePtr& state_ptr,
-    bool recording,
-    bool keep_fwd) {
+void Imperative::CachedOp::Forward(
+    const std::shared_ptr<CachedOp>& op_ptr,
+    const std::vector<NDArray*>& args,
+    const std::vector<NDArray*>& outputs) {
   using namespace nnvm;
   using namespace imperative;
+  static const auto cached_op = nnvm::Op::Get("_CachedOp");
 
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& default_ctx = state.context;
-  nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  std::vector<int> skip_plus_node;
-  if (g.attrs.count("skip_plus_node")) {
-    skip_plus_node = g.GetAttr<std::vector<int> >("skip_plus_node");
-  }
-  size_t start_nid =
-      keep_fwd ? state.info.fwd_graph.indexed_graph().num_nodes() : 0;
-  size_t end_nid = idx.num_nodes();
-
-  if (!keep_fwd) state.fwd_exec_init = false;
-  state.bwd_exec_init = false;
-
-  for (size_t i = start_nid; i < state.execs.size(); ++i) {
-    state.execs[i].reset();
-    state.opr_segs[i] = EngineOprSeg();
-  }
-
-  if (!config_.static_shape) {
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      state.opr_segs[i].next_nid = i + 1;
-      state.opr_segs[i].skip = skip_plus_node.size() && skip_plus_node[i];
-    }
-  } else {
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      exec::CreateOpExecs(g, &state.execs, i);
-    }
-    exec::AttachOpResources(g, state.execs, start_nid, end_nid);
-
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      bool skip = idx[i].source->is_variable();
-      for (size_t j = 0; !skip && j < idx[i].inputs.size(); ++j) {
-        skip = state.dynamic_entries[idx.entry_id(idx[i].inputs[j])];
-      }
-      for (size_t j = 0; !skip && j < idx[i].source->num_outputs(); ++j) {
-        skip = state.dynamic_entries[idx.entry_id(i, j)];
-      }
-      if (skip) continue;
-      SetupOpExec(g, i, state.execs[i], state.arrays, state.array_reqs);
-    }
+  CHECK_EQ(args.size(), fwd_args_idx_.size())
+      << "CachedOp requires " << fwd_args_idx_.size()
+      << " inputs but got " << args.size();
 
-    size_t bulk_size = idx.num_nodes();
-    std::unordered_set<uint32_t> excludes;
-    if (recording || keep_fwd) {
-      bulk_size = keep_fwd ? config_.backward_bulk_size : config_.forward_bulk_size;
-      for (const auto& i : idx.outputs()) excludes.insert(idx.entry_id(i));
-      for (const auto& i : idx.input_nodes()) excludes.insert(idx.entry_id(i, 0));
-    }
+  Context default_ctx = args[0]->ctx();
 
-    CreateEngineOpSeg(idx, default_ctx, start_nid, end_nid, bulk_size, excludes,
-                      state.execs, skip_plus_node, &state.opr_segs);
-  }
 
-  if (keep_fwd) {
-    state.bwd_exec_init = true;
-  } else {
-    state.fwd_exec_init = true;
+  std::vector<NDArray*> inputs(num_inputs());
+  for (index_t i = 0; i < fwd_args_idx_.size(); ++i) {
+    inputs[fwd_args_idx_[i]] = args[i];
   }
-}
-
-void CachedOp::StaticRunOps(
-    const Context& default_ctx,
-    const nnvm::Graph& g,
-    const OpStatePtr& state_ptr,
-    size_t start_nid,
-    size_t end_nid) {
-  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
-
-  bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning;
-  bool is_training = Imperative::Get()->is_training();
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& idx = g.indexed_graph();
-  const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
-  const auto& op_execs = state.execs;
-
-  std::vector<NDArray*> ndinputs, ndoutputs;
-  nnvm::ShapeVector arg_shapes;
-  nnvm::DTypeVector arg_dtypes;
-  std::vector<OpReqType> req;
+  if (fwd_params_idx_.size()) {
+    CHECK(params_.find(default_ctx) != params_.end())
+        << "CachedOp is not initialized on context " << default_ctx;
 
-  for (size_t i = start_nid; config_.static_shape && i < end_nid; ++i) {
-    if (op_execs[i]) op_execs[i]->op_ctx.is_train = is_training;
-  }
-
-  for (size_t i = start_nid; i < end_nid; i = state.opr_segs[i].next_nid) {
-    const auto& opr_seg = state.opr_segs[i];
-    if (opr_seg.skip) continue;
-    if (opr_seg.opr != nullptr) {
-      Engine::Get()->Push(opr_seg.opr.get(), default_ctx, 0, profiling);
-    } else {
-      const nnvm::IndexedGraph::Node& node = idx[i];
-      if (node.source->is_variable()) continue;
-      auto num_outputs = node.source->num_outputs();
-      ndinputs.clear();
-      ndinputs.reserve(node.inputs.size());
-      for (const auto& j : node.inputs) {
-        ndinputs.emplace_back(state.arrays[idx.entry_id(j)]);
-        CHECK(!ndinputs.back()->is_none());
-      }
-      ndoutputs.clear();
-      ndoutputs.reserve(num_outputs);
-      req.clear();
-      req.reserve(num_outputs);
-      for (size_t j = 0; j < num_outputs; ++j) {
-        size_t eid = idx.entry_id(i, j);
-        ndoutputs.emplace_back(state.arrays[eid]);
-        req.push_back(state.array_reqs[eid]);
-        CHECK(req.back() == kNullOp || !ndoutputs.back()->is_none());
-      }
-      const DispatchMode dispatch_mode = dispatch_modes[i];
-      if (createop.count(node.source->op())) {
-        arg_shapes.clear();
-        arg_dtypes.clear();
-        arg_shapes.reserve(ndinputs.size());
-        arg_dtypes.reserve(ndinputs.size());
-        for (size_t i = 0; i < ndinputs.size(); ++i) {
-          arg_shapes.emplace_back(ndinputs[i]->shape());
-          arg_dtypes.emplace_back(ndinputs[i]->dtype());
-        }
-        state.op_states[i] = createop[node.source->op()](
-            node.source->attrs, default_ctx, arg_shapes, arg_dtypes);
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs, req,
-            dispatch_mode, state.op_states[i]);
-      } else if (is_layer_backward.get(node.source->op(), false)) {
-        nnvm::Node* fwd_node = node.source->control_deps[0].get();
-        auto fwd_node_id = idx.node_id(fwd_node);
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs,
-            req, dispatch_mode, state.op_states[fwd_node_id]);
-      } else {
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs, req,
-            dispatch_mode);
-      }
+    for (size_t i = 0; i < fwd_params_idx_.size(); ++i) {
+      inputs[fwd_params_idx_[i]] = &params_[default_ctx][i];
     }
   }
-}
-
-OpStatePtr CachedOp::StaticForward(
-    const Context& default_ctx,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
 
+  // Initialize
   bool recording = Imperative::Get()->is_recording();
-  auto state_ptr = GetCachedOpState(default_ctx);
-  auto& state = state_ptr.get_state<CachedOpState>();
-  std::lock_guard<std::mutex> lock(state.mutex);
-
-  bool match = SetForwardGraph(&state.info, recording, inputs);
-  match = match && state.recording != recording;
-
-  nnvm::Graph& g = state.info.fwd_graph;
+  nnvm::Graph g = GetForwardGraph(recording, inputs);
   const auto& idx = g.indexed_graph();
-  if (!state.fwd_alloc || !match)  {
-    StaticAllocMemory(state_ptr, recording, false);
-  }
-
-  if (config_.static_shape) {
-    for (auto i : config_.param_indices) {
-      auto nid = idx.input_nodes()[i];
-      if (!state.arrays[idx.entry_id(nid, 0)]->IsSame(*inputs[i])) {
-        match = false;
-        auto ptr = &state.buff[idx.entry_id(nid, 0)];
-        CHECK_EQ(state.arrays[idx.entry_id(nid, 0)], ptr);
-        *state.arrays[idx.entry_id(nid, 0)] = *inputs[i];
-        state.dynamic_entries[idx.entry_id(nid, 0)] = false;
-      }
-    }
-    for (auto i : config_.data_indices) {
-      auto eid = idx.entry_id(idx.input_nodes()[i], 0);
-      state.arrays[eid] = inputs[i];
-    }
-  } else {
-    for (size_t i = 0; i < num_inputs(); ++i) {
-      auto nid = idx.input_nodes()[i];
-      state.arrays[idx.entry_id(nid, 0)] = inputs[i];
-    }
-  }
-
-  if (!state.fwd_exec_init || !match) {
-    StaticInitExec(state_ptr, recording, false);
-  }
-
-  const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
-  const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  size_t num_inputs = idx.input_nodes().size();
 
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    auto eid = idx.entry_id(idx.outputs()[i]);
-    state.arrays[eid] = outputs[i];
-    if (!outputs[i]->is_none()) continue;
-    *outputs[i] = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
-                          shapes[eid], default_ctx, true, dtypes[eid]);
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i]->ctx(), default_ctx)
+        << "CachedOp requires all inputs to live on the same context. But "
+        << idx[idx.input_nodes()[0]].source->attrs.name << " is on " << default_ctx
+        << " while " << idx[idx.input_nodes()[i]].source->attrs.name << " is on "
+        << inputs[i]->ctx();
   }
 
-  StaticRunOps(default_ctx, g, state_ptr, 0, idx.num_nodes());
-
-  return recording ? state_ptr : OpStatePtr();
-}
-
-
-OpStatePtr CachedOp::DynamicForward(
-    const Context& default_ctx,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  // Initialize
-  bool recording = Imperative::Get()->is_recording();
-  auto op_state = OpStatePtr::Create<DynamicRuntime>();
-  auto& runtime = op_state.get_state<DynamicRuntime>();
-  {
-    auto state_ptr = GetCachedOpState(default_ctx);
-    auto& state = state_ptr.get_state<CachedOpState>();
-    std::lock_guard<std::mutex> lock(state.mutex);
-    SetForwardGraph(&state.info, recording, inputs);
-    runtime.info.fwd_graph = state.info.fwd_graph;
-  }
-  nnvm::Graph& g = runtime.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  size_t num_inputs = idx.input_nodes().size();
-  auto& buff = runtime.buff;
-  auto& states = runtime.op_states;
+  auto op_state_ptr = OpStatePtr::Create<CachedOpState>();
+  auto& cached_op_state = op_state_ptr.get_state<CachedOpState>();
+  auto& buff = cached_op_state.buff;
+  auto& states = cached_op_state.states;
 
   // Allocate entries
   states.resize(idx.num_nodes());
@@ -780,98 +446,57 @@ OpStatePtr CachedOp::DynamicForward(
   AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(),
                  mem_plan, arrays, &array_reqs);
 
-  const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
-  const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
-
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    auto eid = idx.entry_id(idx.outputs()[i]);
-    arrays[eid] = outputs[i];
-    if (!outputs[i]->is_none()) continue;
-    *outputs[i] = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
-                          shapes[eid], default_ctx, true, dtypes[eid]);
-  }
-
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
 
   if (recording && !inlining_) Imperative::Get()->set_is_recording(false);
+  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size);
 
-  RunGraph(false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs),
-           std::move(ref_count), &states, dispatch_modes);
+  Imperative::Get()->RunGraph(
+      false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs),
+      std::move(ref_count), &states, dispatch_modes);
 
+  Engine::Get()->set_bulk_size(prev_bulk_size);
   Imperative::Get()->set_is_recording(recording);
 
-  return op_state;
-}
-
-void CachedOp::Forward(
-    const std::shared_ptr<CachedOp>& op_ptr,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  static const auto cached_op = nnvm::Op::Get("_CachedOp");
-
-  CHECK_EQ(inputs.size(), num_inputs());
-
-  Context default_ctx = inputs[0]->ctx();
-
-  const auto& idx = fwd_graph_.indexed_graph();
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i]->ctx(), default_ctx)
-        << "CachedOp requires all inputs to live on the same context. But "
-        << idx[idx.input_nodes()[0]].source->attrs.name
-        << " is on " << default_ctx << " while "
-        << idx[idx.input_nodes()[i]].source->attrs.name
-        << " is on " << inputs[i]->ctx();
-  }
-
-  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size);
-
-  OpStatePtr op_state;
-  if (config_.static_alloc) {
-    op_state = StaticForward(default_ctx, inputs, outputs);
-  } else {
-    op_state = DynamicForward(default_ctx, inputs, outputs);
+  for (size_t i = 0; i < idx.num_node_entries(); ++i) {
+    if (arrays[i] == &buff[i]) continue;
+    buff[i].shape_ = arrays[i]->shape_;
+    buff[i].dtype_ = arrays[i]->dtype_;
+    buff[i].storage_type_ = arrays[i]->storage_type_;
   }
 
-  Engine::Get()->set_bulk_size(prev_bulk_size);
-
-  if (Imperative::Get()->is_recording() && !inlining_) {
+  if (recording && !inlining_) {
     nnvm::NodeAttrs attrs;
     attrs.op = cached_op;
     attrs.name = "_cachedop";
     attrs.parsed = op_ptr;
     Imperative::Get()->RecordOp(
-        std::move(attrs), inputs, outputs, op_state,
+        std::move(attrs), inputs, outputs, op_state_ptr,
         &save_inputs(), &save_outputs());
   }
 }
 
 
-void CachedOp::DynamicBackward(
+void Imperative::CachedOp::Backward(
     const bool retain_graph,
-    const OpStatePtr& op_state,
+    const OpStatePtr& state,
     const std::vector<NDArray*>& inputs,
     const std::vector<OpReqType>& reqs,
     const std::vector<NDArray*>& outputs) {
   using namespace nnvm;
   using namespace imperative;
+  CHECK(!Imperative::Get()->is_recording())
+      << "CachedOp does not support higher order gradients. "
+      << "If you want to do backward with create_graph=True please "
+      << "do not use hybridize.";
 
   // Initialize
-  Context default_ctx = outputs[0]->ctx();
-  auto& runtime = op_state.get_state<DynamicRuntime>();
-  {
-    auto state_ptr = GetCachedOpState(default_ctx);
-    auto& state = state_ptr.get_state<CachedOpState>();
-    std::lock_guard<std::mutex> lock(state.mutex);
-    state.info.fwd_graph = runtime.info.fwd_graph;
-    SetBackwardGraph(&state.info, reqs, inputs);
-    runtime.info.full_graph = state.info.full_graph;
-    runtime.info.bwd_input_eid = state.info.bwd_input_eid;
-  }
-  nnvm::Graph& g = runtime.info.full_graph;
+  nnvm::Graph g = GetBackwardGraph(state, reqs, inputs);
   const auto& idx = g.indexed_graph();
-  auto& buff = runtime.buff;
-  auto& states = runtime.op_states;
+
+  auto& cached_op_state = state.get_state<CachedOpState>();
+  auto& buff = cached_op_state.buff;
+  auto& states = cached_op_state.states;
 
   size_t num_forward_outputs = fwd_graph_.outputs.size();
   size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes();
@@ -881,7 +506,7 @@ void CachedOp::DynamicBackward(
   arrays.reserve(buff.size());
   for (size_t i = 0; i < buff.size(); ++i) arrays.push_back(&buff[i]);
   for (size_t i = 0; i < inputs.size(); ++i) {
-    arrays[runtime.info.bwd_input_eid[i]] = inputs[i];
+    arrays[bwd_input_eid_[i]] = inputs[i];
   }
   for (size_t i = 0, j = num_forward_outputs; i < reqs.size(); ++i) {
     if (reqs[i] == kNullOp) continue;
@@ -905,14 +530,20 @@ void CachedOp::DynamicBackward(
     if (ref_count[i] == 0) array_reqs[i] = kNullOp;
   }
 
+  Context default_ctx = outputs[0]->ctx();
   const auto& mem_plan = g.GetAttr<MemoryPlanVector >("backward_mem_plan");
   AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(),
                  mem_plan, arrays, &array_reqs);
 
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
 
-  RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(),
-           std::move(array_reqs), std::move(ref_count), &states, dispatch_modes);
+  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size);
+
+  Imperative::Get()->RunGraph(
+      retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(),
+      std::move(array_reqs), std::move(ref_count), &states, dispatch_modes);
+
+  Engine::Get()->set_bulk_size(prev_bulk_size);
 
   if (retain_graph) {
     buff.resize(num_forward_entries);
@@ -922,99 +553,6 @@ void CachedOp::DynamicBackward(
   }
 }
 
-void CachedOp::StaticBackward(
-    const bool retain_graph,
-    const OpStatePtr& state_ptr,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  Context default_ctx = outputs[0]->ctx();
-
-  auto& state = state_ptr.get_state<CachedOpState>();
-  std::lock_guard<std::mutex> lock(state.mutex);
-
-  bool match = SetBackwardGraph(&state.info, reqs, inputs, true);
-
-  nnvm::Graph& g = state.info.full_graph;
-  const auto& idx = g.indexed_graph();
-  auto num_forward_nodes = state.info.fwd_graph.indexed_graph().num_nodes();
-
-  if (!state.bwd_alloc || !match) {
-    StaticAllocMemory(state_ptr, true, true);
-  }
-
-  if (config_.static_shape) {
-    for (auto i : config_.param_indices) {
-      const auto iter = fwd_input_to_grad_output_.find(i);
-      if (iter == fwd_input_to_grad_output_.end()) continue;
-      auto entry = grad_graph_.outputs[iter->second];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      if (!state.arrays[eid]->IsSame(*outputs[iter->second]) ||
-          !(state.array_reqs[eid] == reqs[iter->second])) {
-        match = false;
-        state.array_reqs[eid] = reqs[iter->second];
-        *state.arrays[eid] = *outputs[iter->second];
-        state.dynamic_entries[eid] = false;
-      }
-    }
-    for (auto i : config_.data_indices) {
-      const auto iter = fwd_input_to_grad_output_.find(i);
-      if (iter == fwd_input_to_grad_output_.end()) continue;
-      auto entry = grad_graph_.outputs[iter->second];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      state.array_reqs[eid] = reqs[iter->second];
-      state.arrays[eid] = outputs[iter->second];
-    }
-  } else {
-    for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) {
-      auto entry = grad_graph_.outputs[i];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      state.array_reqs[eid] = reqs[i];
-      state.arrays[eid] = outputs[i];
-    }
-  }
-
-  if (!state.bwd_exec_init || !match) {
-    StaticInitExec(state_ptr, true, true);
-  }
-
-  for (size_t i = 0; i < state.info.bwd_input_eid.size(); ++i) {
-    auto eid = state.info.bwd_input_eid[i];
-    if (state.dynamic_entries[eid]) state.arrays[eid] = inputs[i];
-  }
-
-  StaticRunOps(default_ctx, g, state_ptr, num_forward_nodes, idx.num_nodes());
-}
-
-void CachedOp::Backward(
-    const bool retain_graph,
-    const OpStatePtr& state,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace imperative;
-  CHECK(!Imperative::Get()->is_recording())
-      << "CachedOp does not support higher order gradients. "
-      << "If you want to do backward with create_graph=True please "
-      << "do not use hybridize.";
-
-  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size);
-
-  if (config_.static_alloc) {
-    StaticBackward(retain_graph, state, inputs, reqs, outputs);
-  } else {
-    DynamicBackward(retain_graph, state, inputs, reqs, outputs);
-  }
-
-  Engine::Get()->set_bulk_size(prev_bulk_size);
-}
-
 
 NNVM_REGISTER_OP(_CachedOp)
 .set_num_inputs([](const NodeAttrs& attrs) {
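A notable piece of the reworked constructor above is the mapping from
caller-supplied argument names to graph input slots (fwd_args_idx_), with
parameter inputs captured separately per context. A self-contained sketch of
that name-resolution step, with invented names and simplified types:

    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Inputs that are not parameters are matched against the caller's
    // arg_names, producing the index permutation stored in fwd_args_idx_.
    std::vector<size_t> ResolveArgIndices(
        const std::vector<std::string>& graph_inputs,   // graph input names
        const std::vector<std::string>& arg_names,      // caller-supplied
        const std::unordered_map<std::string, int>& params) {
      std::unordered_map<std::string, size_t> arg_name_to_id;
      for (size_t i = 0; i < graph_inputs.size(); ++i)
        if (params.count(graph_inputs[i]) == 0)
          arg_name_to_id[graph_inputs[i]] = i;          // non-param slot
      assert(arg_name_to_id.size() == arg_names.size());
      std::vector<size_t> fwd_args_idx;
      for (const auto& name : arg_names)
        fwd_args_idx.push_back(arg_name_to_id.at(name));  // throws if unknown
      return fwd_args_idx;
    }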
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h
deleted file mode 100644
index 60a40c5e4a5..00000000000
--- a/src/imperative/cached_op.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef MXNET_IMPERATIVE_CACHED_OP_H_
-#define MXNET_IMPERATIVE_CACHED_OP_H_
-
-#include <mxnet/imperative.h>
-#include <vector>
-#include <atomic>
-#include <utility>
-#include <string>
-#include <unordered_map>
-
-namespace mxnet {
-/*! \brief CachedOp Parameters */
-struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
-  uint32_t inline_limit;
-  uint32_t forward_bulk_size;
-  uint32_t backward_bulk_size;
-  bool static_alloc;
-  bool static_shape;
-  nnvm::Tuple<uint32_t> data_indices;
-  nnvm::Tuple<uint32_t> param_indices;
-  DMLC_DECLARE_PARAMETER(CachedOpConfig) {
-    DMLC_DECLARE_FIELD(static_alloc)
-    .set_default(false)
-    .describe("Statically allocate memory to improve speed. "
-              "Memory usage may increase.");
-    DMLC_DECLARE_FIELD(static_shape)
-    .set_default(false)
-    .describe("Optimize for invariant input shapes between iterations. "
-              "Must also set static_alloc to True. "
-              "Change of input shapes is still allowed but slower.");
-    DMLC_DECLARE_FIELD(inline_limit)
-    .set_default(2)
-    .describe("Maximum number of operators that can be inlined.");
-    DMLC_DECLARE_FIELD(forward_bulk_size)
-    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
-    .describe("Segment size of bulk execution during forward pass.");
-    DMLC_DECLARE_FIELD(backward_bulk_size)
-    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
-    .describe("Segment size of bulk execution during backward pass.");
-    DMLC_DECLARE_FIELD(data_indices)
-    .set_default(nnvm::Tuple<uint32_t>())
-    .describe("Position of argument variables.");
-    DMLC_DECLARE_FIELD(param_indices)
-    .set_default(nnvm::Tuple<uint32_t>())
-    .describe("Position of parameters.");
-  }
-};
-
-class CachedOp {
- public:
-  CachedOp(
-      const nnvm::Symbol& sym,
-      const std::vector<std::pair<std::string, std::string> >& flags);
-  ~CachedOp();
-  uint32_t num_inputs() {
-    return fwd_graph_.indexed_graph().input_nodes().size();
-  }
-  uint32_t num_outputs() {
-    return fwd_graph_.outputs.size();
-  }
-  uint32_t num_backward_inputs() {
-    return bwd_ograd_dep_.size() + bwd_in_dep_.size() + bwd_out_dep_.size();
-  }
-  std::vector<bool>& save_inputs() {
-    return save_inputs_;
-  }
-  std::vector<bool>& save_outputs() {
-    return save_outputs_;
-  }
-  const std::unordered_set<uint32_t>& mutable_input_nodes() {
-    return fwd_graph_.indexed_graph().mutable_input_nodes();
-  }
-  std::vector<nnvm::NodeEntry> Gradient(
-      const nnvm::NodePtr& node,
-      const std::vector<nnvm::NodeEntry>& ograds);
-  void Forward(
-      const std::shared_ptr<CachedOp>& op_ptr,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void Backward(
-      const bool retain_graph,
-      const OpStatePtr& state,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-
- private:
-  struct GraphInfo;
-  struct DynamicRuntime;
-  struct CachedOpState;
-
-  OpStatePtr GetCachedOpState(const Context& ctx);
-  bool SetForwardGraph(
-      GraphInfo* info,
-      const bool recording,
-      const std::vector<NDArray*>& inputs);
-  bool SetBackwardGraph(
-      GraphInfo* info,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& inputs,
-      bool detect_inplace_addto = false);
-  OpStatePtr DynamicForward(
-      const Context& default_ctx,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void DynamicBackward(
-      const bool retain_graph,
-      const OpStatePtr& op_state,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-  void StaticAllocMemory(
-      const OpStatePtr& state_ptr,
-      bool recording,
-      bool keep_fwd);
-  void StaticInitExec(
-      const OpStatePtr& state_ptr,
-      bool recording,
-      bool keep_fwd);
-  void StaticRunOps(
-      const Context& default_ctx,
-      const nnvm::Graph& g,
-      const OpStatePtr& state_ptr,
-      size_t start_nid,
-      size_t end_nid);
-  OpStatePtr StaticForward(
-      const Context& default_ctx,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void StaticBackward(
-      const bool retain_graph,
-      const OpStatePtr& state_ptr,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-
-  CachedOpConfig config_;
-  nnvm::Graph fwd_graph_;
-  nnvm::Graph grad_graph_;
-  nnvm::Graph full_graph_;
-  bool inlining_;
-  std::vector<nnvm::NodeEntry> ograd_entries_;
-  std::vector<uint32_t> bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_;
-  std::unordered_map<uint32_t, uint32_t> fwd_input_to_grad_output_;
-  std::vector<bool> save_inputs_, save_outputs_;
-  std::vector<OpReqType> bwd_output_reqs_;
-
-  std::mutex mutex_;
-  std::unordered_map<Context, std::vector<OpStatePtr> > cached_op_states_;
-};
-
-using CachedOpPtr = std::shared_ptr<CachedOp>;
-
-}  // namespace mxnet
-#endif  // MXNET_IMPERATIVE_CACHED_OP_H_
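The deleted header declared CachedOpConfig with dmlc's chainable
set_default/describe field builders. A toy version of that builder idiom,
assuming only the standard library (FieldEntry here is illustrative, not
dmlc's class):

    #include <iostream>
    #include <string>

    template <typename T>
    class FieldEntry {
     public:
      FieldEntry& set_default(const T& v) { value_ = v; return *this; }
      FieldEntry& describe(const std::string& d) { doc_ = d; return *this; }
      const T& value() const { return value_; }
     private:
      T value_{};
      std::string doc_;
    };

    int main() {
      FieldEntry<unsigned> inline_limit;
      inline_limit.set_default(2)
                  .describe("Maximum number of operators inlined.");
      std::cout << inline_limit.value() << "\n";  // prints 2
    }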
diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc
index e1654259a2f..7caf305eac7 100644
--- a/src/imperative/imperative.cc
+++ b/src/imperative/imperative.cc
@@ -19,7 +19,6 @@
 #include <unordered_set>
 #include <iostream>
 #include "./imperative_utils.h"
-#include "./cached_op.h"
 
 namespace mxnet {
 #if DMLC_CXX11_THREAD_LOCAL
@@ -267,6 +266,95 @@ void Imperative::RecordOp(
   }
 }
 
+void Imperative::RunGraph(
+    const bool retain_graph,
+    const nnvm::IndexedGraph& idx,
+    const std::vector<NDArray*> arrays,
+    size_t node_start, size_t node_end,
+    std::vector<OpReqType>&& array_reqs,
+    std::vector<uint32_t>&& ref_count,
+    std::vector<OpStatePtr> *p_states,
+    const DispatchModeVector &dispatch_modes) {
+  using namespace nnvm;
+  using namespace imperative;
+  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
+  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
+  static const auto bwd_cached_op = Op::Get("_backward_CachedOp");
+
+  std::vector<OpStatePtr>& states = *p_states;
+  bool recording = is_recording();
+
+  std::vector<NDArray*> ndinputs, ndoutputs;
+  ShapeVector arg_shapes;
+  DTypeVector arg_dtypes;
+  std::vector<OpReqType> req;
+
+  for (size_t i = node_start; i < node_end; ++i) {
+    const nnvm::IndexedGraph::Node& node = idx[i];
+    if (node.source->op() == nullptr) continue;
+    auto num_outputs = node.source->num_outputs();
+    ndinputs.clear();
+    ndinputs.reserve(node.inputs.size());
+    for (const auto& j : node.inputs) {
+      ndinputs.emplace_back(arrays[idx.entry_id(j)]);
+      CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index;
+    }
+    ndoutputs.clear();
+    ndoutputs.reserve(num_outputs);
+    req.clear();
+    req.reserve(num_outputs);
+    for (size_t j = 0; j < num_outputs; ++j) {
+      size_t eid = idx.entry_id(i, j);
+      ndoutputs.emplace_back(arrays[eid]);
+      req.push_back(array_reqs[eid]);
+      CHECK(!ndoutputs.back()->is_none());
+    }
+    const Context& ctx = ndoutputs[0]->ctx();
+    const DispatchMode dispatch_mode = dispatch_modes[i];
+    if (node.source->op() == bwd_cached_op) {
+      const auto& cached_op = dmlc::get<CachedOpPtr>(node.source->attrs.parsed);
+      nnvm::Node* fwd_node = node.source->control_deps[0].get();
+      auto fwd_node_id = idx.node_id(fwd_node);
+      cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs);
+    } else if (createop.count(node.source->op())) {
+      arg_shapes.clear();
+      arg_dtypes.clear();
+      arg_shapes.reserve(ndinputs.size());
+      arg_dtypes.reserve(ndinputs.size());
+      for (size_t i = 0; i < ndinputs.size(); ++i) {
+        arg_shapes.emplace_back(ndinputs[i]->shape());
+        arg_dtypes.emplace_back(ndinputs[i]->dtype());
+      }
+      states[i] = createop[node.source->op()](
+          node.source->attrs, ctx, arg_shapes, arg_dtypes);
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[i]);
+      if (recording) RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[i]);
+    } else if (is_layer_backward.get(node.source->op(), false)) {
+      nnvm::Node* fwd_node = node.source->control_deps[0].get();
+      auto fwd_node_id = idx.node_id(fwd_node);
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs,
+               req, dispatch_mode, states[fwd_node_id]);
+      if (recording) {
+        RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[fwd_node_id]);
+      }
+    } else {
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode);
+      if (recording) RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs);
+    }
+
+    for (const auto& j : node.inputs) {
+      size_t eid = idx.entry_id(j);
+      --ref_count[eid];
+      if (ref_count[eid] == 0) arrays[eid]->ptr_.reset();
+    }
+    for (size_t j = 0; j < ndoutputs.size(); ++j) {
+      size_t eid = idx.entry_id(i, j);
+      if (ref_count[eid] == 0) arrays[eid]->ptr_.reset();
+    }
+  }
+}
+
+
 std::vector<NDArray*> Imperative::Backward(
     const std::vector<NDArray*>& outputs,
     const std::vector<NDArray*>& ograds,
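
A note on the release pattern at the tail of the relocated RunGraph: the
moved-in ref_count table lets the loop free each intermediate as soon as its
last consumer has run. Where the deleted imperative_utils.cc version replaced
the whole array (*arrays[eid] = NDArray()), the method above only resets the
underlying data pointer (arrays[eid]->ptr_.reset()), so shape/dtype metadata
survive. A minimal standalone sketch of the idea, with std::shared_ptr
standing in for the NDArray's internal data handle:

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <vector>

    struct Entry { std::shared_ptr<float> data; /* shape/dtype kept */ };

    // Toy analogue: decrement each consumed entry's count and drop the
    // buffer (but not the Entry itself) once the last consumer has run.
    void ReleaseAfterUse(std::vector<Entry>* arrays,
                         std::vector<uint32_t>* ref_count,
                         const std::vector<size_t>& consumed_eids) {
      for (size_t eid : consumed_eids) {
        if (--(*ref_count)[eid] == 0) (*arrays)[eid].data.reset();
      }
    }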
diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc
deleted file mode 100644
index 464aefc220d..00000000000
--- a/src/imperative/imperative_utils.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "./imperative_utils.h"
-#include "./cached_op.h"
-
-namespace mxnet {
-namespace imperative {
-void RunGraph(
-    const bool retain_graph,
-    const nnvm::IndexedGraph& idx,
-    const std::vector<NDArray*> arrays,
-    size_t node_start, size_t node_end,
-    std::vector<OpReqType>&& array_reqs,
-    std::vector<uint32_t>&& ref_count,
-    std::vector<OpStatePtr> *p_states,
-    const DispatchModeVector &dispatch_modes) {
-  using namespace nnvm;
-  using namespace imperative;
-  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
-  static const auto bwd_cached_op = Op::Get("_backward_CachedOp");
-
-  const auto imp = Imperative::Get();
-
-  std::vector<OpStatePtr>& states = *p_states;
-  bool recording = imp->is_recording();
-
-  std::vector<NDArray*> ndinputs, ndoutputs;
-  ShapeVector arg_shapes;
-  DTypeVector arg_dtypes;
-  std::vector<OpReqType> req;
-
-  for (size_t i = node_start; i < node_end; ++i) {
-    const nnvm::IndexedGraph::Node& node = idx[i];
-    if (node.source->op() == nullptr) continue;
-    auto num_outputs = node.source->num_outputs();
-    ndinputs.clear();
-    ndinputs.reserve(node.inputs.size());
-    for (const auto& j : node.inputs) {
-      ndinputs.emplace_back(arrays[idx.entry_id(j)]);
-      CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index;
-    }
-    ndoutputs.clear();
-    ndoutputs.reserve(num_outputs);
-    req.clear();
-    req.reserve(num_outputs);
-    for (size_t j = 0; j < num_outputs; ++j) {
-      size_t eid = idx.entry_id(i, j);
-      ndoutputs.emplace_back(arrays[eid]);
-      req.push_back(array_reqs[eid]);
-      CHECK(array_reqs[eid] == kNullOp || !ndoutputs.back()->is_none());
-    }
-    const Context& ctx = ndoutputs[0]->ctx();
-    const DispatchMode dispatch_mode = dispatch_modes[i];
-    if (node.source->op() == bwd_cached_op) {
-      const auto& cached_op = dmlc::get<CachedOpPtr>(node.source->attrs.parsed);
-      nnvm::Node* fwd_node = node.source->control_deps[0].get();
-      auto fwd_node_id = idx.node_id(fwd_node);
-      cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs);
-    } else if (createop.count(node.source->op())) {
-      arg_shapes.clear();
-      arg_dtypes.clear();
-      arg_shapes.reserve(ndinputs.size());
-      arg_dtypes.reserve(ndinputs.size());
-      for (size_t i = 0; i < ndinputs.size(); ++i) {
-        arg_shapes.emplace_back(ndinputs[i]->shape());
-        arg_dtypes.emplace_back(ndinputs[i]->dtype());
-      }
-      states[i] = createop[node.source->op()](
-          node.source->attrs, ctx, arg_shapes, arg_dtypes);
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[i]);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[i]);
-      }
-    } else if (is_layer_backward.get(node.source->op(), false)) {
-      nnvm::Node* fwd_node = node.source->control_deps[0].get();
-      auto fwd_node_id = idx.node_id(fwd_node);
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs,
-               req, dispatch_mode, states[fwd_node_id]);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[fwd_node_id]);
-      }
-    } else {
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs);
-      }
-    }
-
-    for (const auto& j : node.inputs) {
-      size_t eid = idx.entry_id(j);
-      --ref_count[eid];
-      if (ref_count[eid] == 0) *arrays[eid] = NDArray();
-    }
-    for (size_t j = 0; j < ndoutputs.size(); ++j) {
-      size_t eid = idx.entry_id(i, j);
-      if (ref_count[eid] == 0) *arrays[eid] = NDArray();
-    }
-  }
-}
-
-}  // namespace imperative
-}  // namespace mxnet
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 726531d0299..06b7e058dd1 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -23,7 +23,6 @@
 #include <utility>
 #include <algorithm>
 #include <vector>
-#include <map>
 #include <string>
 #include "../executor/graph_executor.h"
 #include "../executor/exec_pass.h"
@@ -39,24 +38,11 @@ namespace mxnet {
 namespace imperative {
 
 struct MemoryPlanInfo {
-  int storage_id;
-  uint32_t root;
+  uint32_t sid;
   size_t size;
   bool inplace;
 };
 
-struct EngineOprDeleter {
-  void operator()(engine::Opr* handle) {
-    Engine::Get()->DeleteOperator(handle);
-  }
-};
-
-struct EngineOprSeg {
-  bool skip;
-  size_t next_nid;
-  std::unique_ptr<engine::Opr, EngineOprDeleter> opr;
-};
-
 using MemoryPlanVector = std::vector<MemoryPlanInfo>;
 
 inline Context GetContext(const nnvm::NodeAttrs& attrs,
@@ -729,12 +715,10 @@ inline std::vector<Context> PlaceDevice(const nnvm::IndexedGraph& idx) {
 
 
 inline MemoryPlanVector PlanMemory(
-    nnvm::Graph* p_g,
-    nnvm::StorageVector&& storage,
+    nnvm::Graph* p_g, nnvm::StorageVector&& storage,
     const std::vector<uint32_t>& ref_count,
     const std::pair<uint32_t, uint32_t>& node_range = {0, 0},
-    const std::pair<uint32_t, uint32_t>& entry_range = {0, 0},
-    bool detect_inplace_addto = false) {
+    const std::pair<uint32_t, uint32_t>& entry_range = {0, 0}) {
   using namespace nnvm;
   nnvm::Graph& g = *p_g;
   const auto& idx = g.indexed_graph();
@@ -744,31 +728,31 @@ inline MemoryPlanVector PlanMemory(
   g.attrs["ref_count"] = std::make_shared<dmlc::any>(ref_count);
   g.attrs["storage"] = std::make_shared<dmlc::any>(std::move(storage));
   g = nnvm::ApplyPass(g, "PlanMemory");
-  if (detect_inplace_addto) g = exec::DetectInplaceAddTo(g);
 
   const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
   const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& storage_inplace = g.GetAttr<std::vector<int> >("storage_inplace_index");
-  const auto& storage_ids = g.GetAttr<StorageVector>("storage_id");
+  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  auto storage_ids = g.MoveCopyAttr<StorageVector>("storage_id");
+  auto storage_inplace = g.MoveCopyAttr<std::vector<int> >("storage_inplace_index");
   uint32_t entry_start = entry_range.first;
   uint32_t entry_end =
       entry_range.second > entry_start ? entry_range.second : idx.num_node_entries();
   MemoryPlanVector mem_plan(idx.num_node_entries());
-  std::unordered_map<int, uint32_t> sid_to_root;
+  std::unordered_map<int, uint32_t> sid_to_loc;
 
   for (uint32_t i = entry_start; i < entry_end; ++i) {
+    if (stypes[i] != kDefaultStorage) continue;
     if (storage_ids[i] < 0) {
-      mem_plan[i] = {storage_ids[i], i, 0, false};
-    } else if (!sid_to_root.count(storage_ids[i])) {
+      mem_plan[i] = {i, mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size(), false};
+    } else if (!sid_to_loc.count(storage_ids[i])) {
       CHECK_LT(storage_inplace[i], 0);
-      sid_to_root[storage_ids[i]] = i;
-      mem_plan[i] = {storage_ids[i], i,
-                     mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size(),
-                     false};
+      sid_to_loc[storage_ids[i]] = i;
+      mem_plan[i].sid = i;
+      mem_plan[i].size = mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size();
     } else {
-      uint32_t root = sid_to_root[storage_ids[i]];
-      mem_plan[i] = {storage_ids[i], root, 0, storage_inplace[i] >= 0};
-      mem_plan[root].size = std::max(mem_plan[root].size,
+      uint32_t loc = sid_to_loc[storage_ids[i]];
+      mem_plan[i] = {loc, 0, storage_inplace[i] >= 0};
+      mem_plan[loc].size = std::max(mem_plan[loc].size,
           mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size());
     }
   }
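
To make the new sharing scheme concrete: the first entry seen for a storage
id becomes the root ("loc"), later entries alias it, and only the root's size
matters, growing to the maximum requirement over its aliases. A toy
standalone version under those assumptions (inplace flagging is simplified
here; the real pass consults storage_inplace and skips non-default storage):

    #include <algorithm>
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    struct PlanInfo { uint32_t sid; size_t size; bool inplace; };

    std::vector<PlanInfo> ToyPlan(const std::vector<int>& storage_ids,
                                  const std::vector<size_t>& bytes) {
      std::vector<PlanInfo> plan(storage_ids.size());
      std::unordered_map<int, uint32_t> sid_to_loc;
      for (uint32_t i = 0; i < static_cast<uint32_t>(storage_ids.size()); ++i) {
        auto it = sid_to_loc.find(storage_ids[i]);
        if (it == sid_to_loc.end()) {
          sid_to_loc[storage_ids[i]] = i;     // root entry owns the buffer
          plan[i] = {i, bytes[i], false};
        } else {
          plan[i] = {it->second, 0, true};    // alias points at the root
          plan[it->second].size = std::max(plan[it->second].size, bytes[i]);
        }
      }
      return plan;
    }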
@@ -777,213 +761,39 @@ inline MemoryPlanVector PlanMemory(
 }
 
 
-inline std::multimap<size_t, NDArray> AllocateMemory(
-    const nnvm::Graph& g,
-    const nnvm::IndexedGraph& idx,
-    const Context& default_ctx,
-    const uint32_t entry_start, const uint32_t entry_end,
-    const MemoryPlanVector& mem_plan,
-    const std::vector<NDArray*>& arrays,
-    std::vector<OpReqType> *array_reqs,
-    std::multimap<size_t, NDArray>&& pool = std::multimap<size_t, NDArray>()) {
+inline void AllocateMemory(const nnvm::Graph& g,
+                    const nnvm::IndexedGraph& idx,
+                    const Context& default_ctx,
+                    const uint32_t entry_start, const uint32_t entry_end,
+                    const MemoryPlanVector& mem_plan,
+                    const std::vector<NDArray*>& arrays,
+                    std::vector<OpReqType> *array_reqs) {
   using namespace nnvm;
   const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
   const auto& shapes = g.GetAttr<ShapeVector>("shape");
   const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
 
-  std::multimap<size_t, NDArray> new_pool;
-
   for (uint32_t i = entry_start; i < entry_end; ++i) {
-    if (mem_plan[i].storage_id == exec::kExternalStorageID) continue;
-    CHECK(arrays[i]->is_none());
-    if (mem_plan[i].storage_id == exec::kDynamicStorageID) {
-      *arrays[i] = NDArray(static_cast<NDArrayStorageType>(stypes[i]),
-                           shapes[i], default_ctx, true, dtypes[i]);
-      continue;
-    }
-    CHECK_EQ(stypes[i], kDefaultStorage);
-    if (mem_plan[i].root == i) {
-      CHECK_GT(mem_plan[i].size, 0);
-      auto iter = pool.lower_bound(mem_plan[i].size);
-      if (iter != pool.end()) {
-        *arrays[i] = iter->second.AsArray(shapes[i], dtypes[i]);
-        new_pool.insert(*iter);
-        pool.erase(iter);
-      } else {
+    if (!arrays[i]->is_none()) continue;
+    if (stypes[i] == kDefaultStorage) {
+      if (mem_plan[i].sid == i) {
+        CHECK_GT(mem_plan[i].size, 0);
         NDArray buff(TShape({static_cast<nnvm::dim_t>(mem_plan[i].size)}),
                      default_ctx, true, mshadow::kUint8);
         *arrays[i] = buff.AsArray(shapes[i], dtypes[i]);
-        new_pool.insert({mem_plan[i].size, buff});
-      }
-    } else {
-      CHECK_GE(mem_plan[mem_plan[i].root].storage_id, 0);
-      *arrays[i] = arrays[mem_plan[i].root]->AsArray(shapes[i], dtypes[i]);
-      if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) {
-        array_reqs->at(i) = kWriteInplace;
-      }
-    }
-  }
-
-  return new_pool;
-}
-
-inline void SetupOpExec(
-    const nnvm::Graph& g,
-    size_t nid,
-    const std::shared_ptr<exec::OpExecutor>& exec,
-    const std::vector<NDArray*> arrays,
-    const std::vector<OpReqType> array_reqs) {
-  const auto& idx = g.indexed_graph();
-  const auto& inode = idx[nid];
-  CHECK_EQ(exec->in_array.size(), 0U);
-  CHECK_EQ(exec->out_array.size(), 0U);
-  for (const auto& e : inode.inputs) {
-    CHECK(!arrays[idx.entry_id(e)]->is_none()) << inode.source->attrs.name;
-    exec->in_array.push_back(*arrays[idx.entry_id(e)]);
-  }
-  for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
-    uint32_t eid = idx.entry_id(nid, index);
-    CHECK(!arrays[eid]->is_none()) << inode.source->attrs.name;
-    exec->out_array.push_back(*arrays[eid]);
-    exec->req.push_back(array_reqs[eid]);
-  }
-
-  exec->Setup();
-}
-
-inline Engine::OprHandle CreateEngineOp(
-    const Context& default_ctx,
-    const std::vector<std::shared_ptr<exec::OpExecutor> >& execs) {
-  CHECK_GT(execs.size(), 0);
-  std::vector<Engine::VarHandle> use_vars, mutate_vars;
-
-  for (const auto& exec : execs) {
-    CHECK_GT(exec->out_array.size(), 0);
-    CHECK(execs.size() == 1 || exec->exec_type() == ExecType::kSync);
-
-    // the variables
-    for (const auto& nd : exec->in_array) {
-      use_vars.push_back(nd.var());
-    }
-    for (auto& r : exec->op_ctx.requested) {
-      mutate_vars.push_back(r.var);
-    }
-    for (auto& nd : exec->out_array) {
-      mutate_vars.push_back(nd.var());
-    }
-    if (exec->var() != nullptr) {
-      mutate_vars.push_back(exec->var());
-    }
-  }
-
-  // dedup vars
-  Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars);
-  bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask;
-  bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync;
-
-  auto exec_fun = [execs, is_async, is_gpu] (
-      RunContext ctx, Engine::CallbackOnComplete on_complete) {
-    if (is_async) {
-      execs[0]->op_ctx.async_on_complete = on_complete;
-    }
-    for (const auto& exec : execs) exec->Run(ctx, is_gpu);
-    // call on complete only if it is async op
-    if (!is_async) {
-      if (is_gpu) {
-      #if MXNET_USE_CUDA
-        // Wait GPU kernel to finish.
-        ctx.get_stream<gpu>()->Wait();
-      #else
-        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
-      #endif
-      }
-      on_complete();
-    }
-  };
-
-  return Engine::Get()->NewOperator(
-      exec_fun, use_vars, mutate_vars, FnProperty::kNormal);
-}
-
-inline void CreateEngineOpSeg(
-    const nnvm::IndexedGraph& idx,
-    const Context default_ctx,
-    const size_t start_nid,
-    const size_t end_nid,
-    const size_t bulk_size,
-    const std::unordered_set<uint32_t>& excludes,
-    const std::vector<std::shared_ptr<exec::OpExecutor> >& execs,
-    const std::vector<int> skip_plus_node,
-    std::vector<EngineOprSeg> *opr_segs) {
-  size_t seg_start = start_nid;
-  std::vector<std::shared_ptr<exec::OpExecutor> > seg_execs;
-  for (size_t nid = start_nid; nid < end_nid; ++nid) {
-    const auto& node = idx[nid];
-    if (node.source->is_variable()) continue;
-    if (skip_plus_node.size() && skip_plus_node[nid]) continue;
-    auto& exec = execs[nid];
-    bool is_async = exec->exec_type() != ExecType::kSync;
-    bool valid = exec->out_array.size() > 0;
-
-    // Stop at async nodes and invalid node (due to input/output is not allocated)
-    bool stop = is_async || !valid || seg_execs.size() >= bulk_size;
-    for (size_t i = 0; i < node.inputs.size() && !stop; ++i) {
-      if (excludes.count(idx.entry_id(node.inputs[i]))) stop = true;
-    }
-    auto num_outputs = node.source->num_outputs();
-    for (size_t i = 0; i < num_outputs && !stop; ++i) {
-      if (excludes.count(idx.entry_id(nid, i))) stop = true;
-    }
-
-    // Create opr segment for previous nodes.
-    if (stop && nid > seg_start) {
-      auto& seg = (*opr_segs)[seg_start];
-      if (seg_execs.size()) {
-        seg = EngineOprSeg{false, nid};
-        seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
       } else {
-        seg = EngineOprSeg{true, nid, nullptr};
+        *arrays[i] = arrays[mem_plan[i].sid]->AsArray(shapes[i], dtypes[i]);
+        if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) {
+          array_reqs->at(i) = kWriteInplace;
+        }
       }
-      seg_start = nid;
-      seg_execs.clear();
-    }
-
-    seg_execs.push_back(exec);
-
-    auto& seg = (*opr_segs)[nid];
-    if (is_async) {
-      seg = EngineOprSeg{false, nid + 1};
-      seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
-      seg_execs.clear();
-      seg_start = nid + 1;
-    } else if (!valid) {
-      seg = EngineOprSeg{false, nid + 1, nullptr};
-      seg_execs.clear();
-      seg_start = nid + 1;
-    }
-  }
-  // The last segment
-  if (end_nid > seg_start) {
-    auto& seg = (*opr_segs)[seg_start];
-    if (seg_execs.size()) {
-      seg = EngineOprSeg{false, end_nid};
-      seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
     } else {
-      seg = EngineOprSeg{true, end_nid, nullptr};
+      *arrays[i] = NDArray(static_cast<NDArrayStorageType>(stypes[i]),
+                           shapes[i], default_ctx, true, dtypes[i]);
     }
   }
 }
 
-
-void RunGraph(const bool retain_graph,
-              const nnvm::IndexedGraph& idx,
-              const std::vector<NDArray*> arrays,
-              size_t node_start, size_t node_end,
-              std::vector<OpReqType>&& array_reqs,
-              std::vector<uint32_t>&& ref_count,
-              std::vector<OpStatePtr> *p_states,
-              const DispatchModeVector &dispatch_modes);
-
 }  // namespace imperative
 }  // namespace mxnet
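
One consequence of the slimmed-down AllocateMemory worth spelling out: each
default-storage root gets a flat uint8 buffer of the planned byte size, and
every alias reinterprets that same storage under its own shape/dtype via
AsArray, with kWriteTo demoted to kWriteInplace for inplace aliases. A toy
analogue of the aliasing, runnable on its own:

    #include <cstdint>
    #include <vector>

    // Toy analogue of AsArray-style aliasing: one byte pool, two typed views.
    int main() {
      std::vector<uint8_t> pool(16 * sizeof(float));          // root buffer
      float* view_a = reinterpret_cast<float*>(pool.data());  // entry i
      float* view_b = reinterpret_cast<float*>(pool.data());  // alias of i
      view_a[0] = 1.5f;
      return view_b[0] == 1.5f ? 0 : 1;  // both views see the same storage
    }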
 
diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index c99280ac7ea..8b93e8358d5 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -39,6 +39,7 @@
 #include "./mshadow_op.h"
 #include "./random/sampler.h"
 #include "./random/sample_op.h"
+#include "./tensor/elemwise_binary_broadcast_op.h"
 
 namespace mxnet {
 namespace op {
@@ -72,12 +73,6 @@ struct LeakyReLUParam : public dmlc::Parameter<LeakyReLUParam> {
   }
 };
 
-struct prelu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a > 0.0f ? 0.0f : a;
-  }
-};
-
 template<typename xpu, typename DType>
 class LeakyReLUOp : public Operator {
  public:
@@ -98,28 +93,51 @@ class LeakyReLUOp : public Operator {
     Tensor<xpu, 3, DType> data;
     Tensor<xpu, 3, DType> out;
     Tensor<xpu, 3, DType> mask;
-    Tensor<xpu, 1, DType> weight;
     int n = in_data[leakyrelu::kData].shape_[0];
     int k = in_data[leakyrelu::kData].shape_[1];
     Shape<3> dshape = Shape3(n, k, in_data[leakyrelu::kData].Size()/n/k);
     data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
     out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
+    if (req[leakyrelu::kOut] == kNullOp) {
+      return;
+    }
     switch (param_.act_type) {
       case leakyrelu::kLeakyReLU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
             s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(param_.slope));
         });
         break;
       }
       case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        if (weight.shape_.Size() == 1) {
-          Assign(out, req[leakyrelu::kOut],
-                 F<mshadow_op::xelu>(data, mshadow::expr::broadcast_scalar(weight, out.shape_)));
+        TShape gshape = expand_shape(in_data[leakyrelu::kGamma].shape_,
+                                     in_data[leakyrelu::kData].shape_);
+        TShape new_lshape, new_rshape, new_oshape;
+        const int ndim = op::BinaryBroadcastShapeCompact(in_data[leakyrelu::kData].shape_,
+                                                         gshape,
+                                                         out_data[leakyrelu::kOut].shape_,
+                                                         &new_lshape, &new_rshape, &new_oshape);
+        if (!ndim) {
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+            const size_t size = (minthree(out_data[leakyrelu::kOut].Size(),
+                                          in_data[leakyrelu::kData].Size(),
+                                          in_data[leakyrelu::kGamma].Size())
+            + DataType<DType>::kLanes - 1) / DataType<DType>::kLanes;
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
+                s, size, out_data[leakyrelu::kOut].dptr<DType>(),
+            in_data[leakyrelu::kData].dptr<DType>(), in_data[leakyrelu::kGamma].dptr<DType>());
+          });
         } else {
-          Assign(out, req[leakyrelu::kOut],
-                 F<mshadow_op::xelu>(data, mshadow::expr::broadcast<1>(weight, out.shape_)));
+          BROADCAST_NDIM_SWITCH(ndim, NDim, {
+            mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
+            mshadow::Shape<NDim> lstride = mxnet_op::calc_stride(new_lshape.get<NDim>());
+            mshadow::Shape<NDim> rstride = mxnet_op::calc_stride(new_rshape.get<NDim>());
+            mxnet_op::Kernel<mxnet_op::binary_broadcast_kernel<NDim, DType,
+                                                               mshadow_op::xelu>, xpu>::
+            template LaunchEx(s, new_oshape.Size(), req[leakyrelu::kOut], lstride, rstride, oshape,
+            in_data[leakyrelu::kData].dptr<DType>(), in_data[leakyrelu::kGamma].dptr<DType>(),
+            out_data[leakyrelu::kOut].dptr<DType>());
+          });
         }
         break;
       }
@@ -134,23 +152,23 @@ class LeakyReLUOp : public Operator {
           Tensor<xpu, 1, DType> out = mask.FlatTo1D();
           sampler.Sample(low, high, out, pgen, s);
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::mul, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
               DType(param_.upper_bound - param_.lower_bound));
           });
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
               DType(param_.lower_bound));
           });
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), out.dptr_, data.dptr_, mask.dptr_);
           });
         } else {
           const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f;
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
               s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(slope));
           });
         }
@@ -158,7 +176,7 @@ class LeakyReLUOp : public Operator {
       }
       case leakyrelu::kELU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::elu, Req>, xpu>::Launch(
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::elu, Req>, xpu>::Launch(
             s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_,
             DType(param_.slope));
         });
@@ -188,8 +206,6 @@ class LeakyReLUOp : public Operator {
     Tensor<xpu, 3, DType> gdata;
     Tensor<xpu, 3, DType> grad;
     Tensor<xpu, 3, DType> mask;
-    Tensor<xpu, 1, DType> weight;
-    Tensor<xpu, 1, DType> grad_weight;
     int n = out_grad[leakyrelu::kOut].shape_[0];
     int k = out_grad[leakyrelu::kOut].shape_[1];
     Shape<3> dshape = Shape3(n, k, out_grad[leakyrelu::kOut].Size()/n/k);
@@ -206,29 +222,38 @@ class LeakyReLUOp : public Operator {
       case leakyrelu::kLeakyReLU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
           mxnet_op::Kernel<mxnet_op::op_with_req<
-            mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::xelu_grad>, Req>, xpu>::Launch(
+            mxnet_op::backward_grad_tuned<mshadow_op::xelu_grad>, Req>, xpu>::Launch(
               s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
               output.dptr_, DType(param_.slope));
         });
         break;
       }
       case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        if (weight.shape_.Size() == 1) {
-          Shape<4> gshape = Shape4(1, grad.shape_[0], grad.shape_[1], grad.shape_[2]);
-          Assign(grad_weight, req[leakyrelu::kGamma],
-                 sumall_except_dim<0>(reshape(F<prelu_grad>(data) * grad, gshape)));
-          Assign(gdata, req[leakyrelu::kData],
-                 F<mshadow_op::xelu_grad>(data,
-                                          mshadow::expr::broadcast_scalar(weight, data.shape_))
-                 * grad);
+        TShape gshape = expand_shape(in_grad[leakyrelu::kGamma].shape_,
+                                     in_grad[leakyrelu::kData].shape_);
+        TShape new_lshape, new_rshape, new_oshape;
+        const bool need_bc = BinaryBroadcastShapeCompact(in_grad[leakyrelu::kData].shape_,
+                                                         gshape,
+                                                         out_grad[leakyrelu::kOut].shape_,
+                                                         &new_lshape,
+                                                         &new_rshape,
+                                                         &new_oshape) != 0;
+        if (!need_bc) {
+          ElemwiseBinaryOp::BackwardUseIn<xpu,
+                                          mshadow_op::xelu_grad,
+                                          mshadow_op::prelu_grad>(
+            nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut],
+                                     in_data[leakyrelu::kData],
+                                     in_data[leakyrelu::kGamma]}, req, in_grad);
         } else {
-          Assign(grad_weight, req[leakyrelu::kGamma],
-                 sumall_except_dim<1>(F<prelu_grad>(data) * grad));
-          Assign(gdata, req[leakyrelu::kData],
-                 F<mshadow_op::xelu_grad>(data, mshadow::expr::broadcast<1>(weight, data.shape_))
-                 * grad);
+          BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, {
+            BinaryBroadcastBackwardUseInImpl<xpu, NDim, DType,
+              mshadow_op::xelu_grad, mshadow_op::prelu_grad>(
+                ctx, {out_grad[leakyrelu::kOut],
+                      in_data[leakyrelu::kData],
+                      in_data[leakyrelu::kGamma]}, req, in_grad,
+                new_lshape, new_rshape, new_oshape);
+          });
         }
         break;
       }
@@ -239,7 +264,7 @@ class LeakyReLUOp : public Operator {
       case leakyrelu::kELU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
           mxnet_op::Kernel<mxnet_op::op_with_req<
-            mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::elu_grad>, Req>, xpu>::Launch(
+            mxnet_op::backward_grad_tuned<mshadow_op::elu_grad>, Req>, xpu>::Launch(
               s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
               output.dptr_, DType(param_.slope));
         });
@@ -251,6 +276,24 @@ class LeakyReLUOp : public Operator {
   }
 
  private:
+  /*! \brief Minimum of three */
+  static MSHADOW_XINLINE size_t minthree(const size_t a, const size_t b, const size_t c) {
+    return a < b ? (a < c ? a : c) : (b < c ? b : c);
+  }
+  static inline TShape expand_shape(const TShape& src, const TShape& dst) {
+    TShape result(dst.ndim());
+    int s = src.ndim() - 1;
+    for (int i = dst.ndim() - 1; i >= 0; i--) {
+      if (s >= 0 && (dst[i] == src[s] || src[s] == 1)) {
+        result[i] = src[s];
+        s--;
+      } else {
+        result[i] = 1;
+      }
+    }
+    CHECK(s == -1) << "Cannot broadcast gamma to data. gamma: " << src << ", data: " << dst;
+    return result;
+  }
   LeakyReLUParam param_;
 };  // class LeakyReLUOp
 
@@ -281,10 +324,12 @@ class LeakyReLUProp : public OperatorProperty {
     if (dshape.ndim() == 0) return false;
     if (param_.act_type == leakyrelu::kPReLU) {
       const TShape &gshape = in_shape->at(leakyrelu::kGamma);
-      if (gshape.ndim() == 1 && gshape.Size() == 1)
-        in_shape->at(leakyrelu::kGamma) = TShape(Shape1(1));
-      else
+      if (gshape.ndim() == 0) {
         in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1]));
+      }
+      if (dshape == gshape) {
+        SHAPE_ASSIGN_CHECK(*out_shape, 0, dshape);
+      }
     }
     out_shape->clear();
     out_shape->push_back(dshape);
@@ -396,6 +441,11 @@ class LeakyReLUProp : public OperatorProperty {
     }
   }
 
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
   Operator* CreateOperator(Context ctx) const override {
     LOG(FATAL) << "Not Implemented.";
     return NULL;
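
The gamma-broadcasting rule added above is easy to sanity-check in isolation.
Below is a standalone mirror of expand_shape over std::vector (illustrative
only, not the MXNet TShape version): gamma of shape (C,) against data of
shape (N, C, H, W) expands to (1, C, 1, 1), after which the regular
binary-broadcast kernels handle the rest.

    #include <cassert>
    #include <vector>

    // Mirrors expand_shape: right-align src against dst, filling 1 elsewhere.
    std::vector<int> Expand(const std::vector<int>& src,
                            const std::vector<int>& dst) {
      std::vector<int> out(dst.size(), 1);
      int s = static_cast<int>(src.size()) - 1;
      for (int i = static_cast<int>(dst.size()) - 1; i >= 0; --i) {
        if (s >= 0 && (dst[i] == src[s] || src[s] == 1)) out[i] = src[s--];
      }
      assert(s == -1 && "gamma not broadcastable to data");
      return out;
    }

    int main() {
      auto r = Expand({3}, {2, 3, 4, 5});  // gamma (C,) vs data (N,C,H,W)
      return (r == std::vector<int>{1, 3, 1, 1}) ? 0 : 1;
    }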
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 19fa4f8ead8..5953568c7fa 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -126,6 +126,8 @@ MXNET_UNARY_MATH_OP_NC(relu, a > DType(0) ? a : DType(0));
 
 MXNET_UNARY_MATH_OP_NC(relu_grad, a > DType(0) ? DType(1) : DType(0));
 
+MXNET_BINARY_MATH_OP_NC(prelu_grad, a > DType(0) ? DType(0) : a);
+
 MXNET_BINARY_MATH_OP_NC(xelu, a > DType(0) ? a :
                         DType(static_cast<float>(a) * static_cast<float>(b)));
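
For reference, the new prelu_grad kernel encodes the PReLU gradient with
respect to gamma: with f(a) = a for a > 0 and f(a) = gamma * a otherwise,
df/dgamma is 0 for a > 0 and a otherwise, which is exactly the expression
above. It is declared as a binary op only so it fits the
ElemwiseBinaryOp::BackwardUseIn machinery now used in leaky_relu-inl.h; the
second operand does not appear in the formula.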
 
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 5632d73c261..d40abaf1fd6 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -125,6 +125,8 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   }
 };
 
+void ConvolutionParamParser(nnvm::NodeAttrs* attrs);
+
 typedef ParamOpSign<ConvolutionParam> ConvSignature;
 
 }  // namespace op
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 0e8a929e1ba..ef70ccd6ec1 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -331,7 +331,7 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
                              dispatch_mode, wanted_mode);
 }
 
-static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
+void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
   using namespace mshadow;
   ConvolutionParam param_;
   try {
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 84cf6403043..bc3ee366007 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -51,7 +51,11 @@ class CuDNNPoolingOp {
         mode_ = CUDNN_POOLING_MAX;
         break;
       case pool_enum::kAvgPooling:
-        mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        if (param_.count_include_pad.has_value() && !param_.count_include_pad.value()) {
+          mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+        } else {
+          mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        }
         break;
       default:
         LOG(FATAL) << "Not implmented";
@@ -263,7 +267,7 @@ class CuDNNPoolingOp {
                                              &(pad_vec[0]),
                                              &(stride_vec[0])));
       #else
-      LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
+      LOG(FATAL) << "3D pooling only support CUDNN v5 and above";
       #endif
     }
   }
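
A worked example of the distinction: for 3x3 average pooling with pad = 1,
the output window at an image corner covers only 4 in-bounds pixels.
COUNT_INCLUDE_PADDING divides the window sum by the full 9 elements, while
COUNT_EXCLUDE_PADDING divides by the 4 valid pixels, so the two modes differ
exactly on windows overlapping the padding. The new branch keeps cuDNN
consistent with the count_include_pad attribute honored elsewhere in this
patch (MKLDNN pooling and the CUDA kernels in pool.cuh).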
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index a278456ea26..fae72bd9221 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -161,15 +161,15 @@ void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 
   NDArray in_buffer = in_data;
+  MKLDNNStream *stream = MKLDNNStream::Get();
+
   if (in_data.IsView() && in_data.IsMKLDNNData())
     in_buffer = in_data.Reorder2Default();
 
   auto input_mem = in_buffer.GetMKLDNNData();
   MKLDNNActForward &fwd = GetActForward(param, ctx, in_buffer, *input_mem);
-  auto out_mem = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_primitive_desc(),
-                                 req);
+  auto out_mem = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_primitive_desc(), req, &in_buffer);
   fwd.SetNewMem(*input_mem, *out_mem.second);
-  MKLDNNStream *stream = MKLDNNStream::Get();
   stream->RegisterPrim(fwd.GetFwd());
   CommitOutput(out_data, out_mem);
   stream->Submit();
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index bd2faf5775a..6a7c58f2991 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -324,13 +324,16 @@ typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
  * The difference is that the first function can create MKLDNN memory with
  * special layouts in an NDArray, while the second one can only create MKLDNN
  * memory with default layouts.
+ * Also, an optional in_arr parameter can be passed to the first function with
+ * the kWriteInplace req to check whether MKLDNN can support writing in place;
+ * otherwise new memory will be written to and copied back onto out_arr.
  * If these two functions are used, we have to call CommitOutput to write
  * the output back to the output NDArray.
  */
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
                                 const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                OpReqType req, const NDArray* in_arr = nullptr);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
                                        const mkldnn::memory::primitive_desc &desc,
                                        OpReqType req);
 /* This function has to be used with one of the functions above. */
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 1bd1581dbc2..b182aa0b68d 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -77,29 +77,42 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
   }
 }
 
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+bool CanWriteTo(const NDArray &out_arr,
+                const NDArray &in_arr,
+                const mkldnn::memory::primitive_desc &desc) {
+  auto in_mem = in_arr.GetMKLDNNData();
+  bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
+  bool pdesc_same = out_arr.GetMKLDNNData()->get_primitive_desc() == desc &&
+      in_mem->get_primitive_desc() == desc;
+  return add_same && pdesc_same;
+}
+
+mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
                                 const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req) {
+                                OpReqType req,
+                                const NDArray* in_arr) {
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::AddBack, tmp);
-  } else if (kWriteInplace == req) {
-    // MKLDNN ops may not support the case that the input and the output uses
-    // the same memory. Let's use an extra copy to make sure it always works.
+  } else if (req == kWriteInplace && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
+    mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+    // mem is nullptr if out_arr is a view and desc is an MKLDNN format;
+    // callers need to Reorder2Default before calling CreateMKLDNNMem.
+    CHECK(mem != nullptr);
+    return mkldnn_output_t(OutDataOp::Noop, mem);
+  } else if (req == kWriteInplace) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  }
+  mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+  if (nullptr == mem) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-  } else {
-    mkldnn::memory *mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
-    if (mem == nullptr) {
-      auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-    } else {
-      return mkldnn_output_t(OutDataOp::Noop, mem);
-    }
   }
+  return mkldnn_output_t(OutDataOp::Noop, mem);
 }
 
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
                                        const mkldnn::memory::primitive_desc &desc,
                                        OpReqType req) {
   if (kAddTo == req) {
@@ -113,7 +126,7 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
     auto def_format = GetDefaultFormat(_desc.desc());
     mkldnn::memory *mem = nullptr;
     if (def_format == _desc.desc().data.format) {
-      mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+      mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
     }
     if (mem == nullptr) {
       auto tmp = TmpMemMgr::Get()->Alloc(desc);
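
Pulling the branches of CreateMKLDNNMem together, a request now resolves as
follows (a summary of the hunk above, not new behavior):

    kAddTo                                  -> temp buffer, AddBack on commit
    kWriteInplace with CanWriteTo(out, in)  -> output's own memory, Noop
    kWriteInplace otherwise                 -> temp buffer, CopyBack on commit
    other reqs, CreateMKLDNNData succeeds   -> output's own memory, Noop
    other reqs, CreateMKLDNNData fails      -> temp buffer, CopyBack on commit

CanWriteTo is the safety gate for true in-place execution: the input and
output must share a data handle and both must already match the requested
primitive descriptor.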
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
new file mode 100644
index 00000000000..23f2fe69463
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_convolution-inl.h
+ * \brief
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <utility>
+#include "../convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, const bool is_train, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output);
+
+class MKLDNNConvForward {
+ public:
+  mkldnn::convolution_forward::primitive_desc fwd_pd;
+
+  MKLDNNConvForward(const ConvolutionParam& param, const bool is_train,
+                    const NDArray &data, const NDArray &weights,
+                    const NDArray *bias, const NDArray &output): fwd_pd(
+                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
+                 const mkldnn::memory *bias, const mkldnn::memory &output);
+
+  const mkldnn::convolution_forward &GetFwd() const {
+    return *fwd_;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward> fwd_;
+  std::shared_ptr<mkldnn::memory> data_;
+  std::shared_ptr<mkldnn::memory> weight_;
+  std::shared_ptr<mkldnn::memory> bias_;
+  std::shared_ptr<mkldnn::memory> out_;
+};
+
+typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
+
+MKLDNNConvForward &GetConvFwd(const nnvm::NodeAttrs& attrs,
+    const bool is_train, const NDArray &data, const NDArray &weights,
+    const NDArray *bias, const NDArray &output);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
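
The intended call pattern for the now-public class, following how this patch
uses it in mkldnn_convolution.cc (a sketch; attrs/ctx come from the operator,
data/weight/bias/out are its NDArrays, and in_mem, weight_mem, bias_mem,
out_mem are mkldnn::memory handles assumed to be prepared by the caller):

    // Fetch the per-signature cached forward, rewire it to this call's
    // tensors, then run the cached primitive on the MKLDNN stream.
    MKLDNNConvForward &fwd = GetConvFwd(attrs, ctx.is_train, data, weight,
                                        bias, out);
    fwd.SetNewMem(*in_mem, *weight_mem, bias_mem, *out_mem);
    MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
    MKLDNNStream::Get()->Submit();

SetNewMem only swaps data handles on the cached mkldnn::memory objects, so
repeated calls with the same signature avoid rebuilding the primitive.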
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index f851a6d2535..cf04ea8da3d 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -23,11 +23,14 @@
  * \author Da Zheng
 */
 
+
+#if MXNET_USE_MKLDNN == 1
+
 #include "../convolution-inl.h"
 #include "./mkldnn_ops-inl.h"
 #include "./mkldnn_base-inl.h"
+#include "./mkldnn_convolution-inl.h"
 
-#if MXNET_USE_MKLDNN == 1
 namespace mxnet {
 namespace op {
 
@@ -37,8 +40,8 @@ bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
   return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
 }
 
-static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
-    const ConvolutionParam& param, bool is_train, const NDArray &data,
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, const bool is_train, const NDArray &data,
     const NDArray &weights, const NDArray *bias, const NDArray &output) {
   auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
   auto data_md = GetMemDesc(data);
@@ -162,73 +165,51 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   }
 }
 
-class MKLDNNConvForward {
-  std::shared_ptr<mkldnn::convolution_forward> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> weight;
-  std::shared_ptr<mkldnn::memory> bias;
-  std::shared_ptr<mkldnn::memory> out;
+void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data,
+                                  const mkldnn::memory &weight,
+                                  const mkldnn::memory *bias,
+                                  const mkldnn::memory &output) {
+  if (this->data_ == nullptr)
+    this->data_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.src_primitive_desc(), data.get_data_handle()));
+  else
+    this->data_->set_data_handle(data.get_data_handle());
 
- public:
-  mkldnn::convolution_forward::primitive_desc fwd_pd;
+  if (this->weight_ == nullptr)
+    this->weight_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+  else
+    this->weight_->set_data_handle(weight.get_data_handle());
 
-  MKLDNNConvForward(const ConvolutionParam& param, bool is_train,
-                    const NDArray &data, const NDArray &weights,
-                    const NDArray *bias, const NDArray &output): fwd_pd(
-                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
-  }
-
-  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
-                 const mkldnn::memory *bias, const mkldnn::memory &output) {
-    if (this->data == nullptr)
-      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.src_primitive_desc(), data.get_data_handle()));
-    else
-      this->data->set_data_handle(data.get_data_handle());
+  if (this->out_ == nullptr)
+    this->out_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+  else
+    this->out_->set_data_handle(output.get_data_handle());
 
-    if (this->weight == nullptr)
-      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+  if (bias != nullptr) {
+    if (this->bias_ == nullptr)
+      this->bias_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
     else
-      this->weight->set_data_handle(weight.get_data_handle());
-
-    if (this->out == nullptr)
-      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-    else
-      this->out->set_data_handle(output.get_data_handle());
-
-    if (bias != nullptr) {
-      if (this->bias == nullptr)
-        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-                fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
-      else
-        this->bias->set_data_handle(bias->get_data_handle());
-      if (this->fwd == nullptr)
-        this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-            new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                            mkldnn::primitive::at(*this->weight),
-                                            mkldnn::primitive::at(*this->bias),
-                                            *this->out));
-    } else if (this->fwd == nullptr) {
-      this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                          mkldnn::primitive::at(*this->weight),
-                                          *this->out));
-    }
+      this->bias_->set_data_handle(bias->get_data_handle());
+    if (this->fwd_ == nullptr)
+      this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
+          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_),
+                                          mkldnn::primitive::at(*this->weight_),
+                                          mkldnn::primitive::at(*this->bias_),
+                                          *this->out_));
+  } else if (this->fwd_ == nullptr) {
+    this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
+        new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_),
+                                        mkldnn::primitive::at(*this->weight_),
+                                        *this->out_));
   }
+}
 
-  const mkldnn::convolution_forward &GetFwd() const {
-    return *fwd;
-  }
-};
-
-typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-
-static inline MKLDNNConvForward &GetConvFwd(
-    const nnvm::NodeAttrs& attrs, bool is_train,
-    const NDArray &data, const NDArray &weights,
-    const NDArray *bias, const NDArray &output) {
+MKLDNNConvForward &GetConvFwd(const nnvm::NodeAttrs& attrs, const bool is_train,
+                              const NDArray &data, const NDArray &weights,
+                              const NDArray *bias, const NDArray &output) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash> fwds;
 #else
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
index 4b6235ec446..691e1d371b5 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
@@ -119,6 +119,10 @@ void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
                               const NDArray &out_grad, const NDArray &in_data,
                               const NDArray *workspace, const OpReqType req,
                               const NDArray &in_grad);
+MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
+                                const bool is_train,
+                                const NDArray &data,
+                                const NDArray &output);
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc
index 259af2b9402..9fd88a13c46 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling.cc
+++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc
@@ -121,7 +121,11 @@ mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
       return mkldnn::algorithm::pooling_max;
       break;
     case pool_enum::kAvgPooling:
-      return mkldnn::algorithm::pooling_avg_include_padding;
+      if (param.count_include_pad.has_value() && !param.count_include_pad.value()) {
+        return mkldnn::algorithm::pooling_avg_exclude_padding;
+      } else {
+        return mkldnn::algorithm::pooling_avg_include_padding;
+      }
       break;
     default:
       LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
index fdbfb1558f6..c51e1081d69 100644
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -58,7 +58,6 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
   std::vector<float> scales(inputs.size(), 1);
   in_prims.reserve(inputs.size());
-  bool pd_same = true;
   std::vector<NDArray> in_bufs(inputs.size());
   for (size_t i = 0; i < inputs.size(); i++) {
     const mkldnn::memory *in_mem;
@@ -73,31 +72,11 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   }
 
   mkldnn::sum::primitive_desc pdesc(scales, in_pds);
-  pd_same = pd_same && (pdesc.dst_primitive_desc() == in_pds[0]);
-  auto out_mem = const_cast<NDArray&>(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc());
-  bool addr_same = false;
-  const void *first_data_handle;
-  if (in_bufs[0].is_none())
-    first_data_handle = inputs[0].GetMKLDNNData()->get_data_handle();
-  else
-    first_data_handle = in_bufs[0].GetMKLDNNData()->get_data_handle();
-  if (out_mem)
-    addr_same = out_mem->get_data_handle() == first_data_handle;
-  if (((req == kWriteTo) || (req == kWriteInplace && pd_same && addr_same))
-      && out_mem) {
-    // do sum computation directly on output NDArray
-    MKLDNNStream *stream = MKLDNNStream::Get();
-    stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem));
-    stream->Submit();
-  } else {
-    // req == kWriteInplace but cannot be handled by mkldnn and
-    // req == kAddTo will run into this branch
-    auto mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
-    MKLDNNStream *stream = MKLDNNStream::Get();
-    stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second));
-    CommitOutput(out_data, mem);
-    stream->Submit();
-  }
+  auto mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req, &inputs[0]);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second));
+  CommitOutput(out_data, mem);
+  stream->Submit();
 }
 
 }  // namespace op
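
The sum rewrite above is a direct consumer of the new CreateMKLDNNMem
overload: instead of hand-checking descriptor and address equality, it passes
&inputs[0] and lets CanWriteTo decide between true in-place execution and a
CopyBack temporary, which also removes the now-unused pd_same bookkeeping.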
diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh
index 9d004d295be..976aacf63a5 100644
--- a/src/operator/nn/pool.cuh
+++ b/src/operator/nn/pool.cuh
@@ -214,16 +214,19 @@ template <typename DType, int p = 1>
 __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
                                        const int width, const int pooled_width, const int kernel_w,
                                        const int stride_w, const int pad_w, DType* out_data,
-                                       const bool getAvg = false) {
+                                       const bool get_avg = false, const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int c = (index / pooled_width) % channels;
     const int n = index / pooled_width / channels;
     int wstart = pw * stride_w - pad_w;
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (wend - wstart) : 1);
+    int pool_size = (get_avg? (wend - wstart) : 1);
     wstart = max(wstart, 0);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * width;
     for (int w = wstart; w < wend; ++w) {
@@ -244,7 +247,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_h, const int stride_w,
                                        const int pad_h, const int pad_w, DType* out_data,
-                                       const bool getAvg = false) {
+                                       const bool get_avg = false,
+                                       const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int ph = (index / pooled_width) % pooled_height;
@@ -254,11 +258,14 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
     int wstart = pw * stride_w - pad_w;
     int hend = min(hstart + kernel_h, height + pad_h);
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
+    int pool_size = (get_avg? (hend - hstart) * (wend - wstart) : 1);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     hend = min(hend, height);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (hend - hstart) * (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
@@ -282,7 +289,8 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_d, const int stride_h, const int stride_w,
                                        const int pad_d, const int pad_h, const int pad_w,
-                                       DType* out_data, const bool getAvg = false) {
+                                       DType* out_data, const bool get_avg = false,
+                                       const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int ph = (index / pooled_width) % pooled_height;
@@ -295,13 +303,16 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
     int dend = min(dstart + kernel_d, depth + pad_d);
     int hend = min(hstart + kernel_h, height + pad_h);
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+    int pool_size = (get_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
     dstart = max(dstart, 0);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     dend = min(dend, depth);
     hend = min(hend, height);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * depth * height * width;
     for (int d = dstart; d < dend; ++d) {
@@ -311,7 +322,9 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
         }
       }
     }
-    out_data[index] = a_root_p<DType, p>::Map(sum);
+    out_data[index] = (pool_size == 0) ?
+                      DType(nanf("")) :
+                      a_root_p<DType, p>::Map(sum);
   }
 }
 
@@ -487,7 +500,8 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int channels, const int width,
                                          const int pooled_width, const int kernel_w,
                                          const int stride_w, const int pad_w, DType* in_grad,
-                                         const bool isAvg = false) {
+                                         const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -506,7 +520,12 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
       // figure out the pooling size
       int wstart = pw * stride_w - pad_w;
       int wend = min(wstart + kernel_w, width + pad_w);
-      int pool_size = (isAvg? (wend - wstart) : 1);
+      int pool_size = (is_avg? (wend - wstart) : 1);
+      if (is_avg && !count_include_pad) {
+        wstart = max(wstart, 0);
+        wend = min(wend, width);
+        pool_size = (wend - wstart);
+      }
       gradient +=
         lp_grad<DType, p>::Map(out_grad_slice[pw], in_data[index], out_data_slice[pw]) / pool_size;
     }
@@ -528,7 +547,8 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int kernel_h, const int kernel_w,
                                          const int stride_h, const int stride_w,
                                          const int pad_h, const int pad_w, DType* in_grad,
-                                         const bool isAvg = false) {
+                                         const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCHW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -553,8 +573,15 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
         int wstart = pw * stride_w - pad_w;
         int hend = min(hstart + kernel_h, height + pad_h);
         int wend = min(wstart + kernel_w, width + pad_w);
-        int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
+        int pool_size = (is_avg? (hend - hstart) * (wend - wstart) : 1);
         int out_index = ph * pooled_width + pw;
+        if (is_avg && !count_include_pad) {
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, height);
+          wend = min(wend, width);
+          pool_size = (hend - hstart) * (wend - wstart);
+        }
         gradient +=
           lp_grad<DType, p>::Map(out_grad_slice[out_index],
                                  in_data[index],
@@ -580,7 +607,8 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int kernel_d, const int kernel_h,
                                          const int kernel_w, const int stride_d, const int stride_h,
                                          const int stride_w, const int pad_d, const int pad_h,
-                                         const int pad_w, DType* in_grad, const bool isAvg = false) {
+                                         const int pad_w, DType* in_grad, const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCDHW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -611,8 +639,17 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
           int dend = min(dstart + kernel_d, depth + pad_d);
           int hend = min(hstart + kernel_h, height + pad_h);
           int wend = min(wstart + kernel_w, width + pad_w);
-          int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (is_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
           int out_index = (pd * pooled_height + ph) * pooled_width + pw;
+          if (is_avg && !count_include_pad) {
+            dstart = max(dstart, 0);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            dend = min(dend, depth);
+            hend = min(hend, height);
+            wend = min(wend, width);
+            pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          }
           gradient += lp_grad<DType, p>::Map(out_grad_slice[out_index],
                                              in_data[index],
                                              out_data_slice[out_index]) / pool_size;
@@ -643,7 +680,7 @@ template<typename DType, int p>
 inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
-                 DType* out_data) {
+                 DType* out_data, const bool count_include_pad) {
   CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
   using namespace mxnet_op;
   if (kernel.ndim() == 1) {
@@ -659,7 +696,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
       pool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
                                0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                    oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
-                                   kernel[0], stride[0], pad[0], out_data, true);
+                                   kernel[0], stride[0], pad[0], out_data,
+                                   true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -693,7 +731,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                    oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
                                    oshape[2], oshape[3], kernel[0], kernel[1],
-                                   stride[0], stride[1], pad[0], pad[1], out_data, true);
+                                   stride[0], stride[1], pad[0], pad[1], out_data,
+                                   true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -731,7 +770,7 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                    oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
                                    ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
                                    kernel[1], kernel[2], stride[0], stride[1], stride[2],
-                                   pad[0], pad[1], pad[2], out_data, true);
+                                   pad[0], pad[1], pad[2], out_data, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -777,7 +816,8 @@ template<typename DType, int p>
 inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
-                   const int pool_type, OpReqType req_type, DType* in_grad) {
+                   const int pool_type, OpReqType req_type, DType* in_grad,
+                   const bool count_include_pad) {
   if (mxnet::kNullOp == req_type) return;
   if (mxnet::kAddTo != req_type) {
     mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, ishape.Size(), in_grad);
@@ -798,7 +838,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                      ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], oshape[2], kernel[0],
-                                     stride[0], pad[0], in_grad, true);
+                                     stride[0], pad[0], in_grad, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -836,7 +876,8 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                      ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3],
                                      oshape[2], oshape[3], kernel[0], kernel[1],
-                                     stride[0], stride[1], pad[0], pad[1], in_grad, true);
+                                     stride[0], stride[1], pad[0], pad[1], in_grad,
+                                     true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -878,7 +919,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                      ishape[1], ishape[2], ishape[3], ishape[4],
                                      oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
                                      kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
-                                     pad[2], in_grad, true);
+                                     pad[2], in_grad, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
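
For reference, a minimal host-side sketch of the divisor logic the 1-D kernel
above now uses (the standalone function and all names are illustrative, not
part of this patch): pool_size is first computed over the padded window, then
recomputed over the clipped window when count_include_pad is false.

    #include <algorithm>
    #include <cstdio>

    // Divisor for one 1-D average-pooling window, mirroring the kernel logic.
    int PoolDivisor1D(int pw, int stride_w, int pad_w, int kernel_w, int width,
                      bool count_include_pad) {
      int wstart = pw * stride_w - pad_w;
      int wend = std::min(wstart + kernel_w, width + pad_w);
      int pool_size = wend - wstart;   // padded positions counted
      wstart = std::max(wstart, 0);
      wend = std::min(wend, width);
      if (!count_include_pad)
        pool_size = wend - wstart;     // only in-bounds positions counted
      return pool_size;
    }

    int main() {
      // width=3, kernel=3, stride=1, pad=1: the first window spans [-1, 2).
      std::printf("%d\n", PoolDivisor1D(0, 1, 1, 3, 3, true));   // prints 3
      std::printf("%d\n", PoolDivisor1D(0, 1, 1, 3, 3, false));  // prints 2
    }
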
diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h
index 9fe43b2bd46..8f7a5edc832 100644
--- a/src/operator/nn/pool.h
+++ b/src/operator/nn/pool.h
@@ -216,7 +216,8 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -229,9 +230,12 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = (getAvg ? (wend - wstart) : 1);
+        int pool_size = (get_avg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
+        if (get_avg && !count_include_pad) {
+          pool_size = (wend - wstart);
+        }
         DType sum = 0;
         for (int w = wstart; w < wend; ++w) {
           sum += a_pow_p<DType, p>::Map(in_data[w]) / pool_size;
@@ -251,7 +255,8 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -267,11 +272,14 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = (getAvg ? (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (get_avg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
           wend = std::min(wend, width);
+          if (get_avg && !count_include_pad) {
+            pool_size = (hend - hstart) * (wend - wstart);
+          }
           DType sum = 0;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -294,7 +302,8 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -313,13 +322,16 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = (getAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+            int pool_size = (get_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
             dend = std::min(dend, depth);
             hend = std::min(hend, height);
             wend = std::min(wend, width);
+            if (get_avg && !count_include_pad) {
+              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            }
             DType sum = 0;
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
@@ -328,7 +340,9 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
                 }
               }
             }
-            out_data[(pd*pooled_height+ph)*pooled_width+pw] = a_root_p<DType, p>::Map(sum);
+            out_data[(pd*pooled_height+ph)*pooled_width+pw] = (pool_size == 0) ?
+                                                              DType(nanf("")) :
+                                                              a_root_p<DType, p>::Map(sum);
           }
         }
       }
@@ -509,8 +523,8 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data,
 template<typename DType, int p = 1>
 inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -523,9 +537,12 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = (isAvg ? (wend - wstart) : 1);
+        int pool_size = (is_avg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
+        if (is_avg && !count_include_pad) {
+          pool_size = (wend - wstart);
+        }
         for (int w = wstart; w < wend; ++w) {
           in_grad[w] += lp_grad<DType, p>::Map(out_grad[pw], in_data[w], out_data[pw]) / pool_size;
         }
@@ -545,8 +562,8 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const
 template<typename DType, int p = 1>
 inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -562,11 +579,14 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = (isAvg ? (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (is_avg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
           wend = std::min(wend, width);
+          if (is_avg && !count_include_pad) {
+            pool_size = (hend - hstart) * (wend - wstart);
+          }
           const int pool_index = ph * pooled_width + pw;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -593,8 +613,8 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const
 template<typename DType, int p = 1>
 inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -613,13 +633,16 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = (isAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+            int pool_size = (is_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
             dend = std::min(dend, depth);
             hend = std::min(hend, height);
             wend = std::min(wend, width);
+            if (is_avg && !count_include_pad) {
+              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            }
             const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
@@ -660,13 +683,14 @@ template<typename DType, int p>
 inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
-                 DType* out_data) {
+                 DType* out_data, const bool count_include_pad) {
   CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
   if (kernel.ndim() == 1) {
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -678,7 +702,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -690,7 +715,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -723,7 +749,8 @@ template<typename DType, int p>
 inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
-                   const int pool_type, OpReqType req_type, DType* in_grad, const int p_value = 2) {
+                   const int pool_type, OpReqType req_type, DType* in_grad,
+                   const bool count_include_pad) {
   if (mxnet::kNullOp == req_type) return;
   if (mxnet::kAddTo != req_type) {
     mxnet_op::Kernel<mxnet_op::set_zero, cpu>::Launch(s, ishape.Size(), in_grad);
@@ -733,7 +760,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -747,7 +774,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -761,7 +788,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
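
One consequence of excluding padding, handled by the pool_size == 0 branches
above: a window that falls entirely inside the padding has no valid elements,
so the code emits NaN rather than dividing by zero. A self-contained sketch of
that edge case (illustrative values, not patch code):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      const int width = 2, kernel_w = 2, stride_w = 2, pad_w = 2, pw = 0;
      int wstart = pw * stride_w - pad_w;                     // -2
      int wend = std::min(wstart + kernel_w, width + pad_w);  // 0
      wstart = std::max(wstart, 0);                           // 0
      wend = std::min(wend, width);                           // 0
      int pool_size = wend - wstart;                          // 0 valid elements
      float out = (pool_size == 0) ? std::nanf("") : 0.0f;
      std::printf("pool_size=%d out=%f\n", pool_size, out);   // out is nan
    }
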
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index a4770b49e85..ad74a8feae3 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -41,6 +41,8 @@
 namespace mxnet {
 namespace op {
 
+void PoolingParamParser(nnvm::NodeAttrs *attrs);
+
 struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   TShape kernel;
   TShape stride;
@@ -50,6 +52,7 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   bool global_pool;
   bool cudnn_off;
   dmlc::optional<int> p_value;
+  dmlc::optional<bool> count_include_pad;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     DMLC_DECLARE_FIELD(kernel).set_default(TShape())  // add default value here
     .enforce_nonzero()
@@ -81,7 +84,13 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding.");
 
     DMLC_DECLARE_FIELD(p_value).set_default(dmlc::optional<int>())
-    .describe("Value of p for Lp pooling, can be 1 or 2, required for Lp Pooling");
+    .describe("Value of p for Lp pooling, can be 1 or 2, required for Lp Pooling.");
+
+    DMLC_DECLARE_FIELD(count_include_pad).set_default(dmlc::optional<bool>())
+    .describe("Only used for AvgPool, specify whether to count padding elements for average"
+              "calculation. For example, with a 5*5 kernel on a 3*3 corner of a image,"
+              "the sum of the 9 valid elements will be divided by 25 if this is set to true,"
+              "or it will be divided by 9 if this is set to false. Defaults to true.");
   }
 
   bool operator==(const PoolingParam& other) const {
@@ -92,7 +101,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
            this->pooling_convention == other.pooling_convention &&
            this->global_pool        == other.global_pool &&
            this->cudnn_off          == other.cudnn_off &&
-           this->p_value            == other.p_value;
+           this->p_value            == other.p_value &&
+           this->count_include_pad  == other.count_include_pad;
   }
 };
 
@@ -112,6 +122,7 @@ struct hash<mxnet::op::PoolingParam> {
     ret = dmlc::HashCombine(ret, val.global_pool);
     ret = dmlc::HashCombine(ret, val.cudnn_off);
     ret = dmlc::HashCombine(ret, val.p_value);
+    ret = dmlc::HashCombine(ret, val.count_include_pad);
     return ret;
   }
 };
@@ -153,27 +164,29 @@ class PoolingOp {
     }
     const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
                         param_.p_value.value() : 1;
+    const bool count_include_pad = (param_.count_include_pad.has_value()) ?
+                                   param_.count_include_pad.value() : true;
     switch (p_value) {
       case 1:
         pool<DType, 1>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       case 2:
         pool<DType, 2>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       case 3:
         pool<DType, 3>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       default:
         LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
@@ -201,6 +214,8 @@ class PoolingOp {
 
     const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
                         param_.p_value.value() : 1;
+    const bool count_include_pad = (param_.count_include_pad.has_value()) ?
+                                   param_.count_include_pad.value() : true;
     switch (p_value) {
       case 1:
         unpool<DType, 1>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -208,7 +223,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       case 2:
         unpool<DType, 2>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -216,7 +231,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       case 3:
         unpool<DType, 3>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -224,7 +239,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       default:
         LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 3ff94da3c2d..3200a515d6f 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -35,7 +35,7 @@
 namespace mxnet {
 namespace op {
 
-static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
+void PoolingParamParser(nnvm::NodeAttrs *attrs) {
   using namespace mshadow;
   PoolingParam param;
   param.Init(attrs->dict);
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index de3c7422c5f..0953cbaf519 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -322,6 +322,7 @@ IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::xelu_grad); // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::prelu_grad); // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::elu_grad); // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum);  // NOLINT()
diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc
index 92b808dd460..bbd79417676 100644
--- a/src/operator/quantization/dequantize.cc
+++ b/src/operator/quantization/dequantize.cc
@@ -23,11 +23,31 @@
  * \brief
  */
 #include "./dequantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_dequantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(DequantizeParam);
 
+bool DequantizeStorageType(const nnvm::NodeAttrs& attrs,
+                           const int dev_mask,
+                           DispatchMode* dispatch_mode,
+                           std::vector<int> *in_attrs,
+                           std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_dequantize)
 .describe(R"code(Dequantize the input tensor into a float tensor.
 min_range and max_range are scalar floats that specify the range for
@@ -50,6 +70,10 @@ by keep zero centered for the quantized value:
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", DequantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", DequantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", DequantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNDequantizeCompute)
+#endif
 .set_attr<FCompute>("FCompute<cpu>", DequantizeCompute<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`")
 .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
new file mode 100644
index 00000000000..89c3c199488
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_dequantize-inl.h
+ * \author Wenting Jiang, Xinyu Chen
+ * \brief
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename SrcType, typename DstType>
+static void MKLDNNDequantizeComputeKer(const std::vector<NDArray> &inputs,
+                                       const std::vector<NDArray> &outputs,
+                                       const std::vector<OpReqType> &req) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  float real_range = 0.0;
+  float quantized_range = 0.0;
+  if (inputs[0].dtype() == mshadow::kUint8) {
+    quantized_range = MaxAbs(MaxValue<SrcType>(), MinValue<SrcType>());
+    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
+  } else if (inputs[0].dtype() == mshadow::kInt8) {
+    quantized_range = MinAbs(MaxValue<SrcType>(), MinValue<SrcType>());
+    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
+  } else {
+    LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type";
+  }
+  float scale = real_range / quantized_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  size_t i_ndim = in_buffer.shape().ndim();
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+  for (size_t i = 0; i < i_ndim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNDequantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                    const std::vector<NDArray> &inputs,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &outputs) {
+  if (inputs[0].dtype() == mshadow::kUint8) {
+    MKLDNNDequantizeComputeKer<uint8_t, float>(inputs, outputs, req);
+  } else if (inputs[0].dtype() == mshadow::kInt8) {
+    MKLDNNDequantizeComputeKer<int8_t, float>(inputs, outputs, req);
+  } else {
+    LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as input type";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
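
The dequantize kernel above folds the whole conversion into one MKL-DNN
reorder whose output scale is real_range / quantized_range. A back-of-envelope
sketch of that scale for int8 input (numbers invented for illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const float real_range = 0.5f;         // MaxAbs(*min_range, *max_range)
      const float quantized_range = 127.0f;  // MinAbs of int8 limits
      const float scale = real_range / quantized_range;
      const int8_t q = 127;                  // a quantized input value
      std::printf("dequantized = %f\n", q * scale);  // ~0.5
    }
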
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
new file mode 100644
index 00000000000..f7709319d6a
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantize-inl.h
+ * \brief
+ * \author Wenting Jiang, Xinyu Chen
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../quantize-inl.h"
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename SrcType, typename DstType>
+static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
+                                     const std::vector<NDArray>& outputs,
+                                     const QuantizeParam& param,
+                                     const std::vector<OpReqType> &req) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  float real_range = 0.0;
+  float quantized_range = 0.0;
+  if (param.out_type == mshadow::kUint8) {
+    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    quantized_range = MaxAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = *inputs[1].data().dptr<float>();
+    *outputs[2].data().dptr<float>() = *inputs[2].data().dptr<float>();
+  } else if (param.out_type == mshadow::kInt8) {
+    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = -real_range;
+    *outputs[2].data().dptr<float>() = real_range;
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
+  }
+  float scale = quantized_range / real_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  size_t i_ndim = in_buffer.shape().ndim();
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+  for (size_t i = 0; i < i_ndim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNQuantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                  const std::vector<NDArray> &inputs,
+                                  const std::vector<OpReqType> &req,
+                                  const std::vector<NDArray> &outputs) {
+  const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+  if (param.out_type == mshadow::kUint8) {
+    MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
+  } else if (param.out_type == mshadow::kInt8) {
+    MKLDNNQuantizeComputeKer<float, int8_t>(inputs, outputs, param, req);
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
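
Quantization runs the same reorder trick in the opposite direction: the scale
is quantized_range / real_range, and the quantized range depends on the output
type, as the kUint8/kInt8 branches above select. A small sketch with invented
numbers:

    #include <cstdio>

    int main() {
      const float real_range = 0.5f;  // MaxAbs(*min_range, *max_range)
      const float q_uint8 = 255.0f;   // MaxAbs(MaxValue<uint8>, MinValue<uint8>)
      const float q_int8 = 127.0f;    // MinAbs(MaxValue<int8>, MinValue<int8>)
      std::printf("uint8 scale=%f int8 scale=%f\n",
                  q_uint8 / real_range, q_int8 / real_range);
      // For int8 output the stored range is symmetric: [-real_range, real_range].
    }
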
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
new file mode 100644
index 00000000000..fa6a32a4739
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantized_conv.cc
+ * \brief
+ * \author Wenting Jiang, Xinyu Chen
+*/
+
+#if MXNET_USE_MKLDNN == 1
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+#include "../../nn/mkldnn/mkldnn_convolution-inl.h"
+#include "../../nn/convolution-inl.h"
+#include "../quantization_utils.h"
+#include "../../tensor/matrix_op-inl.h"
+#include "../../elemwise_op_common.h"
+namespace mxnet {
+namespace op {
+
+static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs,
+                                       const OpContext &ctx,
+                                       const std::vector<NDArray> &in_data,
+                                       const std::vector<OpReqType> &req,
+                                       const std::vector<NDArray> &out_data) {
+  CHECK_EQ(in_data[0].dtype(), mshadow::kUint8)
+    << "mkldnn_quantized_conv op only supports uint8 as input type";
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  NDArray weight = in_data[conv::kWeight];
+  MKLDNNConvForward &fwd = GetConvFwd(attrs, ctx.is_train,
+      in_data[conv::kData], weight,
+      param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+
+  auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  // For inference, we want to reorder the weight array so we don't need to
+  // reorder data every time.
+  if (weight.IsDefaultData()) {
+    weight_mem = GetWeights(weight, fwd.fwd_pd.weights_primitive_desc(), param.num_group);
+    // We also need to modify the layout on the original weight array. The
+    // data conversion happens after the weight array is used.
+    weight.MKLDNNDataReorderAsync(fwd.fwd_pd.weights_primitive_desc());
+  } else {
+    weight_mem = weight.GetMKLDNNData();
+    CHECK(weight_mem->get_primitive_desc() == fwd.fwd_pd.weights_primitive_desc());
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
+                                 req[conv::kOut]);
+  const mkldnn::memory *bias_mem = nullptr;
+  if (!param.no_bias)
+    bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc());
+  fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+
+  CommitOutput(out_data[conv::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  const size_t num_inputs = param.no_bias ? 2 : 3;
+  mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+           out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+           in_data[num_inputs].data().dptr<float>(),
+           in_data[num_inputs+1].data().dptr<float>(),
+           in_data[num_inputs+2].data().dptr<float>(),
+           in_data[num_inputs+3].data().dptr<float>());
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_conv)
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizedConvForward);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
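
The kernel launch at the end of the forward pass derives the int32 output
range from the data and weight ranges. A hedged sketch of one way to read that
computation (assuming one int32 output level represents the product of one
data level and one weight level; all numbers invented):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main() {
      const float data_range = 1.0f, weight_range = 0.5f;  // MaxAbs per tensor
      const float data_level = data_range / 127.0f;        // float per int8 step
      const float weight_level = weight_range / 127.0f;
      const float out_level = data_level * weight_level;   // float per int32 step
      const float out_max = out_level * std::numeric_limits<int32_t>::max();
      std::printf("output range ~ [%f, %f]\n", -out_max, out_max);
    }
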
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc
new file mode 100644
index 00000000000..83177ad9b34
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantized_pooling.cc
+ * \brief
+ * \author Tao Lv, Xinyu Chen
+*/
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "../../nn/mkldnn/mkldnn_pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static void MKLDNNQuantizedPoolingForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                          const std::vector<NDArray> &in_data,
+                                          const std::vector<OpReqType> &req,
+                                          const std::vector<NDArray> &out_data) {
+  CHECK(in_data[0].dtype() == mshadow::kUint8
+    || in_data[0].dtype() == mshadow::kInt8)
+    << "mkldnn_quantized_pooling op only supports uint8 and int8 as input type";
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  auto fwd = GetPoolingFwd(param, ctx.is_train, in_data[0], out_data[0]);
+  fwd.SetDataHandle(in_data[0], out_data[0]);
+  fwd.Execute();
+  out_data[1].data().dptr<float>()[0] = in_data[1].data().dptr<float>()[0];
+  out_data[2].data().dptr<float>()[0] = in_data[2].data().dptr<float>()[0];
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_pooling)
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizedPoolingForward);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
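
The two scalar copies above forward the calibration range untouched, which is
sound because max and average pooling can never produce a value outside the
input's range. A trivial illustration (not patch code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t window[4] = {3, 7, 2, 9};
      uint8_t m = *std::max_element(window, window + 4);
      std::printf("max-pooled value %u is itself one of the inputs\n",
                  static_cast<unsigned>(m));
    }
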
diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
new file mode 100644
index 00000000000..409c53dd3b9
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_requantize-inl.h
+ * \brief
+ * \author Jin Huang, Xinyu Chen
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../requantize-inl.h"
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
+                                       const OpContext& ctx,
+                                       const std::vector<NDArray>& inputs,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<NDArray>& outputs,
+                                       const float real_range) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  typedef int32_t SrcDType;
+  typedef int8_t  DstDType;
+  // check shapes
+  size_t i_dim = inputs[0].shape().ndim();
+  size_t o_dim = outputs[0].shape().ndim();
+  CHECK_EQ(i_dim, o_dim);
+  float first_quantized_range = MinAbs(MinValue<SrcDType>(),
+                                       MaxValue<SrcDType>());
+  float first_real_range = MaxAbs(*inputs[1].data().dptr<float>(),
+                                  *inputs[2].data().dptr<float>());
+  float first_scale = first_real_range / first_quantized_range;
+  float second_real_range = real_range;
+  float second_quantized_range = MinAbs(MaxValue<DstDType>(),
+                                        MinValue<DstDType>());
+  float second_scale = second_quantized_range / second_real_range;
+  float scale = first_scale * second_scale;
+  *outputs[1].data().dptr<float>() = -second_real_range;
+  *outputs[2].data().dptr<float>() = second_real_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_dim);
+  for (size_t i = 0; i < i_dim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstDType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  typedef int32_t SrcDType;
+  typedef int8_t  DstDType;
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  const RequantizeParam& param = nnvm::get<RequantizeParam>(attrs.parsed);
+  float real_range;
+  // Model is calibrated
+  if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+    real_range =
+          MaxAbs(param.min_calib_range.value(), param.max_calib_range.value());
+    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+  // Model is not calibrated
+  } else {
+    TShape src_shape, dst_shape;
+    const size_t actual_float_size = sizeof(float);
+    const size_t actual_quantized_size = sizeof(SrcDType);
+    const size_t temp_reduce_size = ConfigReduce<cpu, SrcDType>(s,
+                         inputs[0].shape(), TShape({1}), &src_shape, &dst_shape);
+    Tensor<cpu, 1, char> temp_space =
+      ctx.requested[0].get_space_typed<cpu, 1, char>(
+      Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s);
+    Tensor<cpu, 1, float> actual_min_float(
+                 reinterpret_cast<float*>(temp_space.dptr_), Shape1(1), s);
+    Tensor<cpu, 1, float> actual_max_float(
+                 reinterpret_cast<float*>(temp_space.dptr_) + 1, Shape1(1), s);
+    const int dev_id = ctx.run_ctx.ctx.dev_id;
+    TBlob actual_min_quantized(reinterpret_cast<SrcDType*>(
+                       temp_space.dptr_ + 8), Shape1(1), cpu::kDevMask, dev_id);
+    TBlob actual_max_quantized(reinterpret_cast<SrcDType*>(
+                   temp_space.dptr_ + 8) + 1, Shape1(1), cpu::kDevMask, dev_id);
+    Tensor<cpu, 1, char> workspace(
+            temp_space.dptr_+2*actual_float_size+2*actual_quantized_size,
+            Shape1(temp_reduce_size), s);
+    broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
+        s, actual_min_quantized.reshape(dst_shape), kWriteTo,
+        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
+    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
+        actual_min_float.dptr_, actual_min_quantized.dptr<SrcDType>(),
+        inputs[1].Reorder2Default().data().dptr<float>(),
+        inputs[2].Reorder2Default().data().dptr<float>());
+    broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
+        s, actual_max_quantized.reshape(dst_shape), kWriteTo,
+        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
+    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
+        actual_max_float.dptr_, actual_max_quantized.dptr<SrcDType>(),
+        inputs[1].Reorder2Default().data().dptr<float>(),
+        inputs[2].Reorder2Default().data().dptr<float>());
+
+    real_range = MaxAbs(*actual_min_float.dptr_, *actual_max_float.dptr_);
+    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
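
For reference, the rescaling in MKLDNNRequantizeForwardKer composes two scales: int32 values are first mapped back to the real range, then mapped onto int8, and the product of the two factors is handed to the MKLDNN reorder primitive. A minimal standalone sketch of that arithmetic, with hypothetical calibration values (the 500.0f ranges are illustrative only):

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  // Quantized ranges: |int32 max| for the source, |int8 max| for the destination.
  const float first_quantized_range  = std::numeric_limits<int32_t>::max();
  const float second_quantized_range = std::numeric_limits<int8_t>::max();
  const float first_real_range  = 500.0f;   // hypothetical calibrated |min/max| of the int32 data
  const float second_real_range = 500.0f;   // the requantized output keeps the same real range

  const float first_scale  = first_real_range / first_quantized_range;    // int32 -> real
  const float second_scale = second_quantized_range / second_real_range;  // real -> int8
  const float scale = first_scale * second_scale;  // the single factor given to the reorder

  const int32_t src = 1000000000;
  const auto dst = static_cast<int8_t>(std::lround(src * scale));
  std::cout << "scale = " << scale << ", dst = " << static_cast<int>(dst) << "\n";
  return 0;
}
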
diff --git a/src/operator/quantization/quantize.cc b/src/operator/quantization/quantize.cc
index 32eb952fa5d..25fb19dddd1 100644
--- a/src/operator/quantization/quantize.cc
+++ b/src/operator/quantization/quantize.cc
@@ -23,11 +23,31 @@
  * \brief
  */
 #include "./quantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_quantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(QuantizeParam);
 
+bool QuantizeStorageType(const nnvm::NodeAttrs& attrs,
+                         const int dev_mask,
+                         DispatchMode* dispatch_mode,
+                         std::vector<int> *in_attrs,
+                         std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantize)
 .describe(R"code(Quantize a input tensor from float to `out_type`,
 with user-specified `min_range` and `max_range`.
@@ -61,6 +81,10 @@ where
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizeCompute)
+#endif
 .set_attr<FCompute>("FCompute<cpu>", QuantizeCompute<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`")
 .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 5ec745ccdf3..5376a0ee9f1 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -99,6 +99,7 @@ Graph QuantizeGraph(Graph &&src) {
   static auto& need_requantize_map = Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
   auto offline_params = src.GetAttr<std::unordered_set<std::string>>("offline_params");
   auto excluded_nodes = src.GetAttr<std::unordered_set<NodePtr>>("excluded_nodes");
+  auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
 
   // mirror_map stores the mapping from the currently visited graph to the newly created quantized
   // graph. Key is the currently visited graph's node pointer, and value is a copied node of the key
@@ -129,7 +130,7 @@ Graph QuantizeGraph(Graph &&src) {
              mirror_node->op()->name != "_contrib_quantize")) {
           NodePtr quantize_node = InsertNode("_contrib_quantize",
             e.node->attrs.name + "_quantize", new_node, mirror_entry);
-          quantize_node->attrs.dict["out_type"] = "int8";
+          quantize_node->attrs.dict["out_type"] = quantized_dtype;
           quantize_node->op()->attr_parser(&(quantize_node->attrs));
 
           NodePtr min_node = InsertNode("min",
@@ -159,7 +160,11 @@ Graph QuantizeGraph(Graph &&src) {
         uint32_t min_index = 1;
         uint32_t max_index = 2;
         if (quantized_op_map.count(e.node->op())) {
-          size_t  num_outputs = e.node->num_outputs();
+          // Calculate the number of data outputs of the mirror node (excluding
+          // min/max) so that the min/max entry indices can be derived. This
+          // assumes the mirror node emits exactly one min and one max output,
+          // which is currently true.
+          size_t  num_outputs = mirror_node->num_outputs() - 2;
           min_index = num_outputs + 2 * e.index;
           max_index = num_outputs + 2 * e.index + 1;
         } else {
@@ -198,12 +203,15 @@ Graph QuantizeGraph(Graph &&src) {
         NodePtr mirror_node = mirror_map.at(e.node.get());
         NodeEntry mirror_entry = NodeEntry{
           mirror_node, e.index, e.version};
-        size_t num_outputs = e.node->num_outputs();
-        uint32_t min_index = num_outputs + 2 * e.index;
-        uint32_t max_index = num_outputs + 2 * e.index + 1;
-
         // if input node is quantized operator, add dequantize node
         if (NeedQuantize(e.node, excluded_nodes)) {
+          // Calculate the number of data outputs of the mirror node (excluding
+          // min/max) so that the min/max entry indices can be derived. This
+          // assumes the mirror node emits exactly one min and one max output,
+          // which is currently true.
+          size_t num_outputs = mirror_node->num_outputs() - 2;
+          uint32_t min_index = num_outputs + 2 * e.index;
+          uint32_t max_index = num_outputs + 2 * e.index + 1;
           NodePtr dequantize_node = CreateNode("_contrib_dequantize",
             e.node->attrs.name + "_dequantize");
           dequantize_node->inputs.emplace_back(mirror_entry);
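
The index arithmetic in the two comments above can be checked with a toy example (the values below are hypothetical): a quantized mirror node with N data outputs appends one min and one max output per data entry, so for data entry index k, min_index = N + 2*k and max_index = N + 2*k + 1.

#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  const size_t mirror_num_outputs = 3;                 // one data output plus min and max
  const size_t num_outputs = mirror_num_outputs - 2;   // data outputs only
  const uint32_t k = 0;                                // entry index into the data outputs
  std::cout << "min_index = " << num_outputs + 2 * k
            << ", max_index = " << num_outputs + 2 * k + 1 << "\n";  // 1 and 2
  return 0;
}
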
diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc
index d7dc9fe4dbd..ed62228b924 100644
--- a/src/operator/quantization/quantized_conv.cc
+++ b/src/operator/quantization/quantized_conv.cc
@@ -24,6 +24,9 @@
  * \author Ziheng Jiang, Jun Wu
 */
 #include "../nn/convolution-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -86,12 +89,13 @@ bool QuantizedConvType(const nnvm::NodeAttrs& attrs,
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   CHECK_EQ(in_type->size(), param.no_bias? 6U : 9U);
   CHECK_EQ(out_type->size(), 3U);
+#if MXNET_USE_MKLDNN == 0
   TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
+#endif
   TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kInt8);
   if (!param.no_bias) {
     TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kInt8);
   }
-
   const size_t start = param.no_bias? 2 : 3;
   const size_t end = param.no_bias? 6 : 9;
   for (size_t i = start; i < end; ++i) {
@@ -104,6 +108,24 @@ bool QuantizedConvType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+bool QuantizedConvStorageType(const nnvm::NodeAttrs& attrs,
+                              const int dev_mask,
+                              DispatchMode* dispatch_mode,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_conv)
 .describe(R"code(Convolution operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -119,7 +141,7 @@ and max thresholds representing the thresholds for quantizing the float32 output
     return param.no_bias? 6 : 9;
   })
 .set_num_outputs(3)
-.set_attr_parser(ParamParser<ConvolutionParam>)
+.set_attr_parser(ConvolutionParamParser)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
     const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
@@ -137,6 +159,7 @@ and max thresholds representing the threholds for quantizing the float32 output
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedConvShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedConvType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedConvStorageType)
 .set_attr<FResourceRequest>("FResourceRequest",
   [](const NodeAttrs& attrs) {
     return std::vector<ResourceRequest>(1, ResourceRequest::kTempSpace);
diff --git a/src/operator/quantization/quantized_flatten-inl.h b/src/operator/quantization/quantized_flatten-inl.h
index 95f36615402..b7209fd28f5 100644
--- a/src/operator/quantization/quantized_flatten-inl.h
+++ b/src/operator/quantization/quantized_flatten-inl.h
@@ -62,11 +62,21 @@ void QuantizedFlattenCompute(const nnvm::NodeAttrs& attrs,
   using namespace mxnet_op;
   Stream<xpu> *s = ctx.get_stream<xpu>();
 
-  typedef int8_t DstDType;
-  typedef int8_t  SrcDType;
-  Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
-    outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
-    inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  if (inputs[0].type_flag_ == mshadow::kUint8) {
+    typedef uint8_t SrcDType;
+    typedef uint8_t DstDType;
+    Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
+      outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
+      inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  } else if (inputs[0].type_flag_ == mshadow::kInt8) {
+    typedef int8_t SrcDType;
+    typedef int8_t DstDType;
+    Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
+      outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
+      inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  } else {
+    LOG(FATAL) << "quantized_flatten op only supports int8 and uint8 as input and output type";
+  }
 }
 
 inline bool QuantizedFlattenShape(const nnvm::NodeAttrs& attrs,
@@ -96,10 +106,9 @@ inline bool QuantizedFlattenType(const nnvm::NodeAttrs& attrs,
                                  std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 3U);
   CHECK_EQ(out_attrs->size(), 3U);
-  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
   TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
   TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
-  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]);
   TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32);
   TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32);
   return (*in_attrs)[0] != -1;
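
The uint8/int8 branching added to QuantizedFlattenCompute follows the common pattern of dispatching a templated kernel on a runtime type flag. A reduced sketch of that pattern, with hypothetical names (FlattenKernel, Dispatch) rather than the actual Kernel<...>::Launch machinery:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>

enum TypeFlag { kUint8, kInt8 };

template <typename DType>
void FlattenKernel(const DType* in, DType* out, size_t n) {
  for (size_t i = 0; i < n; ++i) out[i] = in[i];  // flatten is a plain element copy
}

void Dispatch(TypeFlag flag, const void* in, void* out, size_t n) {
  switch (flag) {
    case kUint8:
      FlattenKernel(static_cast<const uint8_t*>(in), static_cast<uint8_t*>(out), n);
      break;
    case kInt8:
      FlattenKernel(static_cast<const int8_t*>(in), static_cast<int8_t*>(out), n);
      break;
    default:
      throw std::runtime_error("only int8 and uint8 are supported");
  }
}

int main() {
  uint8_t in[3] = {1, 2, 3}, out[3] = {0, 0, 0};
  Dispatch(kUint8, in, out, 3);
  std::cout << static_cast<int>(out[2]) << "\n";  // prints 3
  return 0;
}
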
diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc
index a3105eb654d..779e244c862 100644
--- a/src/operator/quantization/quantized_pooling.cc
+++ b/src/operator/quantization/quantized_pooling.cc
@@ -23,6 +23,9 @@
 */
 #include <mxnet/op_attr_types.h>
 #include "../nn/pooling-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_pooling-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -79,8 +82,12 @@ bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_type->size(), 3U);
   CHECK_EQ(out_type->size(), 3U);
   if (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) {
+#if MXNET_USE_MKLDNN  == 1
+    TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);
+#else
     TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
     TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt8);
+#endif
   } else {
     LOG(FATAL) << "QuantizedPoolingOp only supports pool_type=max/avg for now";
   }
@@ -91,6 +98,27 @@ bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+inline static bool QuantizedPoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                               const int dev_mask,
+                                               DispatchMode *dispatch_mode,
+                                               std::vector<int> *in_attrs,
+                                               std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3);
+
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#else
+  CHECK_EQ(out_attrs->size(), 3);
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_pooling)
 .describe(R"code(Pooling operator for input and output data type of int8.
 The input and output data comes with min and max thresholds for quantizing
@@ -101,7 +129,7 @@ the float32 data into int8.
     This operator only supports `pool_type` of `avg` or `max`.)code" ADD_FILELINE)
 .set_num_inputs(3)
 .set_num_outputs(3)
-.set_attr_parser(ParamParser<PoolingParam>)
+.set_attr_parser(PoolingParamParser)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
     return std::vector<std::string>{"data", "min_data", "max_data"};
@@ -112,6 +140,7 @@ the float32 data into int8.
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedPoolingShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedPoolingType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedPoolingStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize",
   [](const NodeAttrs& attrs) {
     const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc
index 83ea37b835c..5ce0ff0b020 100644
--- a/src/operator/quantization/requantize.cc
+++ b/src/operator/quantization/requantize.cc
@@ -24,11 +24,31 @@
  */
 #include "./requantize-inl.h"
 #include "./quantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_requantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(RequantizeParam);
 
+bool RequantizeStorageType(const nnvm::NodeAttrs& attrs,
+                         const int dev_mask,
+                         DispatchMode* dispatch_mode,
+                         std::vector<int> *in_attrs,
+                         std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_requantize)
 .describe(R"code(Given data that is quantized in int32 and the corresponding thresholds,
 requantize the data into int8 using min and max thresholds either calculated at runtime
@@ -43,7 +63,12 @@ inference accuracy.
 .set_num_outputs(3)
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", RequantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", RequantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNRequantizeForward)
+#else
 .set_attr<FCompute>("FCompute<cpu>", RequantizeForward<cpu>)
+#endif
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
     const RequantizeParam& param =
       nnvm::get<RequantizeParam>(attrs.parsed);
diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc
index 983f879888c..83c9034e364 100644
--- a/src/operator/random/shuffle_op.cc
+++ b/src/operator/random/shuffle_op.cc
@@ -22,8 +22,9 @@
  * \file shuffle_op.cc
  * \brief Operator to shuffle elements of an NDArray
  */
-#if (__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)
-  #define USE_GNU_PARALLEL_SHUFFLE
+#if !defined(__ANDROID__) && ((__GNUC__ > 4 && \
+    !defined(__clang_major__)) || (__clang_major__ > 4 && __linux__))
+  #define USE_GNU_PARALLEL_SHUFFLE
 #endif
 
 #include <mxnet/operator_util.h>
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 3bf437376fa..bed97301fd1 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -28,9 +28,11 @@
 #if MXNET_USE_CUDA
   #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
+
 #include <mxnet/base.h>
 #include <mxnet/storage.h>
 #include <unordered_map>
+#include <algorithm>
 #include <vector>
 #include <mutex>
 #include <new>
@@ -43,7 +45,8 @@ namespace storage {
 
 #if MXNET_USE_CUDA
 /*!
- * \brief Storage manager with a memory pool on gpu.
+ * \brief Storage manager with a memory pool on gpu. Memory chunks are reused based on exact size
+ * match.
  */
 class GPUPooledStorageManager final : public StorageManager {
  public:
@@ -52,6 +55,11 @@ class GPUPooledStorageManager final : public StorageManager {
    */
   GPUPooledStorageManager() {
     reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    if (page_size_ < NDEV) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than " << NDEV \
+                 << ". Got " << page_size_ << ".";
+    }
   }
   /*!
    * \brief Default destructor.
@@ -71,7 +79,7 @@ class GPUPooledStorageManager final : public StorageManager {
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
     cudaError_t err = cudaFree(handle.dptr);
-    size_t size = handle.size + NDEV;
+    size_t size = std::max(handle.size, page_size_);
     // ignore unloading error, as memory has already been recycled
     if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
@@ -83,10 +91,12 @@ class GPUPooledStorageManager final : public StorageManager {
   void ReleaseAll();
   // used memory
   size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
   // percentage of reserved memory
   int reserve_;
   // number of devices
-  const int NDEV = 32;
+  const size_t NDEV = 32;
   // memory pool
   std::unordered_map<size_t, std::vector<void*>> memory_pool_;
   DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager);
@@ -94,7 +104,7 @@ class GPUPooledStorageManager final : public StorageManager {
 
 void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = handle->size + NDEV;
+  size_t size = std::max(handle->size, page_size_);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
     size_t free, total;
@@ -119,7 +129,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
 
 void GPUPooledStorageManager::Free(Storage::Handle handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = handle.size + NDEV;
+  size_t size = std::max(handle.size, page_size_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(handle.dptr);
 }
@@ -129,13 +139,172 @@ void GPUPooledStorageManager::ReleaseAll() {
     for (auto&& j : i.second) {
       Storage::Handle handle;
       handle.dptr = j;
-      handle.size = i.first - NDEV;
+      handle.size = i.first;
       DirectFreeNoLock(handle);
     }
   }
   memory_pool_.clear();
 }
 
+/*!
+ * \brief Storage manager with a rounded-size memory pool on gpu.
+ *
+ * This GPU memory pool uses a mixture of nearest-power-of-2 (exponential) rounding and
+ * nearest-multiple (linear) rounding to help alleviate the memory allocation stress that
+ * the default naive exact-size-match pool handles poorly, such as in variable-length
+ * input/output cases like RNN workloads.
+ *
+ * \param cutoff the cutoff at which rounding switches from exponential to linear. It is set
+ * through the MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable and must be
+ * between 20 (1 MB) and 34 (16 GB).
+ * Suppose the cutoff is X; the memory size buckets then look like this:
+ * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ...
+ */
+class GPUPooledRoundedStorageManager final : public StorageManager {
+ public:
+  /*!
+   * \brief Default constructor.
+   */
+  GPUPooledRoundedStorageManager() {
+    reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24);
+    if (page_size_ < 32) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \
+                 << "Got: " << page_size_ << ".";
+    }
+    if (page_size_ != 1ul << log2_round_up(page_size_)) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << ".";
+    }
+    page_size_ = log2_round_up(page_size_);
+    if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \
+                 << cut_off_ << ".";
+    }
+    if (cut_off_ < page_size_) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \
+                 << cut_off_ << " vs " << page_size_ << ".";
+    }
+    memory_pool_ = std::vector<std::vector<void*>>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_);
+  }
+  /*!
+   * \brief Default destructor.
+   */
+  ~GPUPooledRoundedStorageManager() {
+    ReleaseAll();
+  }
+
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override;
+
+  void DirectFree(Storage::Handle handle) override {
+    std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+    DirectFreeNoLock(handle);
+  }
+
+ private:
+  inline int log2_round_up(size_t s) {
+    return static_cast<int>(std::ceil(std::log2(s)));
+  }
+  inline int div_pow2_round_up(size_t s, int divisor_log2) {
+    // (1025, 10) -> 2
+    // (2048, 10) -> 2
+    // (2049, 10) -> 3
+    size_t result = s >> divisor_log2;
+    return static_cast<int>(result + (s > (result << divisor_log2) ? 1 : 0));
+  }
+  inline int get_bucket(size_t s) {
+    int log_size = log2_round_up(s);
+    if (log_size > static_cast<int>(cut_off_))
+      return div_pow2_round_up(s, cut_off_) - 1 + cut_off_;
+    else
+      return std::max(log_size, static_cast<int>(page_size_));
+  }
+  inline size_t get_size(int bucket) {
+    if (bucket <= static_cast<int>(cut_off_))
+      return 1ul << bucket;
+    else
+      return (bucket - cut_off_ + 1) * (1ul << cut_off_);
+  }
+
+  void DirectFreeNoLock(Storage::Handle handle) {
+    cudaError_t err = cudaFree(handle.dptr);
+    size_t size = get_size(get_bucket(handle.size));
+    // ignore unloading error, as memory has already been recycled
+    if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
+      LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
+    }
+    used_memory_ -= size;
+  }
+
+ private:
+  void ReleaseAll();
+  // number of devices
+  const int NDEV = 32;
+  // log2 of the maximum memory size handled by the pool (16 GB)
+  const size_t LOG2_MAX_MEM = 34;
+  // address width in bits
+  static const int addr_width = sizeof(size_t) * 8;
+  // used memory
+  size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
+  // log2 of the memory size at which rounding switches from exponential to linear
+  size_t cut_off_;
+  // percentage of reserved memory
+  int reserve_;
+  // memory pool
+  std::vector<std::vector<void*>> memory_pool_;
+  DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager);
+};  // class GPUPooledRoundedStorageManager
+
+void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
+  std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle->size);
+  size_t size = get_size(bucket);
+  auto&& reuse_pool = memory_pool_[bucket];
+  if (reuse_pool.size() == 0) {
+    size_t free, total;
+    cudaMemGetInfo(&free, &total);
+    if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
+      ReleaseAll();
+
+    void* ret = nullptr;
+    cudaError_t e = cudaMalloc(&ret, size);
+    if (e != cudaSuccess && e != cudaErrorCudartUnloading) {
+      LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e);
+    }
+    used_memory_ += size;
+    handle->dptr = ret;
+  } else {
+    auto ret = reuse_pool.back();
+    reuse_pool.pop_back();
+    handle->dptr = ret;
+  }
+}
+
+void GPUPooledRoundedStorageManager::Free(Storage::Handle handle) {
+  std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle.size);
+  auto&& reuse_pool = memory_pool_[bucket];
+  reuse_pool.push_back(handle.dptr);
+}
+
+void GPUPooledRoundedStorageManager::ReleaseAll() {
+  for (size_t i = 0; i < memory_pool_.size(); i++) {
+    int size = get_size(i);
+    for (auto& j : memory_pool_[i]) {
+      Storage::Handle handle;
+      handle.size = size;
+      handle.dptr = j;
+      DirectFreeNoLock(handle);
+    }
+    memory_pool_[i].clear();
+  }
+}
+
 #endif  // MXNET_USE_CUDA
 
 }  // namespace storage
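
The bucket rounding above is easiest to see with concrete numbers. The following self-contained sketch re-implements get_bucket/get_size outside the class, assuming the default page size of 4096 bytes (log2 = 12) and the default linear cutoff of 24: requests below 2^24 bytes round up to the next power of two (never below the page size), and larger requests round up to the next multiple of 2^24.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

static int log2_round_up(size_t s) { return static_cast<int>(std::ceil(std::log2(s))); }

static int get_bucket(size_t s, int page_log2, int cut_off) {
  const int log_size = log2_round_up(s);
  if (log_size > cut_off) {
    const size_t q = s >> cut_off;                                  // divide by 2^cut_off
    const int linear = static_cast<int>(q + ((q << cut_off) < s ? 1 : 0));  // round up
    return linear - 1 + cut_off;
  }
  return std::max(log_size, page_log2);
}

static size_t get_size(int bucket, int cut_off) {
  if (bucket <= cut_off) return 1ul << bucket;                          // exponential buckets
  return static_cast<size_t>(bucket - cut_off + 1) * (1ul << cut_off);  // linear buckets
}

int main() {
  const int page_log2 = 12;  // log2(4096), the default MXNET_GPU_MEM_POOL_PAGE_SIZE
  const int cut_off = 24;    // the default MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
  for (size_t s : {100ul, 4097ul, (1ul << 24) + 1ul, 3ul * (1ul << 24) - 1ul}) {
    const int b = get_bucket(s, page_log2, cut_off);
    std::cout << s << " bytes -> bucket " << b << ", allocated " << get_size(b, cut_off) << "\n";
  }
  return 0;
}

100 bytes lands in the 4096-byte page bucket, 4097 bytes in the 8192-byte bucket, and the two large requests in the 2*2^24 and 3*2^24 linear buckets respectively.
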
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 674c123d8e8..a0a3ed757ea 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -118,7 +118,21 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
 #if MXNET_USE_CUDA
             CUDA_CALL(cudaGetDeviceCount(&num_gpu_device));
             CHECK_GT(num_gpu_device, 0) << "GPU usage requires at least 1 GPU";
-            ptr = new storage::GPUPooledStorageManager();
+
+            const char *type = getenv("MXNET_GPU_MEM_POOL_TYPE");
+            const bool default_pool = (type == nullptr);
+            if (default_pool) type = "Naive";
+            std::string strategy = type;
+
+            if (strategy == "Round") {
+              ptr = new storage::GPUPooledRoundedStorageManager();
+              LOG(INFO) << "Using GPUPooledRoundedStorageManager.";
+            } else {
+              if (strategy != "Naive") {
+                LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << ".";
+              }
+              ptr = new storage::GPUPooledStorageManager();
+            }
 #else
             LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage";
 #endif  // MXNET_USE_CUDA
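
The pool strategy is chosen once, at first GPU allocation, from the MXNET_GPU_MEM_POOL_TYPE environment variable, defaulting to the exact-size-match pool when unset. A minimal standalone sketch of the same getenv-and-branch pattern (not the MXNet code path itself):

#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  // Unset defaults to "Naive"; any value other than "Naive" or "Round" is an error.
  const char* type = std::getenv("MXNET_GPU_MEM_POOL_TYPE");
  const std::string strategy = (type == nullptr) ? "Naive" : type;
  if (strategy == "Round") {
    std::cout << "using the rounded-size pool\n";
  } else if (strategy == "Naive") {
    std::cout << "using the exact-size-match pool\n";
  } else {
    std::cerr << "unknown memory pool strategy: " << strategy << "\n";
    return 1;
  }
  return 0;
}
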
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index 76872d5e6cf..82fee67b114 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -30,6 +30,7 @@
 #include "gtest/gtest.h"
 #include "mxnet/imperative.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../../src/operator/nn/mkldnn/mkldnn_ops-inl.h"
 
 using namespace mxnet;
 
@@ -425,30 +426,45 @@ OpAttrs GetSumOp() {
  *    reordered to 5 dimensions.
  *
  */
-std::vector<NDArrayAttrs> GetTestInputArrays(InitFunc init_fn) {
+std::vector<NDArrayAttrs> GetTestInputArrays(InitFunc init_fn, bool rand = false) {
   TestArrayShapes tas = GetTestArrayShapes();
   std::vector<nnvm::TShape> shapes = tas.shapes;
   std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
 
   std::vector<NDArrayAttrs> in_arrs;
+  std::string desc;
   for (auto shape : shapes) {
     // Type 1.
     NDArray arr(shape, Context());
     in_arrs.emplace_back(arr, "Normal NDArray");
-    init_fn(&in_arrs.back().arr, false);
+    init_fn(&in_arrs.back().arr, rand);
     for (auto pd : pds) {
       if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
         continue;
 
       // Type 2, 3.
       arr = NDArray(shape, Context());
-      in_arrs.emplace_back(arr, "MKLDNN NDArray");
+      desc = "MKLDNN NDArray";
+      if (shape.ndim() != pd.desc().data.ndims) {
+        std::stringstream ss;
+        ss << "MKLDNN NDArray with different memory layout " <<
+           shape.ndim() << "/" << pd.desc().data.ndims;
+        desc = ss.str();
+      }
+      in_arrs.emplace_back(arr, desc);
       InitMKLDNNArray(&in_arrs.back().arr, pd, init_fn);
 
       // Type 4, 5, 6.
       arr = NDArray(shape, Context());
+      desc = "Reshaped MKLDNN NDArray";
+      if (shape.ndim() != pd.desc().data.ndims) {
+        std::stringstream ss;
+        ss << "Reshaped MKLDNN NDArray with different memory layout "
+           << shape.ndim() << "/" << pd.desc().data.ndims;
+        desc = ss.str();
+      }
       InitMKLDNNArray(&arr, pd, init_fn);
-      in_arrs.emplace_back(arr.Slice(1, arr.shape()[0] - 1), "Reshaped MKLDNN NDArray");
+      in_arrs.emplace_back(arr.Slice(1, arr.shape()[0] - 1), desc);
     }
   }
   return in_arrs;
@@ -495,6 +511,7 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
                                          const std::vector<mkldnn::memory::primitive_desc> &pds,
                                          const InitFunc init_fn) {
   std::vector<NDArrayAttrs> in_arrs;
+  std::string desc;
   // Type 1.
   NDArray arr(shape, Context());
   in_arrs.emplace_back(arr, "Normal NDArray");
@@ -538,7 +555,14 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
 
     // Type 2, 3.
     arr = NDArray(shape, Context());
-    in_arrs.emplace_back(arr, "MKLDNN NDArray");
+    desc = "MKLDNN NDArray";
+    if (shape.ndim() != pd.desc().data.ndims) {
+      std::stringstream ss;
+      ss << "MKLDNN NDArray with different memory layout "
+         << shape.ndim() << "/" << pd.desc().data.ndims;
+      desc = ss.str();
+    }
+    in_arrs.emplace_back(arr, desc);
     InitMKLDNNArray(&in_arrs.back().arr, pd, init_fn, true);
 
     // Type 8, 9.
@@ -548,7 +572,14 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
     NDArray arr = NDArray(s, Context());
     arr = arr.AsArray(shape, arr.dtype());
     InitMKLDNNArray(&arr, pd, init_fn, true);
-    in_arrs.emplace_back(arr, "Reused MKLDNN NDArray");
+    desc = "Reused MKLDNN NDArray";
+    if (shape.ndim() != pd.desc().data.ndims) {
+      std::stringstream ss;
+      ss << "Reused MKLDNN NDArray with different memory layout "
+         << shape.ndim() << "/" << pd.desc().data.ndims;
+      desc = ss.str();
+    }
+    in_arrs.emplace_back(arr, desc);
   }
   return in_arrs;
 }
@@ -587,7 +618,7 @@ void VerifySumResult(const std::vector<NDArray *> &in_arrs, const NDArray &arr)
   mshadow::default_real_t *d2 = in2.data().dptr<mshadow::default_real_t>();
   mshadow::default_real_t *o = out.data().dptr<mshadow::default_real_t>();
   for (size_t i = 0; i < in1.shape().Size(); i++)
-    EXPECT_EQ(d1[i] + d2[i], o[i]);
+    ASSERT_EQ(d1[i] + d2[i], o[i]);
 }
 
 void PrintVerifyMsg(const NDArrayAttrs &arr1, const NDArrayAttrs &arr2) {
@@ -736,4 +767,55 @@ TEST(IMPERATIVE, BinaryOp) {
   TestBinaryOp(attrs, VerifySumResult);
 }
 
+void VerifySumMemory(mkldnn::memory in_mem1, mkldnn::memory in_mem2, mkldnn::memory out_mem) {
+  float *in1 = static_cast<float*>(in_mem1.get_data_handle());
+  float *in2 = static_cast<float*>(in_mem2.get_data_handle());
+  float *out = static_cast<float*>(out_mem.get_data_handle());
+  for (size_t i = 0; i < in_mem1.get_primitive_desc().get_size() / sizeof(float); i++) {
+    ASSERT_EQ(in1[i] + in2[i], out[i]);
+  }
+}
+
+TEST(MKLDNN_BASE, MKLDNNSum) {
+  std::vector<NDArrayAttrs> in_arrs = GetTestInputArrays(InitDefaultArray);
+  std::vector<NDArrayAttrs> in_arrs2 = GetTestInputArrays(InitDefaultArray, true);
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  for (size_t i = 0; i < in_arrs.size(); i++) {
+    auto in_arr = in_arrs[i];
+    auto in_arr2 = in_arrs2[i];
+    std::vector<NDArrayAttrs> out_arrs = GetTestOutputArrays(in_arr.arr.shape(), pds,
+                                                             InitDefaultArray);
+    if (!SupportMKLDNN(in_arr.arr) || !in_arr.arr.IsMKLDNNData() || in_arr.arr.IsView())
+      continue;
+
+    for (auto out_arr : out_arrs) {
+      auto in_mem1 = in_arr.arr.GetMKLDNNData();
+      auto in_mem2 = in_arr.arr.GetMKLDNNData();
+      auto out_mem = out_arr.arr.GetMKLDNNData(in_mem1->get_primitive_desc());
+
+      // TODO(alexzai): remove this skip by handling the reorder inside MKLDNNSum
+      if (out_mem == nullptr)
+        continue;
+      PrintVerifyMsg(in_arr, in_arr);
+      op::MKLDNNSum(*in_mem1, *in_mem2, *out_mem);
+      MKLDNNStream::Get()->Submit();
+      VerifySumMemory(*in_mem1, *in_mem2, *out_mem);
+    }
+
+    // in place
+    auto input_mem = in_arr.arr.GetMKLDNNData();
+    auto input_mem2 = in_arr2.arr.GetMKLDNNData();
+    NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy");
+    PrintVerifyMsg(orig_arr, in_arr);
+    InitMKLDNNArray(&orig_arr.arr, input_mem->get_primitive_desc(), InitDefaultArray);
+    orig_arr.arr.CopyFrom(*input_mem);
+    auto old_mem = orig_arr.arr.GetMKLDNNData();
+    op::MKLDNNSum(*input_mem, *input_mem2, *input_mem);
+    MKLDNNStream::Get()->Submit();
+    VerifySumMemory(*old_mem, *input_mem2, *input_mem);
+  }
+}
+
 #endif
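
The in-place branch of the test above relies on snapshotting the input before the operator runs, since the sum overwrites its first operand. A generic sketch of that verification pattern, in plain C++ without MKLDNN:

#include <cassert>
#include <cstddef>
#include <vector>

// Verify an in-place elementwise sum a += b by comparing against a copy taken beforehand.
int main() {
  std::vector<float> a = {1.f, 2.f, 3.f};
  const std::vector<float> b = {10.f, 20.f, 30.f};
  const std::vector<float> a_orig = a;                   // snapshot before the in-place op
  for (size_t i = 0; i < a.size(); ++i) a[i] += b[i];    // the "operator" under test
  for (size_t i = 0; i < a.size(); ++i) assert(a[i] == a_orig[i] + b[i]);
  return 0;
}
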
diff --git a/tests/cpp/storage/storage_test.cc b/tests/cpp/storage/storage_test.cc
index 269480b83c3..026c3660f32 100644
--- a/tests/cpp/storage/storage_test.cc
+++ b/tests/cpp/storage/storage_test.cc
@@ -1,5 +1,4 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
+/* * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
@@ -22,6 +21,7 @@
  * \file storage_test.cc
  * \brief cpu/gpu storage tests
 */
+#include <stdlib.h>
 #include <gtest/gtest.h>
 #include <dmlc/logging.h>
 #include <mxnet/storage.h>
@@ -43,7 +43,37 @@ TEST(Storage, Basic_CPU) {
 }
 
 #if MXNET_USE_CUDA
-TEST(Storage, Basic_GPU) {
+TEST(Storage_GPU, Basic_GPU) {
+  if (mxnet::test::unitTestsWithCuda) {
+    putenv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF=20");
+    putenv("MXNET_GPU_MEM_POOL_TYPE=Round");
+    auto &&storage = mxnet::Storage::Get();
+    mxnet::Context context_gpu = mxnet::Context::GPU(0);
+    auto &&handle = storage->Alloc(32, context_gpu);
+    auto &&handle2 = storage->Alloc(2097153, context_gpu);
+    EXPECT_EQ(handle.ctx, context_gpu);
+    EXPECT_EQ(handle.size, 32);
+    EXPECT_EQ(handle2.ctx, context_gpu);
+    EXPECT_EQ(handle2.size, 2097153);
+    auto ptr = handle.dptr;
+    auto ptr2 = handle2.dptr;
+    storage->Free(handle);
+    storage->Free(handle2);
+
+    handle = storage->Alloc(4095, context_gpu);
+    EXPECT_EQ(handle.ctx, context_gpu);
+    EXPECT_EQ(handle.size, 4095);
+    EXPECT_EQ(handle.dptr, ptr);
+    storage->Free(handle);
+
+    handle2 = storage->Alloc(3145728, context_gpu);
+    EXPECT_EQ(handle2.ctx, context_gpu);
+    EXPECT_EQ(handle2.size, 3145728);
+    EXPECT_EQ(handle2.dptr, ptr2);
+    storage->Free(handle2);
+    unsetenv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF");
+    unsetenv("MXNET_GPU_MEM_POOL_TYPE");
+  }
   if (mxnet::test::unitTestsWithCuda) {
     constexpr size_t kSize = 1024;
     mxnet::Context context_gpu = mxnet::Context::GPU(0);
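
For reference, the pointer-reuse expectations in the test above follow from the rounded pool's bucket math, assuming the default 4096-byte page size: 32 bytes and 4095 bytes both round up to the 4096-byte page bucket, so the second allocation receives the pointer just freed; 2097153 bytes exceeds the 2^20 cutoff set by the test and rounds up to the 3*2^20 = 3145728-byte linear bucket, which is exactly the size requested afterwards, so that pointer is reused as well.
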
diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
index 453161fcfe7..126ccabaa7b 100644
--- a/tests/python/gpu/test_forward.py
+++ b/tests/python/gpu/test_forward.py
@@ -22,7 +22,7 @@
 from mxnet.test_utils import *
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import utils
 
 def _get_model():
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 273ad3d69ca..d4f6f31a30e 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -27,7 +27,7 @@
 import unittest
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py
index a6e8ebf658a..76231fbe90e 100644
--- a/tests/python/gpu/test_kvstore_gpu.py
+++ b/tests/python/gpu/test_kvstore_gpu.py
@@ -24,7 +24,7 @@
 from mxnet.test_utils import assert_almost_equal, default_context
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 shape = (4, 4)
 keys = [5, 7, 11]
@@ -83,7 +83,7 @@ def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
         check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], is_same_rowid=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)])
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], is_same_rowid=True)
-        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True) 
+        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], use_slice=True)
 
     # test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/9384
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 7c3d670ba22..ed4aaa43782 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -32,7 +32,7 @@
 
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from test_operator import *
 from test_optimizer import *
 from test_random import *
@@ -740,8 +740,8 @@ def test_pooling_with_type():
 
 @with_seed()
 def test_pooling_versions():
-    def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride,
-                                     pooling_convention='valid', global_pool=False, p_value=2):
+    def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, pooling_convention='valid',
+                                     global_pool=False, p_value=2, count_include_pad=True, tol=None):
         ctx_list = []
         sym_list = []
         # PoolingV1 cpu
@@ -765,61 +765,69 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str
             ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, name='pool', p_value=p_value))
+                                               pooling_convention=pooling_convention, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
             else:
-                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool', p_value=p_value))
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
         # Pooling gpu
         if 'pool_gpu' in pool_op_list:
             ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, cudnn_off=True, name='pool', p_value=p_value))
+                                               pooling_convention=pooling_convention, cudnn_off=True, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
             else:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True,
-                                               name='pool', p_value=p_value))
+                                               name='pool', p_value=p_value, count_include_pad=count_include_pad))
         # CuDNNPooling
         if 'pool_cudnn' in pool_op_list:
             ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, p_value=p_value, cudnn_off=False, name='pool'))
+                                               pooling_convention=pooling_convention, p_value=p_value, cudnn_off=False,
+                                               name='pool', count_include_pad=count_include_pad))
             else:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, p_value=p_value,
-                                               cudnn_off=False, name='pool'))
-        check_consistency(sym_list, ctx_list)
+                                               cudnn_off=False, name='pool', count_include_pad=count_include_pad))
+        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol)
 
-    def test_1d_pooling(pool_type, p_value=2):
+    def test_1d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20)
         kernel = (4,)
         pad = (0,)
         stride = (1,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2,)
         stride = (2,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0,)
         stride = (1,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2,)
         stride = (2,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     global_pool=True, p_value=p_value)
+                                     global_pool=True, p_value=p_value, count_include_pad=count_include_pad)
 
-    def test_2d_pooling(pool_type, p_value=2):
+    def test_2d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20, 20)
         kernel = (4, 5)
         pad = (0, 0)
@@ -831,14 +839,15 @@ def test_2d_pooling(pool_type, p_value=2):
         else:
             test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         pooling_convention='valid', global_pool=False)
+                                         pooling_convention='valid', global_pool=False, count_include_pad=count_include_pad)
 
         # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
         pad = (2, 3)
         stride = (2, 3)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0, 0)
         stride = (1, 1)
@@ -847,16 +856,24 @@ def test_2d_pooling(pool_type, p_value=2):
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                          pooling_convention='full', global_pool=False, p_value=p_value)
         else:
-            test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
-                                         data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         pooling_convention='full', global_pool=False)
+            if count_include_pad:
+                test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                             data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                             pooling_convention='full', global_pool=False,
+                                             count_include_pad=count_include_pad)
+            else:
+                test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                             data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                             pooling_convention='full', global_pool=False,
+                                             count_include_pad=count_include_pad)
 
         # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
         pad = (2, 3)
         stride = (2, 3)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         if pool_type == 'lp':
             test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
@@ -865,55 +882,62 @@ def test_2d_pooling(pool_type, p_value=2):
         else:
             test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         global_pool=True)
+                                         global_pool=True, count_include_pad=count_include_pad)
 
-    def test_3d_pooling(pool_type, p_value=2):
+    def test_3d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20, 20, 20)
         kernel = (4, 5, 3)
         pad = (0, 0, 0)
         stride = (1, 1, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2, 3, 3)
         stride = (2, 3, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0, 0, 0)
         stride = (1, 1, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2, 3, 3)
         stride = (2, 3, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     global_pool=True, p_value=p_value)
+                                     global_pool=True, p_value=p_value, count_include_pad=count_include_pad)
 
     test_1d_pooling('max')
-    test_1d_pooling('avg')
+    test_1d_pooling('avg', count_include_pad=True)
+    test_1d_pooling('avg', count_include_pad=False)
     test_1d_pooling('sum')
     test_1d_pooling('lp', p_value=1)
     test_1d_pooling('lp', p_value=2)
     test_1d_pooling('lp', p_value=3)
 
     test_2d_pooling('max')
-    test_2d_pooling('avg')
+    test_2d_pooling('avg', count_include_pad=True)
+    test_2d_pooling('avg', count_include_pad=False)
     test_2d_pooling('sum')
     test_2d_pooling('lp', p_value=1)
     test_2d_pooling('lp', p_value=2)
     test_2d_pooling('lp', p_value=3)
 
     test_3d_pooling('max')
-    test_3d_pooling('avg')
+    test_3d_pooling('avg', count_include_pad=True)
+    test_3d_pooling('avg', count_include_pad=False)
     test_3d_pooling('sum')
     test_3d_pooling('lp', p_value=1)
     test_3d_pooling('lp', p_value=2)
diff --git a/tests/python/mkl/test_quantization_mkldnn.py b/tests/python/mkl/test_quantization_mkldnn.py
new file mode 100644
index 00000000000..290f1a195c2
--- /dev/null
+++ b/tests/python/mkl/test_quantization_mkldnn.py
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import sys
+import mxnet as mx
+
+os.environ['ENABLE_MKLDNN_QUANTIZATION_TEST'] = '1'
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../quantization'))
+from test_quantization import *
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 7b08f46e836..15e8582b9ee 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -18,6 +18,7 @@
 """Some of the tests using CUDNN require a special GPU instruction called dp4a.
 Ref: http://images.nvidia.com/content/pdf/tesla/184457-Tesla-P4-Datasheet-NV-Final-Letter-Web.pdf
 """
+import os
 import mxnet as mx
 import numpy as np
 from mxnet.test_utils import assert_almost_equal, rand_ndarray, rand_shape_nd, same, DummyIter
@@ -25,6 +26,16 @@
 from mxnet.module import Module
 from mxnet.io import NDArrayIter
 
+def is_test_for_gpu():
+    return mx.current_context().device_type == 'gpu'
+
+def is_test_for_mkldnn():
+    return (mx.current_context().device_type == 'cpu'
+            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == '1')
+
+def is_test_for_native_cpu():
+    return (mx.current_context().device_type == 'cpu'
+            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') is None)
 
 @with_seed()
 def test_quantize_float32_to_int8():
@@ -120,187 +131,220 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None):
 
 @with_seed()
 def test_quantized_conv():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_conv on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias):
-        with mx.Context('gpu', 0):
-            # run fp32 conv
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride,
-                                        no_bias=no_bias, cudnn_off=False, name='conv2d')
-            arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
-            arg_names = conv2d.list_arguments()
-            conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
-            conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                           shape=data_shape).astype('int32')
-            conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                           shape=arg_shapes[1]).astype('int32')
-            if not no_bias:
-                conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                               shape=arg_shapes[2]).astype('int32')
-            output = conv_exe_fp32.forward()[0]
-
-            # run quantized conv
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            qweight = mx.sym.Variable(name='qweight', dtype='int8')
-            min_data = mx.sym.Variable(name='min_data')
-            max_data = mx.sym.Variable(name='max_data')
-            min_weight = mx.sym.Variable(name='min_weight')
-            max_weight = mx.sym.Variable(name='max_weight')
-            quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data,
-                                                             max_data=max_data, min_weight=min_weight,
-                                                             max_weight=max_weight, kernel=kernel,
-                                                             num_filter=num_filter, pad=pad, stride=stride,
-                                                             no_bias=no_bias)
-            qarg_names = quantized_conv2d.list_arguments()
-            type_dict = None
-            if not no_bias:
-                type_dict = {qarg_names[2]: 'int8'}
-            conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
-            conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype('int8')
-            conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
-            quantized_range = 127.0
-            if no_bias:
-                conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
-            else:
-                conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
-                conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
-            qoutput, min_range, max_range = conv_exe_int8.forward()
-
-            if no_bias:
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            else:
-                # with adding bias, accuracy loss should not be greater than one
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True)
-    check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False)
+    def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias, qdtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_conv for native cpu since it is not supported yet')
+            return
+        elif qdtype == 'int8' and is_test_for_mkldnn():
+            print('skipped testing quantized_conv for mkldnn cpu int8 since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_conv for gpu uint8 since it is not supported yet')
+            return
+
+        # run fp32 conv
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride,
+                                    no_bias=no_bias, cudnn_off=False, name='conv2d')
+        arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
+        arg_names = conv2d.list_arguments()
+        conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                        shape=data_shape).astype('int32')
+        conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                        shape=arg_shapes[1]).astype('int32')
+        if not no_bias:
+            conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                            shape=arg_shapes[2]).astype('int32')
+        output = conv_exe_fp32.forward()[0]
+
+        # run quantized conv
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        qweight = mx.sym.Variable(name='qweight', dtype='int8')
+        min_data = mx.sym.Variable(name='min_data')
+        max_data = mx.sym.Variable(name='max_data')
+        min_weight = mx.sym.Variable(name='min_weight')
+        max_weight = mx.sym.Variable(name='max_weight')
+        quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data,
+                                                            max_data=max_data, min_weight=min_weight,
+                                                            max_weight=max_weight, kernel=kernel,
+                                                            num_filter=num_filter, pad=pad, stride=stride,
+                                                            no_bias=no_bias)
+        qarg_names = quantized_conv2d.list_arguments()
+        type_dict = None
+        if not no_bias:
+            type_dict = {qarg_names[2]: 'int8'}
+        conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+        conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype(qdtype)
+        conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
+        quantized_range = 127.0
+        if no_bias:
+            conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
+        else:
+            conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
+            conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
+        qoutput, min_range, max_range = conv_exe_int8.forward()
+
+        if no_bias:
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        else:
+            # with bias added, fp32 and int8 outputs may differ slightly;
+            # tolerate an absolute difference of at most 2 between the two
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
 
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True, qdtype)
+        check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False, qdtype)
 
 @with_seed()
 def test_quantized_pooling():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_pooling on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool):
-        with mx.Context('gpu', 0):
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
-                                          pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
-            arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
-            arg_names = pooling_fp32.list_arguments()
-            pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
-            pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                              shape=data_shape).astype('int32')
-            output = pooling_fp32_exe.forward()[0]
-
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            min_data = mx.sym.Variable(name='min_data')
-            max_data = mx.sym.Variable(name='max_data')
-            quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
-                                                                 max_data=max_data, kernel=kernel,
-                                                                 pad=pad, stride=stride, pool_type=pool_type,
-                                                                 global_pool=global_pool)
-            pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
-            qarg_names = quantized_pooling.list_arguments()
-            pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype('int8')
-            quantized_range = 127.0
-            pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
-            pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
-            qoutput, min_range, max_range = pooling_int8_exe.forward()
-
-            if pool_type == 'max':
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False)
-    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True)
-    check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False)
-    check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True)
-
+    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_pooling for native cpu since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_pooling for gpu uint8 since it is not supported yet')
+            return
+
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
+                                        pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
+        arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
+        arg_names = pooling_fp32.list_arguments()
+        pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                            shape=data_shape).astype('int32')
+        output = pooling_fp32_exe.forward()[0]
+
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        min_data = mx.sym.Variable(name='min_data')
+        max_data = mx.sym.Variable(name='max_data')
+        quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
+                                                                max_data=max_data, kernel=kernel,
+                                                                pad=pad, stride=stride, pool_type=pool_type,
+                                                                global_pool=global_pool)
+        pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
+        qarg_names = quantized_pooling.list_arguments()
+        pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
+        quantized_range = 127.0
+        pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
+        pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
+        qoutput, min_range, max_range = pooling_int8_exe.forward()
+
+        if pool_type == 'max':
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
+
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False, qdtype)
+        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True, qdtype)
+        check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False, qdtype)
+        check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True, qdtype)
 
 @with_seed()
 def test_quantized_fc():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_fc on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True):
-        with mx.Context('gpu', 0):
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
-            arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
-            arg_names = fc_fp32.list_arguments()
-            fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
-            fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                         shape=data_shape).astype('int32')
-            fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                         shape=arg_shapes[1]).astype('int32')
-            if not no_bias:
-                fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                             shape=arg_shapes[2]).astype('int32')
-            output = fc_fp32_exe.forward()[0]
-
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
-                                                               no_bias=no_bias, flatten=flatten)
-            qarg_names = fc_int8.list_arguments()
-            type_dict = {qarg_names[1]: 'int8'}
-            if not no_bias:
-                type_dict.update({qarg_names[2]: 'int8'})
-            fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
-            fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype('int8')
-            fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
-            quantized_range = 127.0
-            if no_bias:
-                fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
-            else:
-                fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
-                fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
-            qoutput, min_range, max_range = fc_int8_exe.forward()
-
-            if no_bias:
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            else:
-                # with adding bias, accuracy loss should not be greater than one
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_fc((32, 512, 2, 2), 100, True)
-    check_quantized_fc((32, 111, 2, 2), 100, True)
-    check_quantized_fc((32, 512, 2, 2), 100, False)
-    check_quantized_fc((32, 111, 2, 2), 100, False)
+    def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
+        if mx.current_context().device_type != 'gpu':
+            print('skipped testing quantized_fc on cpu since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
+            return
+
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
+        arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
+        arg_names = fc_fp32.list_arguments()
+        fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                     shape=data_shape).astype('int32')
+        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                     shape=arg_shapes[1]).astype('int32')
+        if not no_bias:
+            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                         shape=arg_shapes[2]).astype('int32')
+        output = fc_fp32_exe.forward()[0]
+
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
+                                                           no_bias=no_bias, flatten=flatten)
+        qarg_names = fc_int8.list_arguments()
+        type_dict = {qarg_names[1]: 'int8'}
+        if not no_bias:
+            type_dict.update({qarg_names[2]: 'int8'})
+        fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+        fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
+        fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
+        quantized_range = 127.0
+        if no_bias:
+            fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
+        else:
+            fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
+            fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
+        qoutput, min_range, max_range = fc_int8_exe.forward()
+
+        if no_bias:
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        else:
+            # with bias added, fp32 and int8 outputs may differ slightly;
+            # tolerate an absolute difference of at most 2 between the two
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
 
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_fc((32, 512, 2, 2), 100, True, qdtype)
+        check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
+        check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
+        check_quantized_fc((32, 111, 2, 2), 100, False, qdtype)
 
 @with_seed()
 def test_quantized_flatten():
-    def check_quantized_flatten(shape):
-        qdata = mx.nd.random.uniform(low=-127, high=127, shape=shape).astype('int8')
+    def check_quantized_flatten(shape, qdtype):
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        qdata = mx.nd.random.uniform(low=data_low, high=data_high, shape=shape).astype(qdtype)
         min_data = mx.nd.array([-1023.343], dtype='float32')
         max_data = mx.nd.array([2343.324275], dtype='float32')
         qoutput, min_output, max_output = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data)
@@ -311,10 +355,11 @@ def check_quantized_flatten(shape):
         assert same(min_data.asnumpy(), min_output.asnumpy())
         assert same(max_data.asnumpy(), max_output.asnumpy())
 
-    check_quantized_flatten((10,))
-    check_quantized_flatten((10, 15))
-    check_quantized_flatten((10, 15, 18))
-    check_quantized_flatten((3, 4, 23, 23))
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_flatten((10,), qdtype)
+        check_quantized_flatten((10, 15), qdtype)
+        check_quantized_flatten((10, 15, 18), qdtype)
+        check_quantized_flatten((3, 4, 23, 23), qdtype)
 
 
 @with_seed()
@@ -353,56 +398,69 @@ def get_fp32_sym():
 
 @with_seed()
 def test_quantize_model():
-    def check_params(params, qparams, qsym=None):
-        if qsym is None:
-            assert len(params) == len(qparams)
-            for k, v in params.items():
-                assert k in qparams
-                assert same(v.asnumpy(), qparams[k].asnumpy())
-        else:
-            qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
-            assert len(qparams) == len(qparams_ground_truth)
-            for k, v in qparams_ground_truth.items():
-                assert k in qparams
-                assert same(v.asnumpy(), qparams[k].asnumpy())
-
-    def check_qsym_calibrated(qsym):
-        attrs = qsym.attr_dict()
-        for k, v in attrs.items():
-            if k.find('requantize_') != -1:
-                assert 'min_calib_range' in v
-                assert 'max_calib_range' in v
-
-    sym = get_fp32_sym()
-    mod = Module(symbol=sym)
-    batch_size = 4
-    data_shape = (batch_size, 4, 10, 10)
-    label_shape = (batch_size, 10)
-    mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
-    mod.init_params()
-    arg_params, aux_params = mod.get_params()
-    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
-                                                                     arg_params=arg_params,
-                                                                     aux_params=aux_params,
-                                                                     ctx=mx.current_context(),
-                                                                     calib_mode='none')
-    check_params(arg_params, qarg_params, qsym)
-    check_params(aux_params, qaux_params)
-
-    calib_data = mx.nd.random.uniform(shape=data_shape)
-    calib_data = NDArrayIter(data=calib_data)
-    calib_data = DummyIter(calib_data)
-    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
-                                                                     arg_params=arg_params,
-                                                                     aux_params=aux_params,
-                                                                     ctx=mx.current_context(),
-                                                                     calib_mode='naive',
-                                                                     calib_data=calib_data,
-                                                                     num_calib_examples=20)
-    check_params(arg_params, qarg_params, qsym)
-    check_params(aux_params, qaux_params)
-    check_qsym_calibrated(qsym)
-
+    def check_quantize_model(qdtype):
+        def check_params(params, qparams, qsym=None):
+            if qsym is None:
+                assert len(params) == len(qparams)
+                for k, v in params.items():
+                    assert k in qparams
+                    assert same(v.asnumpy(), qparams[k].asnumpy())
+            else:
+                qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
+                assert len(qparams) == len(qparams_ground_truth)
+                for k, v in qparams_ground_truth.items():
+                    assert k in qparams
+                    assert same(v.asnumpy(), qparams[k].asnumpy())
+
+        def check_qsym_calibrated(qsym):
+            attrs = qsym.attr_dict()
+            for k, v in attrs.items():
+                if k.find('requantize_') != -1:
+                    assert 'min_calib_range' in v
+                    assert 'max_calib_range' in v
+
+        def check_qsym_qdtype(qsym, qdtype):
+            attrs = qsym.attr_dict()
+            for k, v in attrs.items():
+                if k.find('_quantize') != -1:
+                    assert 'out_type' in v
+                    assert v['out_type'] == qdtype
+
+        sym = get_fp32_sym()
+        mod = Module(symbol=sym)
+        batch_size = 4
+        data_shape = (batch_size, 4, 10, 10)
+        label_shape = (batch_size, 10)
+        mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
+        mod.init_params()
+        arg_params, aux_params = mod.get_params()
+        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                         arg_params=arg_params,
+                                                                         aux_params=aux_params,
+                                                                         ctx=mx.current_context(),
+                                                                         quantized_dtype=qdtype,
+                                                                         calib_mode='none')
+        check_params(arg_params, qarg_params, qsym)
+        check_params(aux_params, qaux_params)
+
+        calib_data = mx.nd.random.uniform(shape=data_shape)
+        calib_data = NDArrayIter(data=calib_data)
+        calib_data = DummyIter(calib_data)
+        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                         arg_params=arg_params,
+                                                                         aux_params=aux_params,
+                                                                         ctx=mx.current_context(),
+                                                                         quantized_dtype=qdtype,
+                                                                         calib_mode='naive',
+                                                                         calib_data=calib_data,
+                                                                         num_calib_examples=20)
+        check_params(arg_params, qarg_params, qsym)
+        check_params(aux_params, qaux_params)
+        check_qsym_calibrated(qsym)
+        check_qsym_qdtype(qsym, qdtype)
+
+    for qdtype in ['int8', 'uint8']:
+        check_quantize_model(qdtype)
 
 @with_seed()
 def test_quantize_sym_with_calib():
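
A pattern worth calling out in the tests above: inputs are integer-valued fp32 tensors drawn inside the quantized range, and the calibration min/max is pinned at +/-127, so the symmetric int8 scale is exactly 1 and quantization is lossless — which is what lets the fp32 and int8 paths be compared (nearly) exactly. A minimal numpy sketch of that arithmetic (it mirrors the idea, not MXNet's actual quantize operator):

    import numpy as np

    def quantize_int8(x, min_range, max_range):
        # symmetric int8 quantization: the larger of |min|, |max| maps to 127
        scale = 127.0 / max(abs(min_range), abs(max_range))
        return np.clip(np.round(x * scale), -127, 127).astype(np.int8)

    x = np.random.randint(-127, 128, size=(3, 4)).astype(np.float32)
    # with calibration pinned at +/-127 the scale is 1.0, so integer-valued
    # fp32 inputs survive quantization bit-exactly
    assert (quantize_int8(x, -127.0, 127.0) == x.astype(np.int8)).all()
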
diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py
index 635bdcc6092..b38c851984d 100644
--- a/tests/python/unittest/common.py
+++ b/tests/python/unittest/common.py
@@ -241,3 +241,11 @@ def __enter__(self):
 
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self._dirname)
+
+def teardown():
+    """
+    A function with a 'magic name' that nose runs automatically after each test module.
+
+    It waits for all operations in the module to finish before carrying on to the next.
+    """
+    mx.nd.waitall()
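
A note on why the new hook works: nose treats a module-level function named teardown as a fixture and calls it once after the module's last test, so importing it by name (as the diffs below do) is all that is needed. A hedged sketch of a test module wired this way (import paths assumed to match tests/python/unittest):

    import mxnet as mx
    from common import setup_module, with_seed, teardown  # noqa: F401 -- nose finds these by name

    @with_seed()
    def test_example():
        mx.nd.ones((2, 2)).asnumpy()

    # after the last test in this module, nose calls teardown(), whose
    # mx.nd.waitall() keeps pending async work from bleeding into the next module
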
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index c2d0d26c9df..2f889845af3 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -20,7 +20,7 @@
 from mxnet.ndarray import zeros_like
 from mxnet.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def grad_and_loss(func, argnum=None):
diff --git a/tests/python/unittest/test_contrib_autograd.py b/tests/python/unittest/test_contrib_autograd.py
index 9e80bba306d..1c878e322e7 100644
--- a/tests/python/unittest/test_contrib_autograd.py
+++ b/tests/python/unittest/test_contrib_autograd.py
@@ -18,7 +18,7 @@
 import mxnet.ndarray as nd
 from mxnet.contrib.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def autograd_assert(*args, **kwargs):
     func   = kwargs["func"]
diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py
index bbfed947852..e9e161d7f3b 100644
--- a/tests/python/unittest/test_exc_handling.py
+++ b/tests/python/unittest/test_exc_handling.py
@@ -18,7 +18,7 @@
 import mxnet as mx
 import numpy as np
 from mxnet import gluon
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import nn
 from mxnet.base import MXNetError
 from mxnet.test_utils import assert_exception, default_context, set_default_context
diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py
index 05e71b426eb..630cad87496 100644
--- a/tests/python/unittest/test_executor.py
+++ b/tests/python/unittest/test_executor.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 import mxnet as mx
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def reldiff(a, b):
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index ced3063448b..e9259fde4b3 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -20,9 +20,8 @@
 from mxnet.gluon import nn
 from mxnet.test_utils import assert_almost_equal
 from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 import numpy as np
-from numpy.testing import assert_array_equal
 from nose.tools import raises, assert_raises
 from copy import deepcopy
 import warnings
@@ -202,20 +201,20 @@ def forward(self, x):
     net1.collect_params().initialize()
     net2(mx.nd.zeros((3, 5)))
 
-    net1.save_params('net1.params')
+    net1.save_parameters('net1.params')
 
     net3 = Net(prefix='net3_')
-    net3.load_params('net1.params', mx.cpu())
+    net3.load_parameters('net1.params', mx.cpu())
 
     net4 = Net(prefix='net4_')
     net5 = Net(prefix='net5_', in_units=5, params=net4.collect_params())
     net4.collect_params().initialize()
     net5(mx.nd.zeros((3, 5)))
 
-    net4.save_params('net4.params')
+    net4.save_parameters('net4.params')
 
     net6 = Net(prefix='net6_')
-    net6.load_params('net4.params', mx.cpu())
+    net6.load_parameters('net4.params', mx.cpu())
 
 
 @with_seed()
@@ -359,6 +358,7 @@ def test_sparse_hybrid_block():
 
 @with_seed()
 def check_layer_forward(layer, dshape):
+    print("checking layer {}\nshape: {}.".format(layer, dshape))
     layer.collect_params().initialize()
     x = mx.nd.ones(shape=dshape)
     x.attach_grad()
@@ -438,7 +438,7 @@ def test_deconv():
         nn.Conv2DTranspose(16, (3, 4), groups=2, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), strides=4, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), dilation=4, in_channels=4),
-        nn.Conv2DTranspose(16, (3, 4), padding=4, in_channels=4),
+    #   nn.Conv2DTranspose(16, (3, 4), padding=4, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), strides=4, output_padding=3, in_channels=4),
         ]
     for layer in layers2d:
@@ -470,6 +470,7 @@ def test_pool():
         nn.MaxPool1D(3),
         nn.MaxPool1D(3, 2),
         nn.AvgPool1D(),
+        nn.AvgPool1D(count_include_pad=False),
         nn.GlobalAvgPool1D(),
         ]
     for layer in layers1d:
@@ -481,6 +482,7 @@ def test_pool():
         nn.MaxPool2D((3, 3)),
         nn.MaxPool2D(3, 2),
         nn.AvgPool2D(),
+        nn.AvgPool2D(count_include_pad=False),
         nn.GlobalAvgPool2D(),
         ]
     for layer in layers2d:
@@ -491,6 +493,7 @@ def test_pool():
         nn.MaxPool3D((3, 3, 3)),
         nn.MaxPool3D(3, 2),
         nn.AvgPool3D(),
+        nn.AvgPool3D(count_include_pad=False),
         nn.GlobalAvgPool3D(),
         ]
     for layer in layers3d:
@@ -776,7 +779,7 @@ def test_export():
     model = gluon.model_zoo.vision.resnet18_v1(
         prefix='resnet', ctx=ctx, pretrained=True)
     model.hybridize()
-    data = mx.nd.random.normal(shape=(1, 3, 224, 224))
+    data = mx.nd.random.normal(shape=(1, 3, 32, 32))
     out = model(data)
 
     model.export('gluon')
@@ -794,6 +797,22 @@ def test_export():
 
     assert_almost_equal(out.asnumpy(), out2.asnumpy())
 
+@with_seed()
+def test_import():
+    ctx = mx.context.current_context()
+    net1 = gluon.model_zoo.vision.resnet18_v1(
+        prefix='resnet', ctx=ctx, pretrained=True)
+    net1.hybridize()
+    data = mx.nd.random.normal(shape=(1, 3, 32, 32))
+    out1 = net1(data)
+
+    net1.export('net1', epoch=1)
+
+    net2 = gluon.SymbolBlock.imports(
+        'net1-symbol.json', ['data'], 'net1-0001.params', ctx)
+    out2 = net2(data)
+
+    assert_almost_equal(out1.asnumpy(), out2.asnumpy())
 
 @with_seed()
 def test_hybrid_stale_cache():
@@ -910,7 +929,7 @@ def test_fill_shape_load():
     net1.hybridize()
     net1.initialize(ctx=ctx)
     net1(mx.nd.ones((2,3,5,7), ctx))
-    net1.save_params('net_fill.params')
+    net1.save_parameters('net_fill.params')
 
     net2 = nn.HybridSequential()
     with net2.name_scope():
@@ -919,7 +938,7 @@ def test_fill_shape_load():
                  nn.Dense(10))
     net2.hybridize()
     net2.initialize()
-    net2.load_params('net_fill.params', ctx)
+    net2.load_parameters('net_fill.params', ctx)
     assert net2[0].weight.shape[1] == 3, net2[0].weight.shape[1]
     assert net2[1].gamma.shape[0] == 64, net2[1].gamma.shape[0]
     assert net2[2].weight.shape[1] == 3072, net2[2].weight.shape[1]
@@ -1065,12 +1084,12 @@ def test_req():
 @with_seed()
 def test_save_load():
     net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True)
-    net.save_params('test_save_load.params')
+    net.save_parameters('test_save_load.params')
 
     net = mx.gluon.model_zoo.vision.get_resnet(1, 18)
     net.output = mx.gluon.nn.Dense(1000)
 
-    net.load_params('test_save_load.params')
+    net.load_parameters('test_save_load.params')
 
 @with_seed()
 def test_symbol_block_save_load():
@@ -1095,10 +1114,10 @@ def hybrid_forward(self, F, x):
     net1.initialize(mx.init.Normal())
     net1.hybridize()
     net1(mx.nd.random.normal(shape=(1, 3, 32, 32)))
-    net1.save_params('./test_symbol_block_save_load.params')
+    net1.save_parameters('./test_symbol_block_save_load.params')
 
     net2 = Net()
-    net2.load_params('./test_symbol_block_save_load.params', ctx=mx.cpu())
+    net2.load_parameters('./test_symbol_block_save_load.params', ctx=mx.cpu())
 
 
 @with_seed()
@@ -1108,6 +1127,7 @@ def test_hybrid_multi_context():
     net.hybridize()
     net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy()
 
+
 @with_seed()
 def test_zero_grad():
     data = mx.nd.random.uniform(shape=(3,3))
@@ -1120,60 +1140,6 @@ def test_zero_grad():
     grad = net.collect_params()['test_zero_grad_weight'].grad()
     assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0)
 
-def check_hybrid_static_memory(**kwargs):
-    x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
-    x.attach_grad()
-
-    net1 = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
-    net2 = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
-    net2.hybridize(**kwargs)
-    net1(x)
-    net2(x)
-
-    def test(net, x):
-        with mx.autograd.record():
-            y = net(x) + net(x)
-            y.backward()
-
-        grads = {k: v.grad() for k, v in net.collect_params().items() if v.grad_req != 'null'}
-
-        return y, grads
-
-    y1, grads1 = test(net1, x)
-    y2, grads2 = test(net2, x)
-
-    assert_almost_equal(y1.asnumpy(), y2.asnumpy(), rtol=1e-3, atol=1e-5)
-    for key in grads1:
-        assert_almost_equal(grads1[key].asnumpy(), grads2[key].asnumpy(), rtol=1e-3, atol=1e-5)
-
-def test_hybrid_static_memory():
-    check_hybrid_static_memory()
-    check_hybrid_static_memory(static_alloc=True)
-    check_hybrid_static_memory(static_alloc=True, static_shape=True)
-
-def check_hybrid_static_memory_switching(**kwargs):
-    net = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, ctx=mx.context.current_context())
-    net.hybridize(**kwargs)
-
-    x = mx.nd.random.uniform(shape=(4, 3, 32, 32))
-    net(x)
-    with mx.autograd.record():
-        y = net(x)
-        y.backward()
-    x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
-    net(x)
-    with mx.autograd.record():
-        y = net(x)
-        y.backward()
-    mx.nd.waitall()
-
-def test_hybrid_static_memory_switching():
-    check_hybrid_static_memory_switching()
-    check_hybrid_static_memory_switching(static_alloc=True)
-    check_hybrid_static_memory_switching(static_alloc=True, static_shape=True)
 
 @with_seed()
 def test_hook():
@@ -1252,6 +1218,22 @@ def test_summary():
     assert_raises(AssertionError, net.summary, mx.nd.ones((32, 3, 224, 224)))
 
 
+@with_seed()
+def test_legacy_save_params():
+    net = gluon.nn.HybridSequential(prefix='')
+    with net.name_scope():
+        net.add(gluon.nn.Conv2D(10, (3, 3)))
+        net.add(gluon.nn.Dense(50))
+    net.initialize()
+    net(mx.nd.ones((1,1,50,50)))
+    a = net(mx.sym.var('data'))
+    a.save('test.json')
+    net.save_params('test.params')
+    model = gluon.nn.SymbolBlock(outputs=mx.sym.load_json(open('test.json', 'r').read()),
+                                 inputs=mx.sym.var('data'))
+    model.load_params('test.params', ctx=mx.cpu())
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
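
Most of the test_gluon.py churn is mechanical: save_params/load_params become save_parameters/load_parameters (the legacy pair remains loadable, per the new test_legacy_save_params). A minimal round trip with the renamed API, sketched under the assumption of an MXNet build that includes this rename:

    import mxnet as mx
    from mxnet import gluon

    net = gluon.nn.Dense(10)
    net.initialize()
    net(mx.nd.ones((1, 4)))              # run once so deferred shapes are inferred
    net.save_parameters('dense.params')  # replaces the deprecated save_params()

    net2 = gluon.nn.Dense(10)
    net2.load_parameters('dense.params', ctx=mx.cpu())
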
diff --git a/tests/python/unittest/test_gluon_contrib.py b/tests/python/unittest/test_gluon_contrib.py
index 264ff1f5e53..a1cd8ea537d 100644
--- a/tests/python/unittest/test_gluon_contrib.py
+++ b/tests/python/unittest/test_gluon_contrib.py
@@ -21,7 +21,7 @@
 from mxnet.gluon import nn
 from mxnet.gluon.contrib.nn import Concurrent, HybridConcurrent, Identity, SparseEmbedding
 from mxnet.test_utils import almost_equal
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import numpy as np
 from numpy.testing import assert_allclose
 
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index 751886b8e7f..ef2ba2ab9b2 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -23,7 +23,7 @@
 import random
 from mxnet import gluon
 import platform
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon.data import DataLoader
 import mxnet.ndarray as nd
 from mxnet import context
diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py
index fe360ac9708..2ff9c5cb2a1 100644
--- a/tests/python/unittest/test_gluon_data_vision.py
+++ b/tests/python/unittest/test_gluon_data_vision.py
@@ -22,7 +22,7 @@
 from mxnet.gluon.data.vision import transforms
 from mxnet.test_utils import assert_almost_equal
 from mxnet.test_utils import almost_equal
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 @with_seed()
@@ -66,18 +66,19 @@ def test_transformer():
     from mxnet.gluon.data.vision import transforms
 
     transform = transforms.Compose([
-		transforms.Resize(300),
-		transforms.CenterCrop(256),
-		transforms.RandomResizedCrop(224),
-		transforms.RandomFlipLeftRight(),
-		transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1),
-		transforms.RandomBrightness(0.1),
-		transforms.RandomContrast(0.1),
-		transforms.RandomSaturation(0.1),
-		transforms.RandomHue(0.1),
-		transforms.RandomLighting(0.1),
-		transforms.ToTensor(),
-		transforms.Normalize([0, 0, 0], [1, 1, 1])])
+        transforms.Resize(300),
+        transforms.Resize(300, keep_ratio=True),
+        transforms.CenterCrop(256),
+        transforms.RandomResizedCrop(224),
+        transforms.RandomFlipLeftRight(),
+        transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1),
+        transforms.RandomBrightness(0.1),
+        transforms.RandomContrast(0.1),
+        transforms.RandomSaturation(0.1),
+        transforms.RandomHue(0.1),
+        transforms.RandomLighting(0.1),
+        transforms.ToTensor(),
+        transforms.Normalize([0, 0, 0], [1, 1, 1])])
 
     transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read()
 
diff --git a/tests/python/unittest/test_gluon_model_zoo.py b/tests/python/unittest/test_gluon_model_zoo.py
index f89a8f70182..a64668451a2 100644
--- a/tests/python/unittest/test_gluon_model_zoo.py
+++ b/tests/python/unittest/test_gluon_model_zoo.py
@@ -19,7 +19,7 @@
 import mxnet as mx
 from mxnet.gluon.model_zoo.vision import get_model
 import sys
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def eprint(*args, **kwargs):
diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py
index 44d522ab9a5..0ab61bb2748 100644
--- a/tests/python/unittest/test_kvstore.py
+++ b/tests/python/unittest/test_kvstore.py
@@ -20,7 +20,7 @@
 import numpy as np
 import unittest
 from mxnet.test_utils import rand_ndarray, assert_almost_equal
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 from mxnet.base import py_str, MXNetError
 
 shape = (4, 4)
diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py
index 5a3237d2d64..14c4f6b8a16 100644
--- a/tests/python/unittest/test_loss.py
+++ b/tests/python/unittest/test_loss.py
@@ -19,7 +19,7 @@
 import numpy as np
 from mxnet import gluon
 from mxnet.test_utils import assert_almost_equal, default_context
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import unittest
 
 
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index ae950457444..802988b4329 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -21,7 +21,7 @@
 import numpy as np
 from functools import reduce
 from mxnet.module.executor_group import DataParallelExecutorGroup
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 from collections import namedtuple
 
 
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 92cdb2ced9d..aeaa0b72679 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -21,7 +21,7 @@
 import pickle as pkl
 import unittest
 from nose.tools import raises
-from common import setup_module, with_seed, assertRaises, TemporaryDirectory
+from common import setup_module, with_seed, assertRaises, TemporaryDirectory, teardown
 from mxnet.test_utils import almost_equal
 from mxnet.test_utils import assert_almost_equal, assert_exception
 from mxnet.test_utils import default_context
@@ -711,9 +711,8 @@ def get_values(ensure_unique):
                  k=dat_size*dat_size*dat_size*dat_size, is_ascend=False)
     assert_almost_equal(nd_ret_argsort, gt)
 
-    # test topk with a big shape
-    a = mx.nd.arange(0, 54686454, step=1, repeat=1)
-    assert_almost_equal(a.topk(k=54686454).asnumpy(), a.asnumpy()[::-1])
+    a = mx.nd.arange(0, 1024, step=1, repeat=1)
+    assert_almost_equal(a.topk(k=1024).asnumpy(), a.asnumpy()[::-1])
 
     # Repeat those tests that don't involve indices.  These should pass even with
     # duplicated input data values (over many repeated runs with different random seeds,
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index ab03973e8e8..f287c191963 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -25,7 +25,7 @@
 from numpy.testing import assert_allclose, assert_array_equal
 from mxnet.test_utils import *
 from mxnet.base import py_str, MXNetError
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import unittest
 
 def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req):
@@ -115,6 +115,8 @@ def test_gru_sym():
     check_rnn_consistency(fused, stack, T, N, I, H, 'add')
     check_rnn_consistency(fused, stack, T, N, I, H, 'null')
 
+
+@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11219")
 @with_seed()
 def test_gru_bidirectional():
     T, N, I, H = 5, 20, 800, 800
@@ -675,7 +677,9 @@ def fprelu_grad(x, y, gamma):
         copy_x = x.copy()
         copy_x[pos_indices] = 0.0
         grad_x[pos_indices] = 1.0
-        if gamma.shape[0] == 1:
+        if len(gamma.shape) > 1:
+            grad_gam = copy_x
+        elif gamma.shape[0] == 1:
             grad_gam = np.sum(np.sum(copy_x))
         elif gamma.shape[0] > 1:
             grad_gam = np.sum(copy_x, axis=0)
@@ -685,6 +689,7 @@ def fprelu_grad(x, y, gamma):
     gamma = mx.symbol.Variable("gamma")
     for dtype in [np.float16, np.float32, np.float64]:
         for gam in [np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype)]:
+            gam_full = np.array([gam, gam, gam])
             xa = np.random.uniform(low=-1.0,high=1.0,size=shape).astype(dtype)
             rtol = 1e-2
             atol = 1e-3
@@ -692,12 +697,18 @@ def fprelu_grad(x, y, gamma):
             xa[abs(xa) < eps] = 1.0
             y = mx.symbol.LeakyReLU(data=x, gamma=gamma, act_type='prelu')
             ya = fprelu(xa, gam)
+            ya_full = fprelu(xa, gam_full)
             g_xa, g_gam = fprelu_grad(xa, ya, gamma=gam)
+            g_xa_full, g_gam_full = fprelu_grad(xa, ya_full, gamma=gam_full)
             # Skip numeric check for float16 type to get rid of flaky behavior
             if dtype is not np.float16:
                 check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
+                check_numeric_gradient(y, [xa, gam_full], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
             check_symbolic_forward(y, [xa, gam], [ya], rtol=rtol, atol=atol, dtype=dtype)
             check_symbolic_backward(y, [xa, gam], [np.ones(shape), np.ones(gam.shape)], [g_xa, g_gam], rtol=rtol, atol=atol, dtype=dtype)
+            check_symbolic_forward(y, [xa, gam_full], [ya_full], rtol=rtol, atol=atol, dtype=dtype)
+            check_symbolic_backward(y, [xa, gam_full], [np.ones(shape), np.ones(gam_full.shape)],
+                                    [g_xa_full, g_gam_full], rtol=rtol, atol=atol, dtype=dtype)
 
 @with_seed()
 def test_sigmoid():
@@ -5821,7 +5832,7 @@ def py_bilinear_resize(x, outputHeight, outputWidth):
         batch, channel, inputHeight, inputWidth = x.shape
         if outputHeight == inputHeight and outputWidth == inputWidth:
             return x
-        y = np.empty([batch, channel, outputHeight, outputWidth]) 
+        y = np.empty([batch, channel, outputHeight, outputWidth])
         rheight = 1.0 * (inputHeight - 1) / (outputHeight - 1) if outputHeight > 1 else 0.0
         rwidth = 1.0 * (inputWidth - 1) / (outputWidth - 1) if outputWidth > 1 else 0.0
         for h2 in range(outputHeight):
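
On the LeakyReLU/PReLU change above: the test now also exercises a gamma with the same shape as the data, in which case the gradient w.r.t. gamma needs no reduction at all. A small numpy sketch of the three broadcasting cases (illustrative only, not the operator's implementation):

    import numpy as np

    def prelu_grad_gamma(x, gamma_shape):
        neg = np.where(x < 0, x, 0.0)      # per-element contribution to dL/dgamma
        if len(gamma_shape) > 1:
            return neg                     # full-shape gamma: no reduction needed
        elif gamma_shape[0] == 1:
            return neg.sum()               # scalar gamma: sum over everything
        else:
            return neg.sum(axis=0)         # per-channel gamma: sum over the batch axis

    x = np.array([[-1.0, 2.0], [3.0, -4.0]])
    print(prelu_grad_gamma(x, x.shape))    # full-shape case, as in gam_full above
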
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 90762f7620f..fba10fb522a 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -23,7 +23,7 @@
 from nose.tools import raises
 import math
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 @with_seed()
 def test_learning_rate():
@@ -230,7 +230,10 @@ def test_sgd():
                                     ('multi_precision' not in kwarg or
                                         not kwarg['multi_precision'])):
                                 continue
-                            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+                            if dtype == np.float16:
+                                compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3)
+                            else:
+                                compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
                             # test operator fallback on cpu
                             if dtype != np.float16:
                                 compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2],
@@ -420,7 +423,7 @@ def update(self, index, weight, grad, state):
               grad += wd * weight
               mom[:] += grad
               grad[:] += self.momentum * mom
-              weight[:] += -lr * grad 
+              weight[:] += -lr * grad
         else:
             grad32 = array(grad, ctx=grad.context, dtype=np.float32)
             grad32 = grad32 * self.rescale_grad
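
One more note on the optimizer change: rtol=1e-3 for float16 comparisons matches what the type can actually represent; a quick check of that assumption:

    import numpy as np

    # float16 machine epsilon is ~9.8e-4, i.e. roughly three significant
    # decimal digits, so two independently rounded fp16 results can only be
    # expected to agree to about rtol=1e-3
    print(np.finfo(np.float16).eps)   # 0.000977
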
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 7abbc9918c5..3251ba0fac8 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -22,7 +22,7 @@
 from mxnet.test_utils import verify_generator, gen_buckets_probs_with_ppf
 import numpy as np
 import random as rnd
-from common import setup_module, with_seed, random_seed
+from common import setup_module, with_seed, random_seed, teardown
 import scipy.stats as ss
 
 def same(a, b):
diff --git a/tests/python/unittest/test_recordio.py b/tests/python/unittest/test_recordio.py
index 51d80c33504..9edf9b459f7 100644
--- a/tests/python/unittest/test_recordio.py
+++ b/tests/python/unittest/test_recordio.py
@@ -22,7 +22,7 @@
 import tempfile
 import random
 import string
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 @with_seed()
 def test_recordio():
diff --git a/tests/python/unittest/test_rnn.py b/tests/python/unittest/test_rnn.py
index 52a3dcf9934..a5588250e51 100644
--- a/tests/python/unittest/test_rnn.py
+++ b/tests/python/unittest/test_rnn.py
@@ -300,7 +300,15 @@ def test_convgru():
     args, outs, auxs = outputs.infer_shape(rnn_t0_data=(1, 3, 16, 10), rnn_t1_data=(1, 3, 16, 10), rnn_t2_data=(1, 3, 16, 10))
     assert outs == [(1, 10, 16, 10), (1, 10, 16, 10), (1, 10, 16, 10)]
 
+def test_encode_sentences():
+    sentences = [['a','b','c'],['b','c','d']]
+    vocab = {'a':1, 'b':2, 'c':3}
+    result, vocab = mx.rnn.io.encode_sentences(sentences, vocab=vocab, invalid_label=-1, invalid_key='\n',
+                                               start_label=0, unknown_token='UNK')
+    print(result, vocab)
+    assert vocab == {'a': 1, 'b': 2, 'c': 3, 'UNK': 0}
+    assert result == [[1,2,3],[2,3,0]]
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
-
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index c90fb1317fe..b0c3a0cdcd2 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -19,7 +19,7 @@
 
 from mxnet.ndarray import NDArray
 from mxnet.test_utils import *
-from common import setup_module, with_seed, random_seed
+from common import setup_module, with_seed, random_seed, teardown
 from mxnet.base import mx_real_t
 from numpy.testing import assert_allclose
 import numpy.random as rnd
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index b2ff0fecb5a..62f5f3ecb07 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -16,7 +16,7 @@
 # under the License.
 
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import random
 import warnings
 


 
