Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/20 23:11:23 UTC

[GitHub] zhreshold closed pull request #11247: Add seed_aug parameter for ImageRecordItr to fix random seed for default augmentation

URL: https://github.com/apache/incubator-mxnet/pull/11247
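
The headline change adds a seed_aug parameter to ImageRecordIter so the default augmentation pipeline can be made reproducible; the build and CI portion of the diff is what appears below. A minimal usage sketch, with hypothetical paths and assuming the parameter behaves as the title describes:

    import mxnet as mx

    # With seed_aug fixed, the random augmentations (crop, mirror, ...) become
    # deterministic across runs, which helps reproducibility and debugging.
    train_iter = mx.io.ImageRecordIter(
        path_imgrec='data/train.rec',   # hypothetical RecordIO file
        data_shape=(3, 224, 224),
        batch_size=32,
        rand_crop=True,
        rand_mirror=True,
        seed_aug=42,                    # the parameter this PR introduces
    )

Two runs over the same record file with the same seed_aug should then yield identical augmented batches.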

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/3rdparty/tvm b/3rdparty/tvm
index 8f80df0c461..6ab4da67834 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 8f80df0c46188cdf227764848d8aba0cba6e8273
+Subproject commit 6ab4da6783417d8afdeb6b0426b44959b2afc709
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e57c00b69e9..4fe900d4694 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ mxnet_option(USE_NCCL             "Use NVidia NCCL with CUDA" OFF)
 mxnet_option(USE_OPENCV           "Build with OpenCV support" ON)
 mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN            "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
-mxnet_option(USE_SSE              "Build with x86 SSE instruction support" ON)
+mxnet_option(USE_SSE              "Build with x86 SSE instruction support" ON IF NOT ARM)
 mxnet_option(USE_F16C             "Build with x86 F16C instruction support" ON) # autodetects support if ON
 mxnet_option(USE_LAPACK           "Build with lapack support" ON)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
@@ -321,14 +321,15 @@ endif()
 
 # ---[ OpenCV
 if(USE_OPENCV)
-  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
+  find_package(OpenCV COMPONENTS core highgui imgproc imgcodecs)
   if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
+    message(STATUS "OpenCV imgcodecs missing")
     find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
   endif()
   include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
   list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS})
   message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}")
-  message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
+  message(STATUS "OpenCV ${OpenCV_VERSION} found (${OpenCV_CONFIG_PATH})")
   add_definitions(-DMXNET_USE_OPENCV=1)
 else(USE_OPENCV)
   message(STATUS "OpenCV Disabled")
@@ -340,7 +341,11 @@ if(USE_OPENMP)
   find_package(OpenMP REQUIRED)
   # This should build on Windows, but there's some problem and I don't have a Windows box, so
   # could a Windows user please fix?
-  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt AND SYSTEM_ARCHITECTURE STREQUAL "x86_64" AND NOT MSVC)
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt
+     AND SYSTEM_ARCHITECTURE STREQUAL "x86_64"
+     AND NOT MSVC
+     AND NOT CMAKE_CROSSCOMPILING)
+
     # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
     set(OPENMP_STANDALONE_BUILD TRUE)
     set(LIBOMP_ENABLE_SHARED TRUE)
@@ -360,7 +365,7 @@ if(USE_OPENMP)
       set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     endif()
   endif()
-elseif(UNIX)
+elseif(UNIX AND NOT ANDROID)
   list(APPEND mxnet_LINKER_LIBS pthread)
 endif()
 
@@ -575,11 +580,6 @@ if(USE_PLUGIN_CAFFE)
   endif()
 endif()
 
-if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/nnvm/CMakeLists.txt")
-  set(nnvm_LINKER_LIBS nnvm)
-  list(APPEND mxnet_LINKER_LIBS ${nnvm_LINKER_LIBS})
-endif()
-
 if(NOT MSVC)
   # Only add c++11 flags and definitions after cuda compiling
   add_definitions(-DDMLC_USE_CXX11)
@@ -648,7 +648,7 @@ if(USE_PLUGINS_WARPCTC)
 endif()
 
 
-if(USE_OPENCV)
+if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2)
   add_executable(im2rec "tools/im2rec.cc")
   if(MSVC)
     target_link_libraries(im2rec mxnet)
@@ -659,9 +659,11 @@ if(USE_OPENCV)
     ${mxnet_LINKER_LIBS}
     ${OpenCV_LIBS}
     dmlc
-    ${nnvm_LINKER_LIBS}
     ${pslite_LINKER_LIBS}
     )
+else()
+    message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}, version 3 with imgcodecs \
+    is required for im2rec, im2rec will not be available")
 endif()
 
 target_link_libraries(mxnet PUBLIC dmlc)
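
The hunk above gates im2rec on OpenCV 3.x, since the imgcodecs component only exists there. The same version probe from the Python side, as a rough sketch assuming the cv2 bindings are installed:

    import cv2

    # OpenCV 3.x split the image codecs into the imgcodecs module; im2rec needs it.
    major = int(cv2.__version__.split('.')[0])
    if major < 3:
        print('OpenCV %s found; version 3 with imgcodecs is required for im2rec'
              % cv2.__version__)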
diff --git a/Jenkinsfile b/Jenkinsfile
index 28edda00959..56fbf3d74af 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -92,6 +92,20 @@ echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
 """
 }
 
+def collect_test_results_unix(original_file_name, new_file_name) {
+    echo 'Saving python test results for ' + new_file_name
+    // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage
+    sh 'cp ' + original_file_name + ' ' + new_file_name
+    archiveArtifacts artifacts: new_file_name
+}
+
+def collect_test_results_windows(original_file_name, new_file_name) {
+    echo 'Saving python test results for ' + new_file_name
+    // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage
+    bat 'xcopy ' + original_file_name + ' ' + new_file_name
+    archiveArtifacts artifacts: new_file_name
+} 
+
 def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
   def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
@@ -458,6 +472,16 @@ try {
           }
         }
       }
+    },
+    'Android / ARM64':{
+      node('mxnetlinux-cpu') {
+        ws('workspace/android64') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            docker_run('android_arm64', 'build_android_arm64', false)
+          }
+        }
+      }
     }
   } // End of stage('Build')
 
@@ -465,36 +489,55 @@ try {
     parallel 'Python2: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python2-cpu') {
-          init_git()
-          unpack_lib('cpu')
-          python2_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('cpu')
+            python2_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python3: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python3-cpu') {
-          init_git()
-          unpack_lib('cpu')
-          python3_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('cpu')
+            python3_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python2: GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python2-gpu') {
-          init_git()
-          unpack_lib('gpu', mx_lib)
-          python2_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('gpu', mx_lib)
+            python2_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml')
+          }
         }
       }
     },
     'Python3: GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python3-gpu') {
-          init_git()
-          unpack_lib('gpu', mx_lib)
-          python3_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('gpu', mx_lib)
+            python3_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml')
+          }
         }
       }
     },
@@ -502,9 +545,13 @@ try {
       node('mxnetlinux-gpu-p3') {
         ws('workspace/ut-python2-quantize-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
+            try {
+              init_git()
+              unpack_lib('gpu', mx_lib)
+              docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml')
+            }
           }
         }
       }
@@ -513,9 +560,13 @@ try {
       node('mxnetlinux-gpu-p3') {
         ws('workspace/ut-python3-quantize-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
+            try {
+              init_git()
+              unpack_lib('gpu', mx_lib)
+              docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml')
+            }
           }
         }
       }
@@ -523,36 +574,55 @@ try {
     'Python2: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python2-mkldnn-cpu') {
-          init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
-          python2_ut('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+            python2_ut('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml')
+            collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml')
+          }
         }
       }
     },
     'Python2: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python2-mkldnn-gpu') {
-          init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
-          python2_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+            python2_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml')
+          }
         }
       }
     },
     'Python3: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-python3-mkldnn-cpu') {
-          init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
-          python3_ut_mkldnn('ubuntu_cpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+            python3_ut_mkldnn('ubuntu_cpu')
+          } finally {
+            collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml')
+            collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml')
+          }
         }
       }
     },
     'Python3: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
         ws('workspace/ut-python3-mkldnn-gpu') {
-          init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
-          python3_gpu_ut('ubuntu_gpu')
+          try {
+            init_git()
+            unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+            python3_gpu_ut('ubuntu_gpu')
+          } finally {
+            collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml')
+          }
         }
       }
     },
@@ -560,9 +630,14 @@ try {
       node('mxnetlinux-cpu') {
         ws('workspace/build-centos7-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('centos7_cpu')
-            docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
+            try {
+              init_git()
+              unpack_lib('centos7_cpu')
+              docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
+            } finally {
+              collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml')
+              collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml')
+            }
           }
         }
       }
@@ -571,9 +646,13 @@ try {
       node('mxnetlinux-gpu') {
         ws('workspace/build-centos7-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            init_git()
-            unpack_lib('centos7_gpu')
-            docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
+            try {
+              init_git()
+              unpack_lib('centos7_gpu')
+              docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
+            } finally {
+              collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml')
+            }
           }
         }
       }
@@ -671,16 +750,21 @@ try {
       node('mxnetwindows-cpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-              7z x -y vc14_cpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py2
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-              C:\\mxnet\\test_cpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_cpu'
+              bat '''rmdir /s/q pkg_vc14_cpu
+                7z x -y vc14_cpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py2
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+                C:\\mxnet\\test_cpu.bat"""
+            } finally {
+              // We are unable to modify test_cpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python2_cpu.xml')
+            }
           }
         }
       }
@@ -689,16 +773,21 @@ try {
       node('mxnetwindows-cpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-              7z x -y vc14_cpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py3
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-              C:\\mxnet\\test_cpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_cpu'
+              bat '''rmdir /s/q pkg_vc14_cpu
+                7z x -y vc14_cpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+                C:\\mxnet\\test_cpu.bat"""
+            } finally {
+              // We are unable to modify test_cpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_cpu.xml')
+            }
           }
         }
       }
@@ -707,16 +796,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-            init_git_win()
-            unstash 'vc14_gpu'
-            bat '''rmdir /s/q pkg_vc14_gpu
-              7z x -y vc14_gpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-              xcopy C:\\mxnet\\model model /E /I /Y
-              call activate py2
-              set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-              del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-              C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu'
+              bat '''rmdir /s/q pkg_vc14_gpu
+                7z x -y vc14_gpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py2
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python2_gpu.xml')
+            }
           }
         }
       }
@@ -725,16 +819,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-          init_git_win()
-          unstash 'vc14_gpu'
-          bat '''rmdir /s/q pkg_vc14_gpu
-            7z x -y vc14_gpu.7z'''
-          bat """xcopy C:\\mxnet\\data data /E /I /Y
-            xcopy C:\\mxnet\\model model /E /I /Y
-            call activate py3
-            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-            C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu'
+              bat '''rmdir /s/q pkg_vc14_gpu
+                7z x -y vc14_gpu.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_gpu.xml')
+            }
           }
         }
       }
@@ -743,16 +842,21 @@ try {
       node('mxnetwindows-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           ws('workspace/ut-python-gpu') {
-          init_git_win()
-          unstash 'vc14_gpu_mkldnn'
-          bat '''rmdir /s/q pkg_vc14_gpu_mkldnn
-            7z x -y vc14_gpu_mkldnn.7z'''
-          bat """xcopy C:\\mxnet\\data data /E /I /Y
-            xcopy C:\\mxnet\\model model /E /I /Y
-            call activate py3
-            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python
-            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc
-            C:\\mxnet\\test_gpu.bat"""
+            try {
+              init_git_win()
+              unstash 'vc14_gpu_mkldnn'
+              bat '''rmdir /s/q pkg_vc14_gpu_mkldnn
+                7z x -y vc14_gpu_mkldnn.7z'''
+              bat """xcopy C:\\mxnet\\data data /E /I /Y
+                xcopy C:\\mxnet\\model model /E /I /Y
+                call activate py3
+                set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python
+                del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc
+                C:\\mxnet\\test_gpu.bat"""
+            } finally {
+              // We are unable to modify test_gpu.bat, so we can't track test failures on Windows
+              // collect_test_results_windows('nosetests.xml', 'nosetests_windows_python3_mkldnn_gpu.xml')
+            }
           }
         }
       }
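
Every test stage above follows the same shape: run the tests, then archive the nosetests XML in a finally block so reports survive a failing run, renamed per stage because STAGE_NAME is unavailable inside parallel stages. The same pattern in Python, with hypothetical file names:

    import shutil
    import subprocess

    def run_and_collect(test_cmd, result_file, archived_name):
        # Always preserve the XML report, even when the test command fails.
        try:
            subprocess.check_call(test_cmd)
        finally:
            # Rename so parallel stages writing the same report don't collide.
            shutil.copy(result_file, archived_name)

    run_and_collect(
        ['nosetests-3.4', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml',
         'tests/python/unittest'],
        'nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml')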
diff --git a/Makefile b/Makefile
index ff4446ab80c..67aaa7cf707 100644
--- a/Makefile
+++ b/Makefile
@@ -155,6 +155,7 @@ endif
 # issue covered with this
 #   -  for Ubuntu 14.04 or lower, lapack is not automatically installed with openblas
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
+#   -  for rhel7.2, installing the package `lapack-static` via yum will dismiss this warning
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
 ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
@@ -222,7 +223,10 @@ ifeq ($(USE_GPERFTOOLS), 1)
 		ifeq (,$(FIND_LIBFILE))
 			FIND_LIBFILE=$(wildcard /usr/local/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
 			ifeq (,$(FIND_LIBFILE))
-				USE_GPERFTOOLS=0
+				FIND_LIBFILE=$(wildcard /usr/lib64/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
+				ifeq (,$(FIND_LIBFILE))
+					USE_GPERFTOOLS=0
+				endif
 			endif
 		endif
 	endif
@@ -245,7 +249,10 @@ ifneq ($(USE_GPERFTOOLS), 1)
 				ifeq (,$(FIND_LIBFILE))
 					FIND_LIBFILE=$(wildcard /usr/lib/x86_64-linux-gnu/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
 					ifeq (,$(FIND_LIBFILE))
-						USE_JEMALLOC=0
+						FIND_LIBFILE=$(wildcard /usr/lib64/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT))
+						ifeq (,$(FIND_LIBFILE))
+							USE_JEMALLOC=0
+						endif
 					endif
 				endif
 			endif
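
The new /usr/lib64 probes extend a first-match search over candidate library directories; the logic the nested ifeq blocks implement can be sketched in Python as:

    import glob

    def find_lib(name, ext='so'):
        # Probe a fixed list of directories; the first hit wins.
        for d in ('/usr/lib', '/usr/local/lib',
                  '/usr/lib/x86_64-linux-gnu', '/usr/lib64'):
            hits = glob.glob('%s/lib%s.%s' % (d, name, ext))
            if hits:
                return hits[0]
        return None  # not found: the build silently sets e.g. USE_JEMALLOC=0

    print(find_lib('jemalloc'))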
diff --git a/ci/build.py b/ci/build.py
index 4473f54210a..680f4574fd8 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -34,12 +34,13 @@
 import subprocess
 import sys
 import tempfile
+import platform
 from copy import deepcopy
 from itertools import chain
 from subprocess import call, check_call
 from typing import *
 
-CCACHE_MAXSIZE = '10G'
+CCACHE_MAXSIZE = '500G'
 
 def get_platforms(path: Optional[str] = "docker"):
     """Get a list of architectures given our dockerfiles"""
@@ -121,11 +122,19 @@ def buildir() -> str:
 
 
 def default_ccache_dir() -> str:
+    # Share ccache across containers
     if 'CCACHE_DIR' in os.environ:
-        ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
+        try:
+            ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
+            os.makedirs(ccache_dir, exist_ok=True)
+            return ccache_dir
+        except PermissionError:
+            logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
+    # In osx tmpdir is not mountable by default
+    if platform.system() == 'Darwin':
+        ccache_dir = "/tmp/_mxnet_ccache"
         os.makedirs(ccache_dir, exist_ok=True)
-        return ccache_dirpython
-    # Share ccache across containers
+        return ccache_dir
     return os.path.join(tempfile.gettempdir(), "ci_ccache")
 
 
@@ -151,7 +160,9 @@ def container_run(platform: str,
                '-v', "{}:/work/ccache".format(local_ccache_dir),
                '-u', '{}:{}'.format(os.getuid(), os.getgid()),
                '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
+               '-e', 'CCACHE_TEMPDIR=/tmp/ccache',  # temp dir should be local and not shared
                '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the container as /work/ccache is mounted
+               '-e', "CCACHE_LOGFILE=/tmp/ccache.log",  # a container-scoped log, useful for ccache verification.
                tag]
     runlist.extend(command)
     cmd = ' '.join(runlist)
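
Condensed, the reworked default_ccache_dir prefers an explicit $CCACHE_DIR (creating it if permitted), falls back to a fixed path on macOS, where the default tmpdir cannot be mounted into Docker, and otherwise shares a directory under the system tmpdir. A standalone sketch of the same flow:

    import os
    import platform
    import tempfile

    def default_ccache_dir():
        # Prefer an explicit CCACHE_DIR so the cache is shared across containers.
        if 'CCACHE_DIR' in os.environ:
            ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
            try:
                os.makedirs(ccache_dir, exist_ok=True)
                return ccache_dir
            except PermissionError:
                pass  # fall through to a local temp directory
        # On macOS the default tmpdir is not mountable by Docker.
        if platform.system() == 'Darwin':
            ccache_dir = '/tmp/_mxnet_ccache'
            os.makedirs(ccache_dir, exist_ok=True)
            return ccache_dir
        return os.path.join(tempfile.gettempdir(), 'ci_ccache')

This also shows the bug the hunk removes: the old code returned ccache_dirpython, an undefined name, whenever CCACHE_DIR was set.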
diff --git a/ci/docker/Dockerfile.build.android_arm64 b/ci/docker/Dockerfile.build.android_arm64
index 4bd4fd30922..427cc0368db 100755
--- a/ci/docker/Dockerfile.build.android_arm64
+++ b/ci/docker/Dockerfile.build.android_arm64
@@ -18,25 +18,19 @@
 #
 # Dockerfile to build MXNet for Android ARM64/ARMv8
 
-FROM ubuntu:16.04 as ccachebuilder
-
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
-
 FROM dockcross/base:latest
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
-
-# The cross-compiling emulator
 RUN apt-get update && apt-get install -y \
-  qemu-user \
-  qemu-user-static \
   unzip
 
+WORKDIR /work/deps
+
+# Build x86 dependencies.
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
+
+# Setup Android cross-compilation environment.
 ENV CROSS_TRIPLE=aarch64-linux-android
 ENV CROSS_ROOT=/usr/${CROSS_TRIPLE}
 ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \
@@ -61,11 +55,12 @@ LABEL org.label-schema.build-date=$BUILD_DATE \
       org.label-schema.schema-version="1.0"
 
 ENV ARCH aarch64
+ENV ANDROID_NDK_REVISION 15c
 
 ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang
 ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++
 
-WORKDIR /work/deps
+# Build ARM dependencies.
 COPY install/android_arm64_ndk.sh /work/
 RUN /work/android_arm64_ndk.sh
 COPY install/android_arm64_openblas.sh /work/
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index 70bb9fb0eae..859ea02fb36 100755
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -18,19 +18,9 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
-FROM ubuntu:16.04 as ccachebuilder
-
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
-
 FROM dockcross/base:latest
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
-
 # The cross-compiling emulator
 RUN apt-get update && apt-get install -y \
   qemu-user \
@@ -88,6 +78,12 @@ RUN git clone https://github.com/xianyi/OpenBLAS.git && \
 ENV OPENBLAS_ROOT /work/OpenBLAS
 ENV LIBRARY_PATH /work/OpenBLAS/lib/:/work/OpenBLAS/:$LIBRARY_PATH
 ENV CPLUS_INCLUDE_PATH /work/OpenBLAS/include/:/work/OpenBLAS/:$CPLUS_INCLUDE_PATH
+
+WORKDIR /work/deps
+
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
+
 WORKDIR /work
 
 ENV CC /usr/arm-linux-androideabi/bin/arm-linux-androideabi-clang
diff --git a/ci/docker/Dockerfile.build.arm64 b/ci/docker/Dockerfile.build.arm64
index a1f752bbf64..fd87bf0fa6c 100755
--- a/ci/docker/Dockerfile.build.arm64
+++ b/ci/docker/Dockerfile.build.arm64
@@ -18,32 +18,27 @@
 #
 # Dockerfile to build MXNet for ARM64/ARMv8
 
-FROM ubuntu:16.04 as ccachebuilder
-
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
-
 # Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
 #FROM dockcross/linux-arm64
 FROM mxnetci/dockcross-linux-arm64:05082018
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
-
 ENV ARCH aarch64
-ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 
 COPY runtime_functions.sh /work/
-WORKDIR /work/mxnet
+WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index c073992406f..156c57da3e1 100755
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -18,29 +18,25 @@
 #
 # Dockerfile to build MXNet for ARMv6
 
-FROM ubuntu:16.04 as ccachebuilder
-
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
-
 FROM dockcross/linux-armv6
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
-
 ENV ARCH armv6l
 ENV HOSTCC gcc
 ENV TARGET ARMV6
 
 WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    make PREFIX=$CROSS_ROOT install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
+
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 627486c0537..6316270f9cf 100755
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -18,25 +18,25 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
-FROM ubuntu:16.04 as ccachebuilder
+FROM dockcross/linux-armv7
 
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
+ENV ARCH armv7l
+ENV HOSTCC gcc
+ENV TARGET ARMV7
 
-FROM dockcross/linux-armv7
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
 
-ENV ARCH armv71
-ENV CC /usr/bin/arm-linux-gnueabihf-gcc
-ENV CXX /usr/bin/arm-linux-gnueabihf-g++
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
-RUN apt-get update && \
-    apt-get install -y libopenblas-dev:armhf && \
-    rm -rf /var/lib/apt/lists/*
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 
 COPY runtime_functions.sh /work/
-WORKDIR /work/build
+WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 8a8bb97aa15..9cacdc34b72 100755
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -22,36 +22,29 @@
 
 FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder
 
-FROM ubuntu:16.04 as ccachebuilder
-
-COPY install/ubuntu_core.sh /work/
-RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
 
 # Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
 # FROM dockcross/linux-arm64
 FROM mxnetci/dockcross-linux-arm64:05082018
 
-# extract ccache binary into latest context
-COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
-
 ENV ARCH aarch64
-ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
 
 ENV OpenBLAS_HOME=${CROSS_ROOT}
 ENV OpenBLAS_DIR=${CROSS_ROOT}
 
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
+
 # Setup CUDA build env (including configuring and copying nvcc)
 COPY --from=cudabuilder /usr/local/cuda /usr/local/cuda
 ENV TARGET_ARCH aarch64
diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda
index a87651a4f5a..51b398c832c 100755
--- a/ci/docker/Dockerfile.build.ubuntu_build_cuda
+++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda
@@ -27,8 +27,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu
index f82ee75c482..57cf1e93e54 100755
--- a/ci/docker/Dockerfile.build.ubuntu_cpu
+++ b/ci/docker/Dockerfile.build.ubuntu_cpu
@@ -24,8 +24,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu
index 5f4bfc5a87d..1696c183e15 100755
--- a/ci/docker/Dockerfile.build.ubuntu_gpu
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu
@@ -24,8 +24,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
-COPY install/ubuntu_ccache.sh /work/
-RUN /work/ubuntu_ccache.sh
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/install/android_arm64_ndk.sh b/ci/docker/install/android_arm64_ndk.sh
index 23d9ea2c0e9..04023fbdebe 100755
--- a/ci/docker/install/android_arm64_ndk.sh
+++ b/ci/docker/install/android_arm64_ndk.sh
@@ -22,7 +22,8 @@
 
 set -ex
 pushd .
-export ANDROID_NDK_REVISION=15c
+# This environment variable comes from the Dockerfile.
+echo "Downloading Android NDK rev ${ANDROID_NDK_REVISION}"
 curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \
 unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \
 cd android-ndk-r${ANDROID_NDK_REVISION} && \
@@ -32,4 +33,4 @@ cd android-ndk-r${ANDROID_NDK_REVISION} && \
     --api 21 \
     --install-dir=${CROSS_ROOT} && \
 
-popd
\ No newline at end of file
+popd
diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/install/arm_openblas.sh
new file mode 100755
index 00000000000..fa2e5cae9cb
--- /dev/null
+++ b/ci/docker/install/arm_openblas.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git
+
+cd OpenBLAS
+make -j$(nproc)
+PREFIX=${CROSS_ROOT} make install
+
+cd ..
+
+rm -rf OpenBLAS
diff --git a/ci/docker/install/ubuntu_ccache.sh b/ci/docker/install/deb_ubuntu_ccache.sh
similarity index 85%
rename from ci/docker/install/ubuntu_ccache.sh
rename to ci/docker/install/deb_ubuntu_ccache.sh
index 4a6b63f0f72..31cd5c8b25a 100755
--- a/ci/docker/install/ubuntu_ccache.sh
+++ b/ci/docker/install/deb_ubuntu_ccache.sh
@@ -17,7 +17,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Script to build ccache for ubuntu based images
+# Script to build ccache for debian and ubuntu based images.
 
 set -ex
 
@@ -43,7 +43,9 @@ git clone --recursive -b v3.4.2 https://github.com/ccache/ccache.git
 cd ccache
 
 ./autogen.sh
-./configure
+# Manually specify x86 gcc versions so that this script remains compatible with dockcross (which uses an ARM based gcc
+# by default).
+CC=/usr/bin/gcc CXX=/usr/bin/g++ ./configure
 
 # Don't build documentation #11214
 #perl -pi -e 's!\s+\Q$(installcmd) -d $(DESTDIR)$(mandir)/man1\E!!g' Makefile
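
Pinning CC/CXX for configure is a plain environment override: dockcross exports an ARM cross-compiler by default, but ccache itself must be built for the x86 host. The equivalent invocation from Python, for illustration:

    import os
    import subprocess

    # Point configure at the host toolchain instead of the dockcross cross-compiler.
    env = dict(os.environ, CC='/usr/bin/gcc', CXX='/usr/bin/g++')
    subprocess.check_call(['./autogen.sh'])
    subprocess.check_call(['./configure'], env=env)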
diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/ubuntu_arm.sh
new file mode 100755
index 00000000000..becb012bd18
--- /dev/null
+++ b/ci/docker/install/ubuntu_arm.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+apt update
+apt install -y \
+    unzip
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 36e23879705..6cefeea9fbc 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -31,7 +31,6 @@ clean_repo() {
     git submodule update --init --recursive
 }
 
-# wrap compiler calls with ccache
 build_ccache_wrappers() {
     set -ex
 
@@ -63,34 +62,66 @@ build_ccache_wrappers() {
     export CXX=`pwd`/cxx
 }
 
-# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
-# function here with the same suffix:
+build_wheel() {
 
-build_jetson() {
     set -ex
     pushd .
 
-    build_ccache_wrappers
+    PYTHON_DIR=${1:-/work/mxnet/python}
+    BUILD_DIR=${2:-/work/build}
 
-    cp -f make/crosscompile.jetson.mk ./config.mk
+    # build
 
-    make -j$(nproc)
+    export MXNET_LIBRARY_PATH=${BUILD_DIR}/libmxnet.so
 
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
+    cd ${PYTHON_DIR}
     python setup.py bdist_wheel --universal
 
+    # repackage
+
     # Fix pathing issues in the wheel.  We need to move libmxnet.so from the data folder to the
     # mxnet folder, then repackage the wheel.
     WHEEL=`readlink -f dist/*.whl`
     TMPDIR=`mktemp -d`
-    unzip -d $TMPDIR $WHEEL
-    rm $WHEEL
-    cd $TMPDIR
+    unzip -d ${TMPDIR} ${WHEEL}
+    rm ${WHEEL}
+    cd ${TMPDIR}
     mv *.data/data/mxnet/libmxnet.so mxnet
-    zip -r $WHEEL .
-    cp $WHEEL /work/build
-    rm -rf $TMPDIR
+    zip -r ${WHEEL} .
+    cp ${WHEEL} ${BUILD_DIR}
+    rm -rf ${TMPDIR}
+
+    popd
+}
+
+# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
+# function here with the same suffix:
+
+build_jetson() {
+    set -ex
+    pushd .
+
+    cp make/crosscompile.jetson.mk ./config.mk
+    make -j$(nproc)
+
+    build_wheel /work/mxnet/python /work/mxnet/lib
+    popd
+}
+
+report_ccache_usage() {
+    set -ex
+    pushd .
+
+    # Show global ccache summary at the end of each run.
+    ccache -s
+    if [ -e $CCACHE_LOGFILE ]
+    then
+        # Display local ccache log, excluding some overly verbose output.
+        cat $CCACHE_LOGFILE | grep -v "Config:" | grep -v "stats.lock"
+    else
+        echo "No ccache log found."
+    fi
+
     popd
 }
 
@@ -107,7 +138,7 @@ build_armv6() {
     # We do not need OpenMP, since most armv6 systems have only 1 core
 
     cmake \
-        -DCMAKE_TOOLCHAIN_FILE=$CROSS_ROOT/Toolchain.cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF \
@@ -120,11 +151,10 @@ build_armv6() {
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
-    ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+
+    ninja -v
+    report_ccache_usage
+    build_wheel
     popd
 }
 
@@ -132,21 +162,31 @@ build_armv7() {
     set -ex
     pushd .
     cd /work/build
+
+    # LAPACK functionality will be included and statically linked into OpenBLAS,
+    # but USE_LAPACK needs to be set to OFF, otherwise the main CMakeLists.txt
+    # file tries to add -llapack. LAPACK functionality, however, requires
+    # -lgfortran to be linked additionally.
+
     cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
+        -DCMAKE_CROSSCOMPILING=ON \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DUSE_CUDA=OFF\
-        -DUSE_OPENCV=OFF\
-        -DUSE_OPENMP=OFF\
-        -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
-        -DUSE_MKL_IF_AVAILABLE=OFF\
+        -DUSE_CUDA=OFF \
+        -DUSE_OPENCV=OFF \
+        -DUSE_OPENMP=ON \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_LAPACK=OFF \
+        -DBUILD_CPP_EXAMPLES=OFF \
+        -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
-    ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+
+    ninja -v
+    report_ccache_usage
+    build_wheel
     popd
 }
 
@@ -164,7 +204,8 @@ build_amzn_linux_cpu() {
         -DUSE_LAPACK=OFF\
         -DUSE_DIST_KVSTORE=ON\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
 }
 
@@ -173,20 +214,22 @@ build_arm64() {
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
+        -DCMAKE_BUILD_TYPE=Release\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
     cp dist/*.whl /work/build
 }
 
-build_android_arm64() {
+build_android_armv7() {
     set -ex
     cd /work/build
     cmake \
@@ -194,6 +237,7 @@ build_android_arm64() {
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF\
         -DUSE_SSE=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_LAPACK=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
@@ -201,18 +245,44 @@ build_android_arm64() {
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
+    report_ccache_usage
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
     cp dist/*.whl /work/build
 }
 
+build_android_arm64() {
+    set -ex
+    cd /work/build
+# There are other ways for CMake to cross-compile for Android, such as setting the
+# variables below. But right now they don't work as expected, so we need to find the
+# best strategy for building with CMake on Android.
+#        -DCMAKE_ANDROID_NDK=${CROSS_ROOT} \
+#        -DCMAKE_SYSTEM_VERSION=${ANDROID_NDK_REVISION} \
+#        -DCMAKE_SYSTEM_NAME=Android \
+#
+    cmake\
+        -DANDROID=ON \
+        -DUSE_CUDA=OFF\
+        -DUSE_SSE=OFF\
+        -DUSE_LAPACK=OFF\
+        -DUSE_OPENCV=OFF\
+        -DUSE_OPENMP=OFF\
+        -DUSE_SIGNAL_HANDLER=ON\
+        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
+        -DUSE_MKL_IF_AVAILABLE=OFF\
+        -G Ninja /work/mxnet
+    ninja -v
+}
+
 build_centos7_cpu() {
     set -ex
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
+
     make \
         DEV=1 \
         USE_LAPACK=1 \
@@ -220,6 +290,8 @@ build_centos7_cpu() {
         USE_BLAS=openblas \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_centos7_mkldnn() {
@@ -227,6 +299,7 @@ build_centos7_mkldnn() {
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
+
     make \
         DEV=1 \
         USE_LAPACK=1 \
@@ -234,6 +307,8 @@ build_centos7_mkldnn() {
         USE_MKLDNN=1 \
         USE_BLAS=openblas \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_centos7_gpu() {
@@ -267,6 +342,8 @@ build_ubuntu_cpu_openblas() {
         USE_BLAS=openblas             \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang39() {
@@ -283,6 +360,8 @@ build_ubuntu_cpu_clang39() {
         USE_OPENMP=0                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang50() {
@@ -299,6 +378,8 @@ build_ubuntu_cpu_clang50() {
         USE_OPENMP=1                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang39_mkldnn() {
@@ -315,6 +396,8 @@ build_ubuntu_cpu_clang39_mkldnn() {
         USE_MKLDNN=1                  \
         USE_OPENMP=0                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_clang50_mkldnn() {
@@ -331,17 +414,23 @@ build_ubuntu_cpu_clang50_mkldnn() {
         USE_MKLDNN=1                  \
         USE_OPENMP=1                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_cpu_mkldnn() {
     set -ex
+
     build_ccache_wrappers
+
     make  \
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
         USE_MKLDNN=1                  \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_gpu() {
@@ -350,7 +439,9 @@ build_ubuntu_gpu() {
 
 build_ubuntu_gpu_mkldnn() {
     set -ex
+
     build_ccache_wrappers
+
     make  \
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
@@ -360,6 +451,8 @@ build_ubuntu_gpu_mkldnn() {
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1                   \
         -j$(nproc)
+
+    report_ccache_usage
 }
 
 build_ubuntu_gpu_cuda91_cudnn7() {
@@ -406,6 +499,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    report_ccache_usage
     # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
     cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
     mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
@@ -427,6 +521,7 @@ build_ubuntu_gpu_cmake() {
         /work/mxnet
 
     ninja -v
+    report_ccache_usage
 }
 
 
@@ -448,9 +543,9 @@ unittest_ubuntu_python2_cpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/unittest
-    nosetests-2.7 --verbose tests/python/train
-    nosetests-2.7 --verbose tests/python/quantization
+    nosetests-2.7 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-2.7 --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
+    nosetests-2.7 --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu() {
@@ -460,20 +555,19 @@ unittest_ubuntu_python3_cpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/unittest
-    nosetests-3.4 --verbose tests/python/quantization
+    nosetests-3.4 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-3.4 --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu_mkldnn() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/unittest
-    nosetests-3.4 --verbose tests/python/quantization
-    nosetests-3.4 --verbose tests/python/mkl
+    nosetests-3.4 --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    nosetests-3.4 --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl
 }
 
 unittest_ubuntu_python2_gpu() {
@@ -483,7 +577,7 @@ unittest_ubuntu_python2_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/gpu
+    nosetests-2.7 --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 tutorialtest_ubuntu_python3_gpu() {
@@ -494,7 +588,8 @@ tutorialtest_ubuntu_python3_gpu() {
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python3
-    cd /work/mxnet/tests/tutorials && nosetests-3.4 test_tutorials.py --nologcapture
+    cd /work/mxnet/tests/tutorials
+    nosetests-3.4 --with-xunit --xunit-file nosetests_tutorials.xml test_tutorials.py --nologcapture
 }
 
 tutorialtest_ubuntu_python2_gpu() {
@@ -505,7 +600,8 @@ tutorialtest_ubuntu_python2_gpu() {
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python2
-    cd /work/mxnet/tests/tutorials && nosetests-3.4 test_tutorials.py --nologcapture
+    cd /work/mxnet/tests/tutorials
+    nosetests-3.4 --with-xunit --xunit-file nosetests_tutorials.xml test_tutorials.py --nologcapture
 }
 
 unittest_ubuntu_python3_gpu() {
@@ -515,7 +611,7 @@ unittest_ubuntu_python3_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/gpu
+    nosetests-3.4 --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 # quantization gpu currently only runs on P3 instances
@@ -527,7 +623,7 @@ unittest_ubuntu_python2_quantization_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-2.7 --verbose tests/python/quantization_gpu
+    nosetests-2.7 --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
 }
 
 # quantization gpu currently only runs on P3 instances
@@ -539,7 +635,7 @@ unittest_ubuntu_python3_quantization_gpu() {
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    nosetests-3.4 --verbose tests/python/quantization_gpu
+    nosetests-3.4 --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
 }
 
 unittest_ubuntu_cpu_scala() {
@@ -585,14 +681,14 @@ unittest_ubuntu_gpu_R() {
 unittest_centos7_cpu() {
     set -ex
     cd /work/mxnet
-    python3.6 -m "nose" --with-timer --verbose tests/python/unittest
-    python3.6 -m "nose" --with-timer --verbose tests/python/train
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
 }
 
 unittest_centos7_gpu() {
     set -ex
     cd /work/mxnet
-    python3.6 -m "nose" --with-timer --verbose tests/python/gpu
+    python3.6 -m "nose" --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
 
 integrationtest_ubuntu_cpu_onnx() {
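
The payoff of adding --with-xunit to every nosetests invocation above is that each run leaves a JUnit-style XML report that CI (or any script) can parse. A small consumer sketch, assuming one of the archived files is on disk:

    import xml.etree.ElementTree as ET

    # nose's xunit plugin writes a <testsuite> root carrying aggregate counters.
    suite = ET.parse('nosetests_python3_cpu_unittest.xml').getroot()
    print('tests=%s failures=%s errors=%s skipped=%s' % (
        suite.get('tests'), suite.get('failures'),
        suite.get('errors'), suite.get('skip')))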
diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py
index fa8833fb002..3f471db5e7a 100644
--- a/ci/test_docker_cache.py
+++ b/ci/test_docker_cache.py
@@ -28,6 +28,7 @@
 import logging
 import subprocess
 import sys
+from unittest.mock import MagicMock
 
 sys.path.append(os.path.dirname(__file__))
 import docker_cache
@@ -87,6 +88,8 @@ def setUp(self):
         base = os.path.split(os.path.realpath(__file__))[0]
         os.chdir(base)
 
+        docker_cache._login_dockerhub = MagicMock()  # Override login
+
         # Stop in case previous execution was dirty
         try:
             self._stop_local_docker_registry()
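
Replacing _login_dockerhub with a MagicMock keeps the test from touching the network while still recording that, and how, the call happened. The pattern in isolation:

    from unittest.mock import MagicMock

    class Registry:
        def login(self):
            raise RuntimeError('would hit the network')

    registry = Registry()
    registry.login = MagicMock()   # override the real, network-touching method
    registry.login()               # now a no-op that records the call
    registry.login.assert_called_once()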
diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py
index 0c000d9955f..8facde16840 100644
--- a/cpp-package/scripts/OpWrapperGenerator.py
+++ b/cpp-package/scripts/OpWrapperGenerator.py
@@ -77,6 +77,7 @@ def GetConvertEnumVariableToString(self, variable=''):
 
 class Arg:
     typeDict = {'boolean':'bool',\
+        'boolean or None':'dmlc::optional<bool>',\
         'Shape(tuple)':'Shape',\
         'Symbol':'Symbol',\
         'NDArray':'Symbol',\
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index d2d44b07848..a1e40fe6c30 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -1,5 +1,5 @@
 var versionSelect   = defaultVersion = 'v1.2.0';
-var deviceSelect    = 'Linux';
+var platformSelect    = 'Linux';
 var languageSelect  = 'Python';
 var processorSelect = 'CPU';
 var environSelect   = 'Pip';
@@ -16,10 +16,10 @@ $(document).ready(function () {
         $('li a:contains(' + versionSelect + ')').parent().siblings().removeClass('active');
         $('li a:contains(' + versionSelect + ')').parent().addClass('active');
         $('.current-version').html( versionSelect + ' <span class="caret"></span></button>' );
-        if (urlParams.get('device'))
-            deviceSelect = urlParams.get('device');
-        $('button:contains(' + deviceSelect + ')').siblings().removeClass('active');
-        $('button:contains(' + deviceSelect + ')').addClass('active');
+        if (urlParams.get('platform'))
+            platformSelect = urlParams.get('platform');
+        $('button:contains(' + platformSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + platformSelect + ')').addClass('active');
         if (urlParams.get('language'))
             languageSelect = urlParams.get('language');
         $('button:contains(' + languageSelect + ')').siblings().removeClass('active');
@@ -35,9 +35,9 @@ $(document).ready(function () {
         showContent();
         if (window.location.href.includes("/install/index.html")) {
             if (versionSelect.includes(defaultVersion)) {
-                history.pushState(null, null, '/install/index.html?device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, '/install/index.html?platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             } else {
-                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             }
         } 
     }
@@ -71,8 +71,8 @@ $(document).ready(function () {
                   history.pushState(null, null, '/install/index.html' + window.location.search.replace( 'version', 'prev' ));
               }
         }
-        else if ($(this).hasClass("Devices")) {
-            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('device'), $(this).text() ));
+        else if ($(this).hasClass("platforms")) {
+            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('platform'), $(this).text() ));
         }
         else if ($(this).hasClass("languages")) {
             history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('language'), $(this).text() ));
diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 3028342abd5..d72582cc1c7 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -102,11 +102,17 @@
        must come *after* these tags. #}
     {{ metatags }}
     {%- block htmltitle %}
-    {%- if pagename != 'index' and 'no title' not in title%}
+    <meta property="og:title"              content="{{ title }}" />
+    <meta property="og:image"              content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" />
+    <meta property="og:image:secure_url"              content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" />
+    {%- if pagename != 'index' and 'no title' not in title %}
+    <meta property="og:description"        content="{{ title }}" />
     <title>{{ title|striptags|e }}{{ titlesuffix }}</title>
     {%- elif pagename == 'index' %}
+    <meta property="og:description"        content="A flexible and efficient library for deep learning." />
     <title>MXNet: A Scalable Deep Learning Framework</title>
     {%- else %}
+    <meta property="og:description"        content="A flexible and efficient library for deep learning." />
     <title>{{ pagename.split('/')[0]|capitalize }}{{ titlesuffix }}</title>
     {%- endif %}
     {%- endblock %}
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index f29301dec7a..12a898aadc2 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -14,7 +14,7 @@ export MXNET_GPU_WORKER_NTHREADS=3
   - Values: Int ```(default=2)```
   - The maximum number of threads to use on each GPU. This parameter is used to parallelize the computation within a single GPU card.
 * MXNET_GPU_COPY_NTHREADS
-  - Values: Int ```(default=1)```
+  - Values: Int ```(default=2)```
   - The maximum number of concurrent threads that do the memory copy job on each GPU.
 * MXNET_CPU_WORKER_NTHREADS
   - Values: Int ```(default=1)```
diff --git a/docs/install/index.md b/docs/install/index.md
index 4b966b62067..a35cf941501 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -20,11 +20,11 @@ Indicate your preferred configuration. Then, follow the customized commands to i
 <!-- START - OS Menu -->
 
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active Devices">Linux</button>
-  <button type="button" class="btn btn-default opt Devices">MacOS</button>
-  <button type="button" class="btn btn-default opt Devices">Windows</button>
-  <button type="button" class="btn btn-default opt Devices">Cloud</button>
-  <button type="button" class="btn btn-default opt Devices">Devices</button>
+  <button type="button" class="btn btn-default opt active platforms">Linux</button>
+  <button type="button" class="btn btn-default opt platforms">MacOS</button>
+  <button type="button" class="btn btn-default opt platforms">Windows</button>
+  <button type="button" class="btn btn-default opt platforms">Cloud</button>
+  <button type="button" class="btn btn-default opt platforms">Devices</button>
 </div>
 
 <!-- START - Language Menu -->
@@ -513,12 +513,23 @@ $ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
 
 <div class="v1-2-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
 
 ```bash
-$ pip install mxnet-cu90
+nvcc --version
 ```
 
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+```bash
+$ pip install mxnet-cu92
+```
+
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
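+As a quick sanity check (an illustrative sketch, not part of the official steps), you can verify that the package imports and that a simple computation runs on the GPU:
+
+```python
+import mxnet as mx
+print(mx.__version__)
+a = mx.nd.ones((2, 3), ctx=mx.gpu())
+print((a * 2).asnumpy())   # fails here if the CUDA runtime is mismatched
+```
+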
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -537,12 +548,23 @@ $ pip install mxnet-cu90mkl
 
 <div class="v1-1-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.1
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
 
 ```bash
-$ pip install mxnet-cu90==1.1.0
+nvcc --version
 ```
 
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+```bash
+$ pip install mxnet-cu91==1.1.0
+```
+
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -551,11 +573,13 @@ pip install graphviz
 
 **Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
+**Experimental Choice** If You would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
 ```bash
-$ pip install mxnet-cu90mkl==1.1.0
+$ pip install mxnet-cu91mkl==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 
@@ -621,10 +645,10 @@ $ pip install mxnet-cu90mkl==0.12.0
 
 <div class="v0-11-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 8.0
 
 ```bash
-$ pip install mxnet-cu90==0.11.0
+$ pip install mxnet-cu80==0.11.0
 ```
 
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -635,9 +659,9 @@ pip install graphviz
 
 **Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
+**Experimental Choice** If You would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
 ```bash
-$ pip install mxnet-cu90mkl==0.11.0
+$ pip install mxnet-cu80mkl==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
@@ -687,23 +711,45 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 
 <div class="v1-2-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.2:
 
 ```bash
-(mxnet)$ pip install mxnet-cu90
+(mxnet)$ pip install mxnet-cu92
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-2-0-->
 
 
 <div class="v1-1-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.1:
 
 ```bash
-(mxnet)$ pip install mxnet-cu90==1.1.0
+(mxnet)$ pip install mxnet-cu91==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 
@@ -714,6 +760,7 @@ Install *MXNet* with GPU support using CUDA 9.0.
 ```bash
 (mxnet)$ pip install mxnet-cu90==1.0.0
 ```
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
 
 </div> <!-- End of v1-0-0-->
 
@@ -726,33 +773,40 @@ Install *MXNet* with GPU support using CUDA 9.0.
 (mxnet)$ pip install mxnet-cu90==0.12.1
 ```
 
-For *MXNet* 0.12.0 with GPU support using CUDA 9.0.
-
-```bash
-(mxnet)$ pip install mxnet-cu90==0.12.0
-``` 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
 
 </div> <!-- End of v0-12-1-->
 
 
 <div class="v0-11-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+Install *MXNet* with GPU support using CUDA 8.0.
 
 ```bash
-(mxnet)$ pip install mxnet-cu90==0.11.0
+(mxnet)$ pip install mxnet-cu80==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-Install *MXNet* with GPU support using CUDA 9.0.
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+
+Install *MXNet* with GPU support using CUDA 9.2.
 
 ```bash
-(mxnet)$ pip install mxnet-cu90 --pre
+(mxnet)$ pip install mxnet-cu92 --pre
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of master-->
 
 **Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -1572,22 +1626,44 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 
 <div class="v1-2-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90
+$ pip install mxnet-cu92
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-2-0-->
 
 <div class="v1-1-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.1.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90==1.1.0
+$ pip install mxnet-cu91==1.1.0
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of v1-1-0-->
 
 <div class="v1-0-0">
@@ -1618,22 +1694,33 @@ $ pip install mxnet-cu90==0.12.0
 
 <div class="v0-11-0">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 8.0.
 
 ```bash
-$ pip install mxnet-cu90==0.11.0
+$ pip install mxnet-cu80==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
+
+**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
+Check your CUDA version with the following command:
+
+```bash
+nvcc --version
+```
+
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
 
 ```bash
-$ pip install mxnet-cu90 --pre
+$ pip install mxnet-cu92 --pre
 ```
 
+Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+
 </div> <!-- End of master-->
 
 Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for status on CUDA 9.1 support.
@@ -1644,7 +1731,7 @@ Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for stat
 
 We provide both options to build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), and [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/).
 
-**Option 1** 
+**Option 1**
 
 To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies:
 
@@ -1678,7 +1765,7 @@ git clone https://github.com/apache/incubator-mxnet.git --recursive
 "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11
 ```
 
-5. Create a build dir using the following command and go to the directory, for example: 
+5. Create a build dir using the following command and go to the directory, for example:
 
 ```r
 mkdir C:\build
@@ -1699,11 +1786,11 @@ NOTE: make sure the DCUDNN_INCLUDE and DCUDNN_LIBRARY pointing to the “include
 msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
 ```
 
-**Option 2** 
+**Option 2**
 
 To build and install MXNet yourself using [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/), you need the following dependencies. Install the required dependencies:
 
-1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition.
+1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it; the free Community edition is sufficient. At least Update 3 of Microsoft Visual Studio 2015 is required to build MXNet from source. Upgrade via its ```Tools -> Extensions and Updates... | Product Updates``` menu.
 2. Download and install [CMake](https://cmake.org/) if it is not already installed.
 3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
 4. Unzip the OpenCV package.
@@ -1711,10 +1798,12 @@ To build and install MXNet yourself using [Microsoft Visual Studio 2015](https:/
 6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](http://sourceforge.net/projects/openblas/files/v0.2.14/).
 7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```.
 8. Download and install [CUDA](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64) and [cuDNN](https://developer.nvidia.com/cudnn). To get access to the download link, register as an NVIDIA community user.
+9. Set the environment variable ```CUDACXX``` to point to the ```CUDA Compiler``` (for example, ```C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1\bin\nvcc.exe```).
+10. Set the environment variable ```CUDNN_ROOT``` to point to the ```cuDNN``` directory that contains the ```include```, ```lib``` and ```bin``` directories (for example, ```C:\Downloads\cudnn-9.1-windows7-x64-v7\cuda```).
 
 After you have installed all of the required dependencies, build the MXNet source code:
 
-1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet).
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet) (make sure you also fetch the third-party submodules, e.g. with ```git clone --recurse-submodules```).
 2. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build```.
 3. In Visual Studio, open the solution file,```.sln```, and compile it.
 These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
@@ -1778,7 +1867,7 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 <p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
 <br/>
 </div> <!-- End of cpu gpu -->
-</div> <!-- End of C++>
+</div> <!-- End of C++ -->
 </div> <!-- End of Windows -->
 
 
@@ -2402,4 +2491,3 @@ Will be available soon.
 # Source Download
 
 <a href="download.html">Download</a> your required version of MXNet.
-
diff --git a/docs/tutorials/gluon/hybrid.md b/docs/tutorials/gluon/hybrid.md
index 3554a15fa3b..f9f2c112f53 100644
--- a/docs/tutorials/gluon/hybrid.md
+++ b/docs/tutorials/gluon/hybrid.md
@@ -87,7 +87,7 @@ net(x)
 Hybrid execution can be activated by simply calling `.hybridize()` on the top
 level layer. The first forward call after activation will try to build a
 computation graph from `hybrid_forward` and cache it. On subsequent forward
-calls the cached graph instead of `hybrid_forward` will be invoked:
+calls the cached graph, instead of `hybrid_forward`, will be invoked:
 
 ```python
 net.hybridize()
@@ -105,23 +105,35 @@ Hybridize will speed up execution and save memory. If the top level layer is
 not a `HybridBlock`, you can still call `.hybridize()` on it and Gluon will try
 to hybridize its child layers instead.
 
+`hybridize` also accepts several options for performance tuning. For example, you
+can do
+
+```python
+net.hybridize(static_alloc=True)
+# or
+net.hybridize(static_alloc=True, static_shape=True)
+```
+
+Please refer to the [API manual](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html?highlight=hybridize#mxnet.gluon.Block.hybridize)
+for details.
+
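+To see the effect, here is a minimal benchmarking sketch (the network and input shape below are made up for illustration) comparing eager and hybridized execution:
+
+```python
+import time
+from mxnet import gluon, nd
+
+net = gluon.nn.HybridSequential()
+with net.name_scope():
+    net.add(gluon.nn.Dense(256, activation='relu'))
+    net.add(gluon.nn.Dense(10))
+net.initialize()
+x = nd.random.uniform(shape=(64, 512))
+
+def bench():
+    start = time.time()
+    for _ in range(100):
+        net(x)
+    nd.waitall()  # wait for asynchronous execution to finish
+    return time.time() - start
+
+print('eager:  %.4f sec' % bench())
+net.hybridize(static_alloc=True)
+print('hybrid: %.4f sec' % bench())
+```
+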
 ## Serializing trained model for deployment
 
-Models implemented as `HybridBlock` can be easily serialized for deployment
-using other language front-ends like C, C++ and Scala. To this end, we simply
-forward the model with symbolic variables instead of NDArrays and save the
-output Symbol(s):
+Models implemented as `HybridBlock` can be easily serialized. The serialized
+model can be loaded back later or used for deployment
+with other language front-ends like C, C++ and Scala. To this end, we simply
+use `export` and `SymbolBlock.imports`:
 
 ```python
-x = mx.sym.var('data')
-y = net(x)
-print(y)
-y.save('model.json')
-net.save_params('model.params')
+net.export('model', epoch=1)
 ```
 
-If your network outputs more than one value, you can use `mx.sym.Group` to
-combine them into a grouped Symbol and then save. The saved json and params
-files can then be loaded with C, C++ and Scala interface for prediction.
+Two files `model-symbol.json` and `model-0001.params` are saved on disk.
+You can use other language bindings to load them. You can also load them back
+to gluon with `SymbolBlock`:
+
+```python
+net2 = gluon.SymbolBlock.imports('model-symbol.json', ['data'], 'model-0001.params')
+```
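+
+The re-imported `net2` behaves like any other Gluon block. For example (a sketch; the input shape here is made up and must match what the exported model expects):
+
+```python
+x = mx.nd.random.uniform(shape=(16, 512))  # hypothetical input shape
+print(net2(x))
+```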
 
 <!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/gluon/naming.md b/docs/tutorials/gluon/naming.md
index 37b63fa08a9..3606a03dcbd 100644
--- a/docs/tutorials/gluon/naming.md
+++ b/docs/tutorials/gluon/naming.md
@@ -203,12 +203,12 @@ except Exception as e:
     Parameter 'model1_dense0_weight' is missing in file 'model.params', which contains parameters: 'model0_mydense_weight', 'model0_dense1_bias', 'model0_dense1_weight', 'model0_dense0_weight', 'model0_dense0_bias', 'model0_mydense_bias'. Please make sure source and target networks have the same prefix.
 
 
-To solve this problem, we use `save_params`/`load_params` instead of `collect_params` and `save`/`load`. `save_params` uses model structure, instead of parameter name, to match parameters.
+To solve this problem, we use `save_parameters`/`load_parameters` instead of `collect_params` and `save`/`load`. `save_parameters` uses model structure, instead of parameter name, to match parameters.
 
 
 ```python
-model0.save_params('model.params')
-model1.load_params('model.params')
+model0.save_parameters('model.params')
+model1.load_parameters('model.params')
 print(mx.nd.load('model.params').keys())
 ```
 
diff --git a/docs/tutorials/gluon/save_load_params.md b/docs/tutorials/gluon/save_load_params.md
index cd876808a86..f5f48125cc1 100644
--- a/docs/tutorials/gluon/save_load_params.md
+++ b/docs/tutorials/gluon/save_load_params.md
@@ -10,7 +10,7 @@ Parameters of any Gluon model can be saved using the `save_params` and `load_par
 
 **2. Save/load model parameters AND architecture**
 
-The Model architecture of `Hybrid` models stays static and don't change during execution. Therefore both model parameters AND architecture can be saved and loaded using `export`, `load_checkpoint` and `load` methods.
+The model architecture of `Hybrid` models stays static and doesn't change during execution. Therefore, both model parameters AND architecture can be saved and loaded using the `export` and `imports` methods.
 
 Let's look at the above methods in more detail. Let's start by importing the modules we'll need.
 
@@ -61,7 +61,7 @@ def build_lenet(net):
         net.add(gluon.nn.Dense(512, activation="relu"))
         # Second fully connected layer with as many neurons as the number of classes
         net.add(gluon.nn.Dense(num_outputs))
-        
+
         return net
 
 # Train a given model using MNIST data
@@ -240,18 +240,10 @@ One of the main reasons to serialize model architecture into a JSON file is to l
 
 ### From Python
 
-Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and used inside Python frontend using `mx.model.load_checkpoint` and `gluon.nn.SymbolBlock`. To demonstrate that, let's load the network we serialized above.
+Serialized Hybrid networks (saved as .JSON and .params files) can be loaded and used inside the Python frontend using `gluon.nn.SymbolBlock`. To demonstrate that, let's load the network we serialized above.
 
 ```python
-# Load the network architecture and parameters
-sym = mx.sym.load('lenet-symbol.json')
-# Create a Gluon Block using the loaded network architecture.
-# 'inputs' parameter specifies the name of the symbol in the computation graph
-# that should be treated as input. 'data' is the default name used for input when
-# a model architecture is saved to a file.
-deserialized_net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('data'))
-# Load the parameters
-deserialized_net.collect_params().load('lenet-0001.params', ctx=ctx)
+deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params")
 ```
 
 `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works.
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index a970c0a52ef..39a5aa4e22d 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -38,10 +38,11 @@ Select API:&nbsp;
     * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
 * Practitioner Guides
     * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
-    * [Checkpointing and Model Serialization (a.k.a. saving and loading)](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/> ([Alternative](/tutorials/gluon/save_load_params.html))
+    * [Checkpointing and Model Serialization (a.k.a. saving and loading)](/tutorials/gluon/save_load_params.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/> ([Alternative](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html))
     * [Inference using an ONNX model](/tutorials/onnx/inference_on_onnx_model.html)
     * [Fine-tuning an ONNX model on Gluon](/tutorials/onnx/fine_tuning_gluon.html)
     * [Visualizing Decisions of Convolutional Neural Networks](/tutorials/vision/cnn_visualization.html)
+    * [Profiling MXNet Models](/tutorials/python/profiler.html)
 * API Guides
     * Core APIs
         * NDArray
diff --git a/docs/tutorials/python/profiler.md b/docs/tutorials/python/profiler.md
new file mode 100644
index 00000000000..46fe23bbafe
--- /dev/null
+++ b/docs/tutorials/python/profiler.md
@@ -0,0 +1,205 @@
+# Profiling MXNet Models
+
+It is often helpful to understand how much time each operation takes while running a model. This helps you optimize the model to run faster. In this tutorial, we will learn how to profile MXNet models to measure their running time and memory consumption using the MXNet profiler.
+
+## The incorrect way to profile
+
+If you have just begun using MXNet, you might be tempted to measure the execution time of your model using Python's `time` module, as shown below:
+
+```python
+from time import time
+from mxnet import autograd, nd
+import mxnet as mx
+
+start = time()
+x = nd.random_uniform(shape=(2000,2000))
+y = nd.dot(x, x)
+print('Time for matrix multiplication: %f sec\n' % (time() - start))
+
+start = time()
+print(y.asnumpy())
+print('Time for printing the output: %f sec' % (time() - start))
+```
+
+
+**Time for matrix multiplication: 0.005051 sec**<!--notebook-skip-line-->
+
+[[501.1584  508.29724 495.65237 ... 492.84705 492.69092 490.0481 ]<!--notebook-skip-line-->
+
+ [508.81058 507.1822  495.1743  ... 503.10526 497.29315 493.67917]<!--notebook-skip-line-->
+
+ [489.56598 499.47015 490.17722 ... 490.99945 488.05008 483.28836]<!--notebook-skip-line-->
+
+ ...<!--notebook-skip-line-->
+
+ [484.0019  495.7179  479.92142 ... 493.69952 478.89194 487.2074 ]<!--notebook-skip-line-->
+
+ [499.64932 507.65094 497.5938  ... 493.0474  500.74512 495.82712]<!--notebook-skip-line-->
+
+ [516.0143  519.1715  506.354   ... 510.08878 496.35608 495.42523]]<!--notebook-skip-line-->
+
+**Time for printing the output: 0.167693 sec**<!--notebook-skip-line-->
+
+
+From the output above, it seems as if printing the output takes a lot more time than multiplying two large matrices. That doesn't feel right.
+
+This is because, in MXNet, all operations are executed asynchronously. So, when `nd.dot(x, x)` returns, the matrix multiplication has not completed; it has only been queued for execution. `asnumpy` in `print(y.asnumpy())`, however, waits for the result to be computed and hence takes longer.
+
+While it is possible to call `NDArray.waitall()` before and after operations to measure their running time, this approach does not scale to multiple sets of operations, especially in a Sequential or Hybrid network.
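+
+For illustration, here is a minimal sketch of the `waitall`-based timing just described; it blocks on MXNet's asynchronous engine before reading the clock:
+
+```python
+from time import time
+from mxnet import nd
+
+x = nd.random_uniform(shape=(2000, 2000))
+nd.waitall()   # make sure setup work has finished
+start = time()
+y = nd.dot(x, x)
+nd.waitall()   # block until the multiplication actually completes
+print('Time for matrix multiplication: %f sec' % (time() - start))
+```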
+
+## The correct way to profile
+
+The correct way to measure the running time of MXNet models is to use the MXNet profiler. In the rest of this tutorial, we will learn how to use it to measure the running time and memory consumption of MXNet models.
+
+To use the profiler, you need to build MXNet with `USE_PROFILER=1`. Check the [installation](http://mxnet.incubator.apache.org/install/index.html) page for more information on how to install MXNet from source. After building with `USE_PROFILER=1` and installing, you can import the profiler and configure it from Python code.
+
+```python
+from mxnet import profiler
+profiler.set_config(profile_all=True, aggregate_stats=True, filename='profile_output.json')
+```
+
+`profile_all` enables all types of profiling. You can also individually enable the following types of profiling:
+
+- `profile_symbolic` (boolean): whether to profile symbolic operators
+- `profile_imperative` (boolean): whether to profile imperative operators
+- `profile_memory` (boolean): whether to profile memory usage
+- `profile_api` (boolean): whether to profile the C API
+
+`aggregate_stats` aggregates statistics in memory, which can then be printed to the console by calling `profiler.dumps()`.
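+
+For instance (a small sketch using the flags listed above), you could enable only operator and memory profiling instead of `profile_all`:
+
+```python
+profiler.set_config(profile_symbolic=True, profile_imperative=True,
+                    profile_memory=True, aggregate_stats=True,
+                    filename='profile_output.json')
+```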
+
+### Setup: Build a model
+
+Let's build a small convolutional neural network that we can use for profiling.
+
+```python
+from mxnet import gluon
+net = gluon.nn.HybridSequential()
+with net.name_scope():
+    net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'))
+    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+    net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'))
+    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
+    net.add(gluon.nn.Flatten())
+    net.add(gluon.nn.Dense(512, activation="relu"))
+    net.add(gluon.nn.Dense(10))
+```
+
+We need data that we can run through the network for profiling. We'll use the MNIST dataset.
+
+```python
+from mxnet.gluon.data.vision import transforms
+train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True).transform_first(transforms.ToTensor()),
+                                   batch_size=64, shuffle=True)
+```
+
+Let's define a method that will run one training iteration given data and label.
+
+```python
+# Use GPU if available
+ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
+
+# Initialize the parameters with random weights
+net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
+
+# Use SGD optimizer
+trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})
+
+# Softmax Cross Entropy is a frequently used loss function for multi-class classification
+softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
+
+# A helper function to run one training iteration
+def run_training_iteration(data, label):
+    
+    # Load data and label into the right context
+    data = data.as_in_context(ctx)
+    label = label.as_in_context(ctx)
+    
+    # Run the forward pass
+    with autograd.record():
+        output = net(data)
+        loss = softmax_cross_entropy(output, label)
+    
+    # Run the backward pass
+    loss.backward()
+    
+    # Apply changes to parameters
+    trainer.step(data.shape[0])
+```
+
+### Starting and stopping the profiler from Python
+
+When the first forward pass is run on a network, MXNet does a number of housekeeping tasks, including inferring the shapes of various parameters and allocating memory for intermediate and final outputs. For these reasons, profiling the first iteration doesn't provide accurate results. We will, therefore, skip the first iteration.
+
+```python
+# Run the first iteration without profiling
+itr = iter(train_data)
+run_training_iteration(*next(itr))
+```
+
+We'll run the next iteration with the profiler turned on.
+
+```python
+data, label = next(itr)
+
+# Ask the profiler to start recording
+profiler.set_state('run')
+
+run_training_iteration(data, label)
+
+# Ask the profiler to stop recording
+profiler.set_state('stop')
+```
+
+Between starting and stopping the profiler, you can also pause and resume it using `profiler.pause()` and `profiler.resume()` respectively, to profile only the parts of the code you are interested in.
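+
+Here is a short sketch of that pattern, reusing `itr` and `run_training_iteration` from above; the paused region (the data fetch, in this hypothetical split) is excluded from the recording:
+
+```python
+profiler.set_state('run')
+
+profiler.pause()                 # recording suspended
+data, label = next(itr)          # e.g. exclude data loading
+profiler.resume()                # recording resumed
+
+run_training_iteration(data, label)
+
+profiler.set_state('stop')
+```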
+
+### Starting profiler automatically using environment variable
+
+The method described above requires code changes to start and stop the profiler. You can also start the profiler automatically and profile the entire code without any code changes using the `MXNET_PROFILER_AUTOSTART` environment variable.
+
+MXNet will start the profiler automatically if you run your code with the environment variable `MXNET_PROFILER_AUTOSTART` set to `1`. The profiler output is stored in `profile.json` in the current directory.
+
+Note that the profiler output could be large depending on your code. It might be helpful to profile only sections of your code using the `set_state` API described in the previous section.
+
+### Increasing granularity of the profiler output
+
+MXNet executes computation graphs in 'bulk mode', which reduces kernel launch gaps between symbolic operators for faster execution. This can reduce the granularity of the profiler output. If you need profiling results for every operator, set the environment variables `MXNET_EXEC_BULK_EXEC_INFERENCE` and `MXNET_EXEC_BULK_EXEC_TRAIN` to `0` to disable bulk execution.
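+
+As a convenience, you can also set these from Python instead of exporting shell variables (a sketch, assuming they are set before MXNet constructs the graph):
+
+```python
+import os
+os.environ['MXNET_EXEC_BULK_EXEC_INFERENCE'] = '0'
+os.environ['MXNET_EXEC_BULK_EXEC_TRAIN'] = '0'
+```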
+
+### Viewing profiler output
+
+There are two ways to view the information collected by the profiler: in the console, or as a more graphical version in a browser.
+
+#### 1. View in console
+
+You can use the `profiler.dumps()` method to view the information collected by the profiler in the console. The collected information contains the time taken by each operator, the time taken by each C API call, and the memory consumed on both CPU and GPU.
+
+```python
+print(profiler.dumps())
+```
+
+![Profile Statistics](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profile_stats.png)<!--notebook-skip-line-->
+
+#### 2. View in browser
+
+You can also dump the information collected by the profiler into a `json` file using the `profiler.dump()` function and view it in a browser.
+
+```python
+profiler.dump()
+```
+
+`dump()` creates a `json` file which can be viewed using a trace consumer like `chrome://tracing` in the Chrome browser. Here is a snapshot that shows the output of the profiling we did above.
+
+![Tracing Screenshot](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profiler_output_chrome.png)
+
+Let's zoom in to check the time taken by individual operators.
+
+![Operator profiling](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/python/profiler/profile_operators.png)
+
+The above picture visualizes the sequence in which the operators were executed and the time taken by each operator.
+
+### Further reading
+
+- [Examples using MXNet profiler.](https://github.com/apache/incubator-mxnet/tree/master/example/profiler)
+- [Some tips for improving MXNet performance.](https://mxnet.incubator.apache.org/faq/perf.html)
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
+
diff --git a/example/README.md b/example/README.md
index 542162c0bf6..0dc6138c2ef 100644
--- a/example/README.md
+++ b/example/README.md
@@ -1,6 +1,6 @@
 # MXNet Examples
 
-This page contains a curated list of awesome MXNet examples, tutorials and blogs. It is inspired by [awesome-php](https://github.com/ziadoz/awesome-php) and [awesome-machine-learning](https://github.com/josephmisiti/awesome-machine-learning).
+This page contains a curated list of awesome MXNet examples, tutorials and blogs. It is inspired by [awesome-php](https://github.com/ziadoz/awesome-php) and [awesome-machine-learning](https://github.com/josephmisiti/awesome-machine-learning). See also [Awesome-MXNet](https://github.com/chinakook/Awesome-MXNet) for a similar list.
 
   - [Contributing](#contributing)
   - [List of examples](#list-of-examples)
@@ -28,7 +28,7 @@ Example applications or scripts should be submitted in this `example` folder.
 
 ### Tutorials
 
-If you have a tutorial idea for the website, download the [ Jupyter notebook tutorial template](https://github.com/dmlc/mxnet/tree/master/example/MXNetTutorialTemplate.ipynb).
+If you have a tutorial idea for the website, download the [Jupyter notebook tutorial template](https://github.com/dmlc/mxnet/tree/master/example/MXNetTutorialTemplate.ipynb).
 
 #### Tutorial location
 
@@ -45,9 +45,11 @@ The site expects the format to be markdown, so export your notebook as a .md via
 ```
 
 If you want some lines to show up in the markdown but not in the generated notebooks, add this comment `<!--notebook-skip-line-->` after your `![png](img_url)`, like this:
+
 ```
 ![png](img_url.png)<!--notebook-skip-line-->
 ```
+
 Typically, when you have a `plt.imshow()`, you want the image tag `[png](img.png)` in the `.md` but not in the downloaded notebook, as the user will re-generate the plot at run-time.
 
 #### Tutorial tests
@@ -151,7 +153,8 @@ If your tutorial depends on specific packages, simply add them to this provision
 * [LSTM Human Activity Recognition](https://github.com/Ldpe2G/DeepLearningForFun/tree/master/Mxnet-Scala/HumanActivityRecognition) by [Ldpe2G](https://github.com/Ldpe2G)
 * [Visual Question Answering](https://github.com/liuzhi136/Visual-Question-Answering) by [liuzhi136](https://github.com/liuzhi136)
 * [Deformable ConvNets](https://arxiv.org/abs/1703.06211) ([github](https://github.com/msracver/Deformable-ConvNets)) by [MSRACVer](https://github.com/msracver)
-
+* [OCR with bi-LSTM and CTC Loss in Gluon](https://github.com/ThomasDelteil/Gluon_OCR_LSTM_CTC) by [ThomasDelteil](https://github.com/ThomasDelteil)
+* [Visual Search with Gluon and HNSWlib](https://github.com/ThomasDelteil/VisualSearch_MXNet), by [ThomasDelteil](https://github.com/ThomasDelteil), online demo [here](https://thomasdelteil.github.io/VisualSearch_MXNet/)
 
 ### <a name="ipython-notebooks"></a>IPython Notebooks
 -----------------
@@ -164,7 +167,7 @@ If your tutorial depends on specific packages, simply add them to this provision
 * [class active maps](https://github.com/dmlc/mxnet-notebooks/blob/master/python/moved-from-mxnet/class_active_maps.ipynb) - A demo of how to localize the discriminative regions in an image using global average pooling (GAP) in CNNs.
 * [DMLC MXNet Notebooks](https://github.com/dmlc/mxnet-notebooks) DMLC's repo for various notebooks ranging from basic usages of MXNet to state-of-the-art deep learning applications.
 * [AWS Seoul Summit 2017 Demos](https://github.com/sxjscience/aws-summit-2017-seoul) The demo codes and ipython notebooks in AWS Seoul Summit 2017.
-* [Character-level CNN for text classification](https://github.com/ThomasDelteil/CNN_NLP_MXNet) Performing category classification on Amazon reviews using Gluon and character-level Convolutional Neural Networks
+* [Character-level CNN for text classification](https://github.com/ThomasDelteil/CNN_NLP_MXNet) Performing category classification on Amazon reviews using Gluon and character-level Convolutional Neural Networks. Online demo [here](https://thomasdelteil.github.io/CNN_NLP_MXNet/)
 
 ### <a name="mobile-apps-examples"></a>Mobile App Examples
 -------------------
@@ -220,4 +223,3 @@ If your tutorial depends on specific packages, simply add them to this provision
 * [MXnet-face](https://github.com/tornadomeet/mxnet-face) - Using MXNet for face-related algorithms by [tornadomeet](https://github.com/tornadomeet), where a single model gets 97.13% ± 0.88% accuracy on LFW at only 20MB in size.
 * [MinPy](https://github.com/dmlc/minpy) - Pure NumPy practice with third-party operator integration and MXNet as a backend for GPU computing
 * [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) - a flexible and easy to use tool for serving Deep Learning models
-* [ONNX-MXNet](https://github.com/onnx/onnx-mxnet) - implements ONNX model format support for Apache MXNet
diff --git a/example/gluon/dcgan.py b/example/gluon/dcgan.py
index 3233f430eea..8ac9c522cf5 100644
--- a/example/gluon/dcgan.py
+++ b/example/gluon/dcgan.py
@@ -229,8 +229,8 @@ def transformer(data, label):
     logging.info('time: %f' % (time.time() - tic))
 
     if check_point:
-        netG.save_params(os.path.join(outf,'generator_epoch_%d.params' %epoch))
-        netD.save_params(os.path.join(outf,'discriminator_epoch_%d.params' % epoch))
+        netG.save_parameters(os.path.join(outf,'generator_epoch_%d.params' %epoch))
+        netD.save_parameters(os.path.join(outf,'discriminator_epoch_%d.params' % epoch))
 
-netG.save_params(os.path.join(outf, 'generator.params'))
-netD.save_params(os.path.join(outf, 'discriminator.params'))
+netG.save_parameters(os.path.join(outf, 'generator.params'))
+netD.save_parameters(os.path.join(outf, 'discriminator.params'))
diff --git a/example/gluon/embedding_learning/train.py b/example/gluon/embedding_learning/train.py
index 46f76b55614..b8a5bf2716c 100644
--- a/example/gluon/embedding_learning/train.py
+++ b/example/gluon/embedding_learning/train.py
@@ -246,7 +246,7 @@ def train(epochs, ctx):
         if val_accs[0] > best_val:
             best_val = val_accs[0]
             logging.info('Saving %s.' % opt.save_model_prefix)
-            net.save_params('%s.params' % opt.save_model_prefix)
+            net.save_parameters('%s.params' % opt.save_model_prefix)
     return best_val
 
 
diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py
index 6e2f1d6a78d..b21e943f17f 100644
--- a/example/gluon/image_classification.py
+++ b/example/gluon/image_classification.py
@@ -122,7 +122,7 @@ def get_model(model, ctx, opt):
 
     net = models.get_model(model, **kwargs)
     if opt.resume:
-        net.load_params(opt.resume)
+        net.load_parameters(opt.resume)
     elif not opt.use_pretrained:
         if model in ['alexnet']:
             net.initialize(mx.init.Normal())
@@ -176,12 +176,12 @@ def update_learning_rate(lr, trainer, epoch, ratio, steps):
 def save_checkpoint(epoch, top1, best_acc):
     if opt.save_frequency and (epoch + 1) % opt.save_frequency == 0:
         fname = os.path.join(opt.prefix, '%s_%d_acc_%.4f.params' % (opt.model, epoch, top1))
-        net.save_params(fname)
+        net.save_parameters(fname)
         logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
     if top1 > best_acc[0]:
         best_acc[0] = top1
         fname = os.path.join(opt.prefix, '%s_best.params' % (opt.model))
-        net.save_params(fname)
+        net.save_parameters(fname)
         logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
 
 def train(opt, ctx):
@@ -267,7 +267,7 @@ def main():
                 optimizer = 'sgd',
                 optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
                 initializer = mx.init.Xavier(magnitude=2))
-        mod.save_params('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
+        mod.save_parameters('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
     else:
         if opt.mode == 'hybrid':
             net.hybridize()
diff --git a/example/gluon/mnist.py b/example/gluon/mnist.py
index 198d7ca5ab2..6aea3abc504 100644
--- a/example/gluon/mnist.py
+++ b/example/gluon/mnist.py
@@ -117,7 +117,7 @@ def train(epochs, ctx):
         name, val_acc = test(ctx)
         print('[Epoch %d] Validation: %s=%f'%(epoch, name, val_acc))
 
-    net.save_params('mnist.params')
+    net.save_parameters('mnist.params')
 
 
 if __name__ == '__main__':
diff --git a/example/gluon/style_transfer/main.py b/example/gluon/style_transfer/main.py
index cab8211bc9c..dde992ae700 100644
--- a/example/gluon/style_transfer/main.py
+++ b/example/gluon/style_transfer/main.py
@@ -55,7 +55,7 @@ def train(args):
     style_model.initialize(init=mx.initializer.MSRAPrelu(), ctx=ctx)
     if args.resume is not None:
         print('Resuming, initializing using weight from {}.'.format(args.resume))
-        style_model.load_params(args.resume, ctx=ctx)
+        style_model.load_parameters(args.resume, ctx=ctx)
     print('style_model:',style_model)
     # optimizer and loss
     trainer = gluon.Trainer(style_model.collect_params(), 'adam',
@@ -121,14 +121,14 @@ def train(args):
                     str(count) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
                     args.content_weight) + "_" + str(args.style_weight) + ".params"
                 save_model_path = os.path.join(args.save_model_dir, save_model_filename)
-                style_model.save_params(save_model_path)
+                style_model.save_parameters(save_model_path)
                 print("\nCheckpoint, trained model saved at", save_model_path)
 
     # save model
     save_model_filename = "Final_epoch_" + str(args.epochs) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
         args.content_weight) + "_" + str(args.style_weight) + ".params"
     save_model_path = os.path.join(args.save_model_dir, save_model_filename)
-    style_model.save_params(save_model_path)
+    style_model.save_parameters(save_model_path)
     print("\nDone, trained model saved at", save_model_path)
 
 
@@ -143,7 +143,7 @@ def evaluate(args):
     style_image = utils.preprocess_batch(style_image)
     # model
     style_model = net.Net(ngf=args.ngf)
-    style_model.load_params(args.model, ctx=ctx)
+    style_model.load_parameters(args.model, ctx=ctx)
     # forward
     style_model.set_target(style_image)
     output = style_model(content_image)
diff --git a/example/gluon/super_resolution.py b/example/gluon/super_resolution.py
index 38c3bec8949..0f2f21f3c0a 100644
--- a/example/gluon/super_resolution.py
+++ b/example/gluon/super_resolution.py
@@ -168,13 +168,13 @@ def train(epoch, ctx):
         print('training mse at epoch %d: %s=%f'%(i, name, acc))
         test(ctx)
 
-    net.save_params('superres.params')
+    net.save_parameters('superres.params')
 
 def resolve(ctx):
     from PIL import Image
     if isinstance(ctx, list):
         ctx = [ctx[0]]
-    net.load_params('superres.params', ctx=ctx)
+    net.load_parameters('superres.params', ctx=ctx)
     img = Image.open(opt.resolve_img).convert('YCbCr')
     y, cb, cr = img.split()
     data = mx.nd.expand_dims(mx.nd.expand_dims(mx.nd.array(y), axis=0), axis=0)
diff --git a/example/gluon/tree_lstm/main.py b/example/gluon/tree_lstm/main.py
index d2fe464638a..ad5d59f7a47 100644
--- a/example/gluon/tree_lstm/main.py
+++ b/example/gluon/tree_lstm/main.py
@@ -138,7 +138,7 @@ def test(ctx, data_iter, best, mode='validation', num_iter=-1):
         if test_r >= best:
             best = test_r
             logging.info('New optimum found: {}. Checkpointing.'.format(best))
-            net.save_params('childsum_tree_lstm_{}.params'.format(num_iter))
+            net.save_parameters('childsum_tree_lstm_{}.params'.format(num_iter))
             test(ctx, test_iter, -1, 'test')
         return best
 
diff --git a/example/gluon/word_language_model/train.py b/example/gluon/word_language_model/train.py
index 9e152636bb0..7f0a916b79b 100644
--- a/example/gluon/word_language_model/train.py
+++ b/example/gluon/word_language_model/train.py
@@ -185,7 +185,7 @@ def train():
         if val_L < best_val:
             best_val = val_L
             test_L = eval(test_data)
-            model.save_params(args.save)
+            model.save_parameters(args.save)
             print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
         else:
             args.lr = args.lr*0.25
@@ -193,6 +193,6 @@ def train():
 
 if __name__ == '__main__':
     train()
-    model.load_params(args.save, context)
+    model.load_parameters(args.save, context)
     test_L = eval(test_data)
     print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py
index 05f5ddc4506..bfaadb3ff6b 100755
--- a/example/image-classification/common/data.py
+++ b/example/image-classification/common/data.py
@@ -43,9 +43,9 @@ def add_data_args(parser):
 def add_data_aug_args(parser):
     aug = parser.add_argument_group(
         'Image augmentations', 'implemented in src/io/image_aug_default.cc')
-    aug.add_argument('--random-crop', type=int, default=1,
+    aug.add_argument('--random-crop', type=int, default=0,
                      help='whether or not to randomly crop the image')
-    aug.add_argument('--random-mirror', type=int, default=1,
+    aug.add_argument('--random-mirror', type=int, default=0,
                      help='whether or not to randomly flip horizontally')
     aug.add_argument('--max-random-h', type=int, default=0,
                      help='max change of hue, whose range is [0, 180]')
@@ -53,8 +53,13 @@ def add_data_aug_args(parser):
                      help='max change of saturation, whose range is [0, 255]')
     aug.add_argument('--max-random-l', type=int, default=0,
                      help='max change of intensity, whose range is [0, 255]')
+    aug.add_argument('--min-random-aspect-ratio', type=float, default=None,
+                     help='min value of aspect ratio, whose value is either None or a positive value.')
     aug.add_argument('--max-random-aspect-ratio', type=float, default=0,
-                     help='max change of aspect ratio, whose range is [0, 1]')
+                     help='max value of aspect ratio. If min_random_aspect_ratio is None, '
+                          'the aspect ratio range is [1-max_random_aspect_ratio, '
+                          '1+max_random_aspect_ratio], otherwise it is '
+                          '[min_random_aspect_ratio, max_random_aspect_ratio].')
     aug.add_argument('--max-random-rotate-angle', type=int, default=0,
                      help='max angle to rotate, whose range is [0, 360]')
     aug.add_argument('--max-random-shear-ratio', type=float, default=0,
@@ -63,16 +68,28 @@ def add_data_aug_args(parser):
                      help='max ratio to scale')
     aug.add_argument('--min-random-scale', type=float, default=1,
                      help='min ratio to scale, should be >= img_size/input_shape; otherwise use --pad-size')
+    aug.add_argument('--max-random-area', type=float, default=1,
+                     help='max area to crop in random resized crop, whose range is [0, 1]')
+    aug.add_argument('--min-random-area', type=float, default=1,
+                     help='min area to crop in random resized crop, whose range is [0, 1]')
+    aug.add_argument('--brightness', type=float, default=0,
+                     help='brightness jittering, whose range is [0, 1]')
+    aug.add_argument('--contrast', type=float, default=0,
+                     help='contrast jittering, whose range is [0, 1]')
+    aug.add_argument('--saturation', type=float, default=0,
+                     help='saturation jittering, whose range is [0, 1]')
+    aug.add_argument('--pca-noise', type=float, default=0,
+                     help='pca noise, whose range is [0, 1]')
+    aug.add_argument('--random-resized-crop', type=int, default=0,
+                     help='whether to use random resized crop')
     return aug
 
-def set_data_aug_level(aug, level):
-    if level >= 1:
-        aug.set_defaults(random_crop=1, random_mirror=1)
-    if level >= 2:
-        aug.set_defaults(max_random_h=36, max_random_s=50, max_random_l=50)
-    if level >= 3:
-        aug.set_defaults(max_random_rotate_angle=10, max_random_shear_ratio=0.1, max_random_aspect_ratio=0.25)
-
+def set_resnet_aug(aug):
+    # standard data augmentation setting for resnet training
+    aug.set_defaults(random_crop=1, random_resized_crop=1)
+    aug.set_defaults(min_random_area=0.08)
+    aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.)
+    aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1)
 
 class SyntheticDataIter(DataIter):
     def __init__(self, num_classes, data_shape, max_iter, dtype):
@@ -135,8 +152,16 @@ def get_rec_iter(args, kv=None):
         max_random_scale    = args.max_random_scale,
         pad                 = args.pad_size,
         fill_value          = 127,
+        random_resized_crop = args.random_resized_crop,
         min_random_scale    = args.min_random_scale,
         max_aspect_ratio    = args.max_random_aspect_ratio,
+        min_aspect_ratio    = args.min_random_aspect_ratio,
+        max_random_area     = args.max_random_area,
+        min_random_area     = args.min_random_area,
+        brightness          = args.brightness,
+        contrast            = args.contrast,
+        saturation          = args.saturation,
+        pca_noise           = args.pca_noise,
         random_h            = args.max_random_h,
         random_s            = args.max_random_s,
         random_l            = args.max_random_l,
@@ -156,6 +181,7 @@ def get_rec_iter(args, kv=None):
         mean_r              = rgb_mean[0],
         mean_g              = rgb_mean[1],
         mean_b              = rgb_mean[2],
+        resize              = 256,
         data_name           = 'data',
         label_name          = 'softmax_label',
         batch_size          = args.batch_size,
diff --git a/example/image-classification/predict-cpp/CMakeLists.txt b/example/image-classification/predict-cpp/CMakeLists.txt
index a2f52b9df3a..c42d1917b76 100644
--- a/example/image-classification/predict-cpp/CMakeLists.txt
+++ b/example/image-classification/predict-cpp/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Check OpenCV
-if(NOT USE_OPENCV OR NOT OpenCV_FOUND)
+if(NOT USE_OPENCV OR NOT OpenCV_FOUND OR OpenCV_VERSION_MAJOR LESS 3)
   message(WARNING "\
-OpenCV should be enabled and found to build image classification example, skipping...")
+OpenCV version >= 3 should be enabled and found to build the image classification example, skipping...")
   return()
 endif()
 
diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py
index f465fbc5f46..a90b6aead23 100644
--- a/example/image-classification/train_imagenet.py
+++ b/example/image-classification/train_imagenet.py
@@ -30,8 +30,8 @@
     fit.add_fit_args(parser)
     data.add_data_args(parser)
     data.add_data_aug_args(parser)
-    # use a large aug level
-    data.set_data_aug_level(parser, 3)
+    # uncomment to set standard augmentation for resnet training
+    # data.set_resnet_aug(parser)
     parser.set_defaults(
         # network
         network          = 'resnet',
diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py
index 045ce62489a..85474b663fa 100644
--- a/example/quantization/imagenet_gen_qsym.py
+++ b/example/quantization/imagenet_gen_qsym.py
@@ -53,6 +53,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model')
+    parser.add_argument('--ctx', type=str, default='gpu')
     parser.add_argument('--model', type=str, choices=['imagenet1k-resnet-152', 'imagenet1k-inception-bn'],
                         help='currently only supports imagenet1k-resnet-152 or imagenet1k-inception-bn')
     parser.add_argument('--batch-size', type=int, default=32)
@@ -91,8 +92,18 @@ def save_params(fname, arg_params, aux_params, logger=None):
                              ' thresholds. This mode is expected to produce the best inference accuracy of all three'
                              ' kinds of quantized models if the calibration dataset is representative enough of the'
                              ' inference dataset.')
+    parser.add_argument('--quantized-dtype', type=str, default='int8',
+                        choices=['int8', 'uint8'],
+                        help='quantization destination data type for input data')
     args = parser.parse_args()
 
+    if args.ctx == 'gpu':
+        ctx = mx.gpu(0)
+    elif args.ctx == 'cpu':
+        ctx = mx.cpu(0)
+    else:
+        raise ValueError('ctx %s is not supported in this script' % args.ctx)
+
     logging.basicConfig()
     logger = logging.getLogger('logger')
     logger.setLevel(logging.INFO)
@@ -129,17 +140,26 @@ def save_params(fname, arg_params, aux_params, logger=None):
     excluded_sym_names = []
     if args.model == 'imagenet1k-resnet-152':
         rgb_mean = '0,0,0'
-        calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
-                                                                 or name.find('sc') != -1
-                                                                 or name.find('fc') != -1)
+        if args.ctx == 'gpu':
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('sc') != -1
+                                                                     or name.find('fc') != -1)
+        else:
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('sc') != -1)
+            excluded_sym_names += ['flatten0', 'fc1']
         if exclude_first_conv:
-            excluded_sym_names = ['conv0']
+            excluded_sym_names += ['conv0']
     elif args.model == 'imagenet1k-inception-bn':
         rgb_mean = '123.68,116.779,103.939'
-        calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
-                                                                 or name.find('fc') != -1)
+        if args.ctx == 'gpu':
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1
+                                                                     or name.find('fc') != -1)
+        else:
+            calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1)
+            excluded_sym_names += ['flatten', 'fc1']
         if exclude_first_conv:
-            excluded_sym_names = ['conv_1']
+            excluded_sym_names += ['conv_1']
     else:
         raise ValueError('model %s is not supported in this script' % args.model)
 
@@ -156,8 +176,9 @@ def save_params(fname, arg_params, aux_params, logger=None):
     if calib_mode == 'none':
         logger.info('Quantizing FP32 model %s' % args.model)
         qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                       excluded_sym_names=excluded_sym_names,
-                                                       calib_mode=calib_mode, logger=logger)
+                                                       ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                       calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
+                                                       logger=logger)
         sym_name = '%s-symbol.json' % (prefix + '-quantized')
         save_symbol(sym_name, qsym, logger)
     else:
@@ -176,10 +197,11 @@ def save_params(fname, arg_params, aux_params, logger=None):
                                      **mean_args)
 
         cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                        ctx=mx.gpu(0), excluded_sym_names=excluded_sym_names,
+                                                        ctx=ctx, excluded_sym_names=excluded_sym_names,
                                                         calib_mode=calib_mode, calib_data=data,
                                                         num_calib_examples=num_calib_batches * batch_size,
-                                                        calib_layer=calib_layer, logger=logger)
+                                                        calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+                                                        logger=logger)
         if calib_mode == 'entropy':
             suffix = '-quantized-%dbatches-entropy' % num_calib_batches
         elif calib_mode == 'naive':
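
Note on the CPU calibration predicates above: on the CPU path the fc layers stay in FP32, so the calib_layer lambda must not match them. A quick, self-contained sketch of what the resnet-152 CPU predicate matches (the layer names are illustrative ResNet-style outputs, not taken from the script):

    # minimal sketch; layer names are illustrative
    calib_layer = lambda name: name.endswith('_output') and (
        name.find('conv') != -1 or name.find('sc') != -1)

    for name in ['conv0_output', 'stage1_unit1_sc_output',
                 'fc1_output', 'flatten0_output']:
        print(name, calib_layer(name))
    # conv0_output True, stage1_unit1_sc_output True,
    # fc1_output False, flatten0_output False
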
diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py
index fe3f2661c65..85649530aa0 100644
--- a/example/quantization/imagenet_inference.py
+++ b/example/quantization/imagenet_inference.py
@@ -99,6 +99,7 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Score a model on a dataset')
+    parser.add_argument('--ctx', type=str, default='gpu')
     parser.add_argument('--symbol-file', type=str, required=True, help='symbol file path')
     parser.add_argument('--param-file', type=str, required=True, help='param file path')
     parser.add_argument('--batch-size', type=int, default=32)
@@ -122,6 +123,13 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
     args = parser.parse_args()
 
+    if args.ctx == 'gpu':
+        ctx = mx.gpu(0)
+    elif args.ctx == 'cpu':
+        ctx = mx.cpu(0)
+    else:
+        raise ValueError('ctx %s is not supported in this script' % args.ctx)
+
     logging.basicConfig()
     logger = logging.getLogger('logger')
     logger.setLevel(logging.INFO)
@@ -172,5 +180,5 @@ def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples,
 
     num_inference_images = args.num_inference_batches * batch_size
     logger.info('Running model %s for inference' % symbol_file)
-    score(sym, arg_params, aux_params, data, [mx.gpu(0)], label_name,
+    score(sym, arg_params, aux_params, data, [ctx], label_name,
           max_num_examples=num_inference_images, logger=logger)
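
As an aside, the hand-rolled ValueError above could also be expressed with argparse choices, which rejects unsupported devices at parse time; a minimal sketch under that alternative design:

    import argparse
    import mxnet as mx

    parser = argparse.ArgumentParser()
    parser.add_argument('--ctx', type=str, default='gpu', choices=['gpu', 'cpu'])
    args = parser.parse_args(['--ctx', 'cpu'])

    # map the flag to a device context, mirroring the script above
    ctx = mx.gpu(0) if args.ctx == 'gpu' else mx.cpu(0)
    print(ctx)  # cpu(0)
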
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 6b7cf4407ed..55c26bc980b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -987,6 +987,11 @@ MXNET_DLL int MXCreateCachedOpEx(SymbolHandle handle,
                                  int num_flags,
                                  const char** keys,
                                  const char** vals,
+                                 int num_inputs,
+                                 const char** input_names,
+                                 int num_params,
+                                 const char** param_names,
+                                 NDArrayHandle* params,
                                  CachedOpHandle *out);
 /*!
  * \brief free cached operator
@@ -1431,13 +1436,15 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
  * \param excluded_symbols array of symbols to be excluded from being quantized
  * \param num_offline number of parameters that are quantized offline
  * \param offline_params array of c strings representing the names of params quantized offline
+ * \param quantized_dtype the quantized destination type for input data.
  */
 MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle,
                                SymbolHandle *ret_sym_handle,
                                const mx_uint num_excluded_symbols,
                                const SymbolHandle *excluded_symbols,
                                const mx_uint num_offline,
-                               const char **offline_params);
+                               const char **offline_params,
+                               const char *quantized_dtype);
 
 /*!
  * \brief Set calibration table to node attributes in the sym
diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h
index 7ea60df3302..758ce851321 100644
--- a/include/mxnet/imperative.h
+++ b/include/mxnet/imperative.h
@@ -35,6 +35,23 @@
 #include "./ndarray.h"
 
 namespace mxnet {
+/*! \brief CachedOp Parameters */
+struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
+  uint32_t inline_limit;
+  uint32_t forward_bulk_size;
+  uint32_t backward_bulk_size;
+  DMLC_DECLARE_PARAMETER(CachedOpConfig) {
+    DMLC_DECLARE_FIELD(inline_limit)
+    .set_default(2)
+    .describe("Maximum number of operators that can be inlined.");
+    DMLC_DECLARE_FIELD(forward_bulk_size)
+    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
+    .describe("Segment size of bulk execution during forward pass.");
+    DMLC_DECLARE_FIELD(backward_bulk_size)
+    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
+    .describe("Segment size of bulk execution during backward pass.");
+  }
+};
 /*! \brief runtime functions for NDArray */
 class Imperative {
  public:
@@ -77,6 +94,67 @@ class Imperative {
              && info.out_grads.size() == 1;
     }
   };
+  class CachedOp {
+   public:
+    CachedOp(
+        const nnvm::Symbol& sym,
+        const std::vector<std::pair<std::string, std::string> >& flags,
+        const std::vector<std::string> arg_names,
+        const std::unordered_map<std::string, std::vector<NDArray> >& params);
+    uint32_t num_inputs() {
+      return fwd_graph_.indexed_graph().input_nodes().size();
+    }
+    uint32_t num_outputs() {
+      return fwd_graph_.outputs.size();
+    }
+    uint32_t num_backward_inputs() {
+      return bwd_ograd_dep_.size() + bwd_in_dep_.size() + bwd_out_dep_.size();
+    }
+    std::vector<bool>& save_inputs() {
+      return save_inputs_;
+    }
+    std::vector<bool>& save_outputs() {
+      return save_outputs_;
+    }
+    const std::unordered_set<uint32_t>& mutable_input_nodes() {
+      return fwd_graph_.indexed_graph().mutable_input_nodes();
+    }
+    nnvm::Graph GetForwardGraph(const bool recording,
+                                const std::vector<NDArray*>& inputs);
+    nnvm::Graph GetBackwardGraph(const OpStatePtr& state,
+                                 const std::vector<OpReqType>& reqs,
+                                 const std::vector<NDArray*>& inputs);
+    std::vector<nnvm::NodeEntry> Gradient(const nnvm::NodePtr& node,
+                                          const std::vector<nnvm::NodeEntry>& ograds);
+    void Forward(const std::shared_ptr<CachedOp>& op_ptr,
+                 const std::vector<NDArray*>& args,
+                 const std::vector<NDArray*>& outputs);
+    void Backward(const bool retain_graph,
+                  const OpStatePtr& state,
+                  const std::vector<NDArray*>& inputs,
+                  const std::vector<OpReqType>& reqs,
+                  const std::vector<NDArray*>& outputs);
+
+   private:
+    struct CachedOpState {
+      std::vector<NDArray> buff;
+      std::vector<OpStatePtr> states;
+    };
+    std::mutex mutex_;
+    CachedOpConfig config_;
+    nnvm::Graph fwd_graph_;
+    nnvm::Graph grad_graph_;
+    nnvm::Graph full_graph_;
+    std::unordered_map<Context, std::vector<NDArray> > params_;
+    bool inlining_;
+    std::vector<nnvm::NodeEntry> ograd_entries_;
+    std::vector<bool> curr_grad_req_;
+    std::vector<uint32_t> bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_;
+    std::vector<uint32_t> fwd_args_idx_;
+    std::vector<uint32_t> fwd_params_idx_;
+    std::vector<uint32_t> bwd_input_eid_;
+    std::vector<bool> save_inputs_, save_outputs_;
+  };
   /*! \brief whether operator recording is on. */
   bool is_training() const {
     return is_train_;
@@ -144,6 +222,15 @@ class Imperative {
       uint32_t num_inputs, uint32_t num_outputs,
       std::vector<bool> *p_save_inputs,
       std::vector<bool> *p_save_outputs);
+  void RunGraph(
+      const bool retain_graph,
+      const nnvm::IndexedGraph& idx,
+      const std::vector<NDArray*> arrays,
+      size_t node_start, size_t node_end,
+      std::vector<OpReqType>&& array_reqs,
+      std::vector<uint32_t>&& ref_count,
+      std::vector<OpStatePtr> *p_states,
+      const DispatchModeVector& dispatch_modes);
   /*! \brief indicate whether is training. */
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local bool is_train_;
@@ -160,5 +247,7 @@ class Imperative {
   int backward_bulk_size_{0};
 };
 
+using CachedOpPtr = std::shared_ptr<Imperative::CachedOp>;
+
 }  // namespace mxnet
 #endif  // MXNET_IMPERATIVE_H_
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index ae96fd87b0d..e243eb71c47 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -155,14 +155,6 @@ class NDArray {
     return byte_offset_ > 0 || shape() != ptr_->storage_shape;
   }
 
-  /* \brief Check whether the two arrays are the same array */
-  inline bool IsSame(const NDArray& other) {
-    return ptr_ == other.ptr_ &&
-        shape_ == other.shape_ &&
-        byte_offset_ == other.byte_offset_ &&
-        dtype_ == other.dtype_;
-  }
-
   /*!
    * \return the shape of current NDArray.
    */
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index f4694efad29..3969d8445be 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -126,36 +126,25 @@ class OpStatePtr {
   template<typename T, typename... Args>
   static OpStatePtr Create(Args&&... args) {
     OpStatePtr ret;
-    auto state = new T(std::forward<Args>(args)...);
-    auto var = Engine::Get()->NewVariable();
-    ret.ptr_.reset(
-      new OpState(var, state),
-      [](OpState* p) {
-        Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), p->var);
-        delete reinterpret_cast<T*>(p->state);
-        delete p;
-      });
+    ret.ptr_ = std::make_shared<OpState>();
+    ret.ptr_->var_ = Engine::Get()->NewVariable();
+    ret.ptr_->state_.construct<T>(std::forward<Args>(args)...);
 
     return ret;
   }
   /* \brief Get engine variable associated with this state */
   engine::VarHandle get_var() const {
-    return ptr_->var;
+    return ptr_->var_;
   }
   /* \brief Get state of type T */
   template<typename T>
   T& get_state() const {
-    return *reinterpret_cast<T*>(ptr_->state);
+    return dmlc::get<T>(ptr_->state_);
   }
   /* \brief clear state */
   void reset() {
     ptr_.reset();
   }
-  /* \brief checks whether the managed object is managed only by the current
-            OpStatePtr instance */
-  bool unique() const {
-    return ptr_.unique();
-  }
   /* \brief Whether state is empty */
   explicit operator bool() const {
     return ptr_ ? true : false;
@@ -164,12 +153,16 @@ class OpStatePtr {
  private:
   /* \brief state structure */
   struct OpState {
-    engine::VarHandle var;
-    void* state;
-
-    OpState(engine::VarHandle var_, void* state_) : var(var_), state(state_) {}
+    OpState() {}
     OpState(const OpState& other) = delete;
     OpState& operator=(const OpState& other) = delete;
+
+    ~OpState() {
+      Engine::Get()->DeleteVariable([](RunContext s) {}, Context::CPU(), var_);
+    }
+
+    engine::VarHandle var_;
+    dmlc::any state_;
   };
   /* \brief shared pointer to state */
   std::shared_ptr<OpState> ptr_;
diff --git a/python/README.md b/python/README.md
index c1aaa580afc..1ab7aa4464a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -13,9 +13,9 @@ For running unit tests, you will need the [nose PyPi package](https://pypi.pytho
 pip install --upgrade nose
 ```
 
-Once ```nose``` is installed, run the following from MXNet root directory:
+Once ```nose``` is installed, run the following from the MXNet root directory (please make sure the installation path of ```nosetests``` is included in your ```$PATH``` environment variable):
 ```
 nosetests tests/python/unittest
 nosetests tests/python/train
 
-```
\ No newline at end of file
+```
diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py
index f324545a235..d2cae0c45aa 100644
--- a/python/mxnet/_ctypes/ndarray.py
+++ b/python/mxnet/_ctypes/ndarray.py
@@ -105,14 +105,28 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
 class CachedOp(object):
     """Cached operator handle."""
     __slots__ = ["handle"]
-    def __init__(self, sym, flags=()):
+    def __init__(self, sym, flags=(), inputs=None, params=None):
         self.handle = CachedOpHandle()
+        param_names = []
+        param_arrays = []
+        if inputs is None:
+            assert params is None, "When inputs is None, params must also be None."
+            inputs = sym.list_inputs()
+        elif params is not None:
+            for name, arrs in params.items():
+                param_arrays.extend(arrs)
+                param_names.extend([name] * len(arrs))
 
         check_call(_LIB.MXCreateCachedOpEx(
             sym.handle,
             len(flags),
             c_str_array([key for key, _ in flags]),
             c_str_array([str(val) for _, val in flags]),
+            len(inputs),
+            c_str_array(inputs),
+            len(param_names),
+            c_str_array(param_names),
+            c_handle_array(param_arrays),
             ctypes.byref(self.handle)))
 
     def __del__(self):
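
A sketch of the extended constructor, assuming a symbol whose 'weight' input is bound as a parameter so that only 'data' remains a runtime argument (all names and shapes are illustrative; mx.nd.CachedOp is the same class used by block.py below):

    import mxnet as mx

    data = mx.sym.var('data')
    weight = mx.sym.var('weight')
    fc = mx.sym.FullyConnected(data, weight=weight, no_bias=True, num_hidden=2)

    # params maps a parameter name to one NDArray per context
    params = {'weight': [mx.nd.ones((2, 4))]}
    op = mx.nd.CachedOp(fc, flags=(), inputs=['data'], params=params)

    out = op(mx.nd.ones((1, 4)))
    print(out.asnumpy())  # [[4. 4.]]
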
diff --git a/python/mxnet/contrib/onnx/_import/import_helper.py b/python/mxnet/contrib/onnx/_import/import_helper.py
index c8d45216729..3dfff3ed681 100644
--- a/python/mxnet/contrib/onnx/_import/import_helper.py
+++ b/python/mxnet/contrib/onnx/_import/import_helper.py
@@ -32,6 +32,10 @@
 from .op_translations import reduce_max, reduce_mean, reduce_min, reduce_sum
 from .op_translations import reduce_prod, avg_pooling, max_pooling
 from .op_translations import argmax, argmin, maximum, minimum
+from .op_translations import clip, reduce_log_sum, reduce_log_sum_exp
+from .op_translations import reduce_sum_square, reduce_l2, max_roi_pooling, instance_norm
+from .op_translations import log_softmax, softsign, lesser, greater, equal
+from .op_translations import logical_and, logical_or, logical_xor, logical_not
 
 # convert_map defines maps of ONNX operator names to converter functor(callable)
 # defined in the op_translations module.
@@ -102,6 +106,22 @@
     # Sorting and Searching
     'ArgMax'            : argmax,
     'ArgMin'            : argmin,
-    'Max'               : maximum, #elemwise maximum
-    'Min'               : minimum #elemwise minimum
+    'Max'               : maximum,
+    'Min'               : minimum,
+    'Clip'              : clip,
+    'ReduceLogSum'      : reduce_log_sum,
+    'ReduceLogSumExp'   : reduce_log_sum_exp,
+    'ReduceSumSquare'   : reduce_sum_square,
+    'ReduceL2'          : reduce_l2,
+    'MaxRoiPool'        : max_roi_pooling,
+    'InstanceNormalization' : instance_norm,
+    'LogSoftmax'        : log_softmax,
+    'Softsign'          : softsign,
+    'Less'              : lesser,
+    'Greater'           : greater,
+    'Equal'             : equal,
+    'And'               : logical_and,
+    'Xor'               : logical_xor,
+    'Not'               : logical_not,
+    'Or'                : logical_or
 }
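
With these entries registered, ONNX graphs that use the newly supported operators resolve through convert_map during import; a hedged sketch using the existing contrib API ('model.onnx' is a placeholder path):

    import mxnet.contrib.onnx as onnx_mxnet

    # 'model.onnx' is illustrative; any graph using Clip, LogSoftmax,
    # And/Or/Xor/Not, MaxRoiPool, etc. now maps via convert_map above.
    sym, arg_params, aux_params = onnx_mxnet.import_model('model.onnx')
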
diff --git a/python/mxnet/contrib/onnx/_import/op_translations.py b/python/mxnet/contrib/onnx/_import/op_translations.py
index e02cb0c2b62..0fad0080bef 100644
--- a/python/mxnet/contrib/onnx/_import/op_translations.py
+++ b/python/mxnet/contrib/onnx/_import/op_translations.py
@@ -18,6 +18,7 @@
 # coding: utf-8
 """ Module for translating ONNX operators into Mxnet operatoes"""
 # pylint: disable=unused-argument,protected-access
+import numpy as np
 from . import translation_utils
 from .... import symbol
 
@@ -80,6 +81,22 @@ def divide(attrs, inputs, proto_obj):
         return op_value, new_attr, inputs
     return 'broadcast_div', new_attr, inputs
 
+def logical_and(attrs, inputs, proto_obj):
+    """Logical and of two input arrays."""
+    return 'broadcast_logical_and', attrs, inputs
+
+def logical_or(attrs, inputs, proto_obj):
+    """Logical or of two input arrays."""
+    return 'broadcast_logical_or', attrs, inputs
+
+def logical_xor(attrs, inputs, proto_obj):
+    """Logical xor of two input arrays."""
+    return 'broadcast_logical_xor', attrs, inputs
+
+def logical_not(attrs, inputs, proto_obj):
+    """Logical not of two input arrays."""
+    return 'logical_not', attrs, inputs
+
 def absolute(attrs, inputs, proto_obj):
     """Returns element-wise absolute value of the input."""
     return 'abs', attrs, inputs
@@ -97,7 +114,6 @@ def argmax(attrs, inputs, proto_obj):
     """Returns indices of the maximum values along an axis"""
     return 'argmax', attrs, inputs
 
-
 def argmin(attrs, inputs, proto_obj):
     """Returns indices of the minimum values along an axis."""
     return 'argmin', attrs, inputs
@@ -130,6 +146,18 @@ def minimum(attrs, inputs, proto_obj):
         mxnet_op = inputs[0]
     return mxnet_op, attrs, inputs
 
+def lesser(attrs, inputs, proto_obj):
+    """Logical Lesser operator with broadcasting."""
+    return 'broadcast_lesser', attrs, inputs
+
+def greater(attrs, inputs, proto_obj):
+    """Logical Greater operator with broadcasting."""
+    return 'broadcast_greater', attrs, inputs
+
+def equal(attrs, inputs, proto_obj):
+    """Logical Equal operator with broadcasting."""
+    return 'broadcast_equal', attrs, inputs
+
 #Hyperbolic functions
 def tanh(attrs, inputs, proto_obj):
     """Returns the hyperbolic tangent of the input array."""
@@ -151,6 +179,10 @@ def concat(attrs, inputs, proto_obj):
     return 'concat', new_attrs, inputs
 
 # Basic neural network functions
+def softsign(attrs, inputs, proto_obj):
+    """Computes softsign of x element-wise."""
+    return 'softsign', attrs, inputs
+
 def sigmoid(attrs, inputs, proto_obj):
     """Computes elementwise sigmoid of the input array"""
     return 'sigmoid', attrs, inputs
@@ -183,6 +215,11 @@ def batch_norm(attrs, inputs, proto_obj):
     new_attrs['fix_gamma'] = not attrs.get('is_test', 1)
     return 'BatchNorm', new_attrs, inputs
 
+def instance_norm(attrs, inputs, proto_obj):
+    """Instance Normalization."""
+    new_attrs = translation_utils._fix_attribute_names(attrs, {'epsilon' : 'eps'})
+    return 'InstanceNorm', new_attrs, inputs
+
 def leaky_relu(attrs, inputs, proto_obj):
     """Leaky Relu function"""
     if 'alpha' in attrs:
@@ -211,6 +248,16 @@ def softmax(attrs, inputs, proto_obj):
         attrs = translation_utils._add_extra_attributes(attrs, {'axis': 1})
     return 'softmax', attrs, inputs
 
+def log_softmax(attrs, inputs, proto_obj):
+    """Computes the log softmax of the input. This is equivalent to
+    computing softmax followed by log."""
+    return 'log_softmax', attrs, inputs
+
+def softplus(attrs, inputs, proto_obj):
+    """Applies the sofplus activation function element-wise to the input."""
+    new_attrs = translation_utils._add_extra_attributes(attrs, {'act_type' : 'softrelu'})
+    return 'Activation', new_attrs, inputs
+
 def conv(attrs, inputs, proto_obj):
     """Compute N-D convolution on (N+2)-D input."""
     new_attrs = translation_utils._fix_attribute_names(attrs, {'kernel_shape' : 'kernel',
@@ -389,15 +436,9 @@ def transpose(attrs, inputs, proto_obj):
 
 def squeeze(attrs, inputs, proto_obj):
     """Remove single-dimensional entries from the shape of a tensor."""
-    # MXNet doesnt have a squeeze operator.
-    # Using "split" to perform similar operation.
     new_attrs = translation_utils._fix_attribute_names(attrs,
                                                        {'axes' : 'axis'})
-    axes = new_attrs.get('axis')
-    mxnet_op = symbol.split(inputs[0], axis=axes[0], num_outputs=1, squeeze_axis=1)
-    for i in axes[1:]:
-        mxnet_op = symbol.split(mxnet_op, axis=i-1, num_outputs=1, squeeze_axis=1)
-    return mxnet_op, new_attrs, inputs
+    return 'squeeze', new_attrs, inputs
 
 def unsqueeze(attrs, inputs, cls):
     """Inserts a new axis of size 1 into the array shape"""
@@ -417,6 +458,16 @@ def flatten(attrs, inputs, proto_obj):
     new_attrs = translation_utils._remove_attributes(attrs, ['axis'])
     return 'Flatten', new_attrs, inputs
 
+def clip(attrs, inputs, proto_obj):
+    """Clips (limits) the values in an array."""
+    new_attrs = translation_utils._fix_attribute_names(attrs, {'min' : 'a_min',
+                                                               'max' : 'a_max'})
+    if 'a_max' not in new_attrs:
+        new_attrs = translation_utils._add_extra_attributes(new_attrs, {'a_max' : np.inf})
+    if 'a_min' not in new_attrs:
+        new_attrs = translation_utils._add_extra_attributes(new_attrs, {'a_min' : -np.inf})
+    return 'clip', new_attrs, inputs
+
 #Powers
 def reciprocal(attrs, inputs, proto_obj):
     """Returns the reciprocal of the argument, element-wise."""
@@ -454,20 +505,49 @@ def reduce_mean(attrs, inputs, proto_obj):
     return 'mean', new_attrs, inputs
 
 def reduce_min(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by mean value"""
+    """Reduce the array along a given axis by minimum value"""
     new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
     return 'min', new_attrs, inputs
 
 def reduce_sum(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by mean value"""
+    """Reduce the array along a given axis by sum value"""
     new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
     return 'sum', new_attrs, inputs
 
 def reduce_prod(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by mean value"""
+    """Reduce the array along a given axis by product value"""
     new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
     return 'prod', new_attrs, inputs
 
+def reduce_log_sum(attrs, inputs, proto_obj):
+    """Reduce the array along a given axis by log sum value"""
+    keep_dims = True if 'keepdims' not in attrs else attrs.get('keepdims')
+    sum_op = symbol.sum(inputs[0], axis=attrs.get('axes'),
+                        keepdims=keep_dims)
+    log_sym = symbol.log(sum_op)
+    return log_sym, attrs, inputs
+
+def reduce_log_sum_exp(attrs, inputs, proto_obj):
+    """Reduce the array along a given axis by log sum exp value"""
+    keep_dims = True if 'keepdims' not in attrs else attrs.get('keepdims')
+    exp_op = symbol.exp(inputs[0])
+    sum_op = symbol.sum(exp_op, axis=attrs.get('axes'),
+                        keepdims=keep_dims)
+    log_sym = symbol.log(sum_op)
+    return log_sym, attrs, inputs
+
+def reduce_sum_square(attrs, inputs, proto_obj):
+    """Reduce the array along a given axis by sum square value"""
+    square_op = symbol.square(inputs[0])
+    sum_op = symbol.sum(square_op, axis=attrs.get('axes'),
+                        keepdims=attrs.get('keepdims'))
+    return sum_op, attrs, inputs
+
+def reduce_l2(attrs, inputs, proto_obj):
+    """Reduce input tensor by l2 normalization."""
+    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
+    return 'norm', new_attrs, inputs
+
 def avg_pooling(attrs, inputs, proto_obj):
     """ Average pooling"""
     new_attrs = translation_utils._fix_attribute_names(attrs,
@@ -497,3 +577,11 @@ def max_pooling(attrs, inputs, proto_obj):
     new_op = translation_utils._fix_pooling('max', inputs, new_attrs)
 
     return new_op, new_attrs, inputs
+
+def max_roi_pooling(attrs, inputs, proto_obj):
+    """Max ROI Pooling."""
+    new_attrs = translation_utils._fix_attribute_names(attrs,
+                                                       {'pooled_shape': 'pooled_size',
+                                                        'spatial_scale': 'spatial_scale'
+                                                       })
+    return 'ROIPooling', new_attrs, inputs
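
The ReduceLogSumExp translation above composes exp, sum and log rather than mapping to a single operator; a quick numeric sanity check of that composition:

    import numpy as np
    import mxnet as mx

    x = mx.nd.array([[1.0, 2.0], [3.0, 4.0]])
    composed = mx.nd.log(mx.nd.sum(mx.nd.exp(x), axis=1, keepdims=True))
    reference = np.log(np.exp(x.asnumpy()).sum(axis=1, keepdims=True))
    print(np.allclose(composed.asnumpy(), reference))  # True
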
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index c9c58a9c9ba..1314b97028a 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -72,7 +72,8 @@ def _quantize_params(qsym, params):
     return quantized_params
 
 
-def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
+def _quantize_symbol(sym, excluded_symbols=None, offline_params=None,
+                     quantized_dtype='int8'):
     """Given a symbol object representing a neural network of data type FP32,
     quantize it into an INT8 network.
 
@@ -86,6 +87,8 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
         Names of the parameters that users want to quantize offline. It's always recommended to
         quantize parameters offline so that quantizing parameters during the inference can be
         avoided.
+    quantized_dtype: str
+        The quantized destination type for input data.
     """
     num_excluded_symbols = 0
     excluded_handles = []
@@ -108,7 +111,8 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
                                      mx_uint(num_excluded_symbols),
                                      c_array(SymbolHandle, excluded_handles),
                                      mx_uint(num_offline),
-                                     c_array(ctypes.c_char_p, offline)))
+                                     c_array(ctypes.c_char_p, offline),
+                                     c_str(quantized_dtype)))
     return Symbol(out)
 
 
@@ -401,7 +405,8 @@ def _load_params(params, logger=logging):
 def quantize_model(sym, arg_params, aux_params,
                    data_names=('data',), label_names=('softmax_label',),
                    ctx=cpu(), excluded_sym_names=None, calib_mode='entropy',
-                   calib_data=None, num_calib_examples=None, calib_layer=None, logger=logging):
+                   calib_data=None, num_calib_examples=None, calib_layer=None,
+                   quantized_dtype='int8', logger=logging):
     """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please do not run
     inference using the quantized models on Windows for now.
@@ -451,6 +456,9 @@ def quantize_model(sym, arg_params, aux_params,
         calibrate this layer. If yes, the statistics of the layer's output will be collected;
         otherwise, no information of the layer's output will be collected. If not provided,
         all the layers' outputs that need requantization will be collected.
+    quantized_dtype : str
+        The quantized destination type for input data. Currently support 'int8'
+        and 'uint8', default value is 'int8'.
     logger : Object
         A logging object for printing information during the process of quantization.
 
@@ -473,8 +481,13 @@ def quantize_model(sym, arg_params, aux_params,
             idx = nodes.list_outputs().index(sym_name + '_output')
             excluded_syms.append(nodes[idx])
     logger.info('Quantizing symbol')
+
+    if quantized_dtype != 'int8' and quantized_dtype != 'uint8':
+        raise ValueError('unknown quantized_dtype %s received,'
+                         ' expected `int8` or `uint8`' % quantized_dtype)
     qsym = _quantize_symbol(sym, excluded_symbols=excluded_syms,
-                            offline_params=list(arg_params.keys()))
+                            offline_params=list(arg_params.keys()),
+                            quantized_dtype=quantized_dtype)
 
     logger.info('Quantizing parameters')
     qarg_params = _quantize_params(qsym, arg_params)
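
A sketch of the extended user-level call, assuming sym, arg_params and aux_params come from a previously loaded FP32 checkpoint (the excluded names follow the CPU example script earlier in this diff):

    import mxnet as mx
    from mxnet.contrib.quantization import quantize_model

    # sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-152', 0)
    qsym, qarg_params, aux_params = quantize_model(
        sym=sym, arg_params=arg_params, aux_params=aux_params,
        ctx=mx.cpu(0), excluded_sym_names=['flatten0', 'fc1'],
        calib_mode='none', quantized_dtype='uint8')
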
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index 7406a5d6c75..3b97c0578ca 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-# pylint: disable= arguments-differ
+# pylint: disable= arguments-differ, too-many-lines
 """Base container class for all neural network models."""
 __all__ = ['Block', 'HybridBlock', 'SymbolBlock']
 
@@ -149,7 +149,8 @@ def forward(self, x):
 
 
     Child :py:class:`Block` assigned this way will be registered and :py:meth:`collect_params`
-    will collect their Parameters recursively.
+    will collect their Parameters recursively. You can also manually register
+    child blocks with :py:meth:`register_child`.
 
     Parameters
     ----------
@@ -268,12 +269,12 @@ def collect_params(self, select=None):
         children's Parameters(default), also can returns the select :py:class:`ParameterDict`
         which match some given regular expressions.
 
-        For example, collect the specified parameter in ['conv1_weight', 'conv1_bias', 'fc_weight',
+        For example, collect the specified parameters in ['conv1_weight', 'conv1_bias', 'fc_weight',
         'fc_bias']::
 
             model.collect_params('conv1_weight|conv1_bias|fc_weight|fc_bias')
 
-        or collect all paramters which their name ends with 'weight' or 'bias', this can be done
+        or collect all parameters whose names end with 'weight' or 'bias', this can be done
         using regular expressions::
 
             model.collect_params('.*weight|.*bias')
@@ -307,9 +308,23 @@ def _collect_params_with_prefix(self, prefix=''):
             ret.update(child._collect_params_with_prefix(prefix + name))
         return ret
 
-    def save_params(self, filename):
+    def save_parameters(self, filename):
         """Save parameters to file.
+        This function is to be used to save parameters of a Gluon model; note that
+        the saved parameters are not meant to be loaded in a different language binding for now.
+        Saving parameters using `.save_parameters()` is different from
+        `.collect_params().save()` and `.save_params()`, which are deprecated ways
+        to save the parameters of a model and should be avoided.
+
+        If your model is hybridizable and you want to export a serialized version of the
+        structure of the model as well as its parameters, please refer to
+        :py:meth:`HybridBlock.export`. Such a model can then be loaded back in any language binding
+        or even in Gluon using a :py:class:`SymbolBlock`.
+        Refer to this tutorial for a complete overview of saving/loading models with
+        MXNet: https://mxnet.incubator.apache.org/tutorials/gluon/save_load_params.html
 
+        Parameters
+        ----------
         filename : str
             Path to file.
         """
@@ -317,14 +332,35 @@ def save_params(self, filename):
         arg_dict = {key : val._reduce() for key, val in params.items()}
         ndarray.save(filename, arg_dict)
 
-    def load_params(self, filename, ctx=None, allow_missing=False,
-                    ignore_extra=False):
+    def save_params(self, filename):
+        """[Deprecated] Please use save_parameters.
+
+        Save parameters to file.
+
+        filename : str
+            Path to file.
+        """
+        warnings.warn("save_params is deprecated. Please use save_parameters.")
+        try:
+            self.collect_params().save(filename, strip_prefix=self.prefix)
+        except ValueError as e:
+            raise ValueError('%s\nsave_params is deprecated. Using ' \
+                             'save_parameters may resolve this error.' % str(e))
+
+    def load_parameters(self, filename, ctx=None, allow_missing=False,
+                        ignore_extra=False):
         """Load parameters from file.
+        This function is to be used to load parameters of a Gluon model that were
+        saved using the `.save_parameters()` function. Any other use is undefined behaviour.
+        Refer to this tutorial for a complete overview of saving/loading models with
+        MXNet: https://mxnet.incubator.apache.org/tutorials/gluon/save_load_params.html
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         ctx : Context or list of Context, default cpu()
-            Context(s) initialize loaded parameters on.
+            Context(s) to initialize loaded parameters on.
         allow_missing : bool, default False
             Whether to silently skip loading parameters not present in the file.
         ignore_extra : bool, default False
@@ -358,6 +394,25 @@ def load_params(self, filename, ctx=None, allow_missing=False,
             if name in params:
                 params[name]._load_init(loaded[name], ctx)
 
+    def load_params(self, filename, ctx=None, allow_missing=False,
+                    ignore_extra=False):
+        """[Deprecated] Please use load_parameters.
+
+        Load parameters from file.
+
+        filename : str
+            Path to parameter file.
+        ctx : Context or list of Context, default cpu()
+            Context(s) to initialize loaded parameters on.
+        allow_missing : bool, default False
+            Whether to silently skip loading parameters not present in the file.
+        ignore_extra : bool, default False
+            Whether to silently ignore parameters from the file that are not
+            present in this Block.
+        """
+        warnings.warn("load_params is deprecated. Please use load_parameters.")
+        self.load_parameters(filename, ctx, allow_missing, ignore_extra)
+
     def register_child(self, block, name=None):
         """Registers block as a child of self. :py:class:`Block` s assigned to self as
         attributes will be registered automatically."""
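
A round-trip sketch with the new method names (the network is illustrative; the prefix-stripped names written by save_parameters load into any block with the same structure):

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(2)
    net.initialize()
    net(mx.nd.ones((1, 4)))              # trigger deferred parameter init
    net.save_parameters('dense.params')

    net2 = nn.Dense(2)
    net2.load_parameters('dense.params', ctx=mx.cpu(0))
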
@@ -447,16 +502,8 @@ def hybridize(self, active=True, **kwargs):
         ----------
         active : bool, default True
             Whether to turn hybrid on or off.
-        static_alloc : bool, default False
-            Statically allocate memory to improve speed. Memory usage may increase.
-        static_shape : bool, default False
-            Optimize for invariant input shapes between iterations. Must also
-            set static_alloc to True. Change of input shapes is still allowed
-            but slower.
-        forward_bulk_size : int, default 15
-            Segment size of bulk execution during forward pass.
-        backward_bulk_size : int, default 15
-            Segment size of bulk execution during backward pass.
+        **kwargs : string
+            Additional flags for the hybridized operator.
         """
         for cld in self._children.values():
             cld.hybridize(active, **kwargs)
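
Under this change the kwargs are forwarded verbatim as CachedOp flags and parsed by CachedOpConfig (declared in include/mxnet/imperative.h earlier in this diff); a sketch using its three fields:

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(10)
    net.initialize()
    # flag names come from CachedOpConfig: inline_limit,
    # forward_bulk_size, backward_bulk_size
    net.hybridize(inline_limit=2, forward_bulk_size=15, backward_bulk_size=15)
    net(mx.nd.ones((1, 4)))
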
@@ -601,9 +648,31 @@ def _summary_hook(block, _, outputs):
 class HybridBlock(Block):
     """`HybridBlock` supports forwarding with both Symbol and NDArray.
 
+    `HybridBlock` is similar to `Block`, with a few differences::
+
+        import mxnet as mx
+        from mxnet.gluon import HybridBlock, nn
+
+        class Model(HybridBlock):
+            def __init__(self, **kwargs):
+                super(Model, self).__init__(**kwargs)
+                # use name_scope to give child Blocks appropriate names.
+                with self.name_scope():
+                    self.dense0 = nn.Dense(20)
+                    self.dense1 = nn.Dense(20)
+
+            def hybrid_forward(self, F, x):
+                x = F.relu(self.dense0(x))
+                return F.relu(self.dense1(x))
+
+        model = Model()
+        model.initialize(ctx=mx.cpu(0))
+        model.hybridize()
+        model(mx.nd.zeros((10, 10), ctx=mx.cpu(0)))
+
     Forward computation in :py:class:`HybridBlock` must be static to work with :py:class:`Symbol` s,
     i.e. you cannot call :py:meth:`NDArray.asnumpy`, :py:attr:`NDArray.shape`,
-    :py:attr:`NDArray.dtype`, etc on tensors.
+    :py:attr:`NDArray.dtype`, `NDArray` indexing (`x[i]`), etc. on tensors.
     Also, you cannot use branching or loop logic that depends on non-constant
     expressions like random numbers or intermediate results, since they change
     the graph structure for each iteration.
@@ -613,9 +682,12 @@ class HybridBlock(Block):
     representing the forward computation and cache it. On subsequent forwards,
     the cached graph will be used instead of :py:meth:`hybrid_forward`.
 
-    Refer `Hybrid tutorial <http://mxnet.io/tutorials/gluon/hybrid.html>`_ to see
-    the end-to-end usage.
+    Please see references for detailed tutorial.
 
+    References
+    ----------
+        `Hybrid - Faster training and easy deployment
+        <http://mxnet.io/tutorials/gluon/hybrid.html>`_
     """
     def __init__(self, prefix=None, params=None):
         super(HybridBlock, self).__init__(prefix=prefix, params=params)
@@ -624,7 +696,7 @@ def __init__(self, prefix=None, params=None):
         self._out_format = None
         self._in_format = None
         self._active = False
-        self._flags = []
+        self._flags = {}
 
     def __setattr__(self, name, value):
         """Registers parameters."""
@@ -651,43 +723,39 @@ def _get_graph(self, *args):
         return self._cached_graph
 
     def _build_cache(self, *args):
-        data, out = self._get_graph(*args)
-        data_names = {data.name : i for i, data in enumerate(data)}
-        params = self.collect_params()
-        input_names = out.list_inputs()
+        inputs, out = self._get_graph(*args)
+        input_names = [i.name for i in inputs]
 
+        params = self.collect_params()
         param_names = set(params.keys())
-        expected_names = set(input_names)
+        expected_names = set(out.list_inputs())
         for name in expected_names:
-            assert name in param_names or name in data_names, \
+            assert name in param_names or name in input_names, \
                 "Unknown input to HybridBlock: %s"%name
 
-        used_data_names = [i for i in data_names if i in expected_names]
-        if len(used_data_names) != len(data_names):
-            unused = ', '.join(['%d-th'%i for name, i in data_names.items()
+        used_input_names = [i for i in input_names if i in expected_names]
+        if len(used_input_names) != len(input_names):
+            unused = ', '.join(['%d-th'%i for i, name in enumerate(input_names)
                                 if name not in expected_names])
             warnings.warn("The %s input to HybridBlock is not used by any "
                           "computation. Is this intended?"%unused, stacklevel=4)
 
-        used_param_names = [i for i in param_names if i in expected_names]
+        used_param_names = set(i for i in param_names if i in expected_names)
         if len(used_param_names) != len(param_names):
-            unused = ', '.join(list(param_names - set(used_param_names)))
+            unused = ', '.join(list(param_names - used_param_names))
             warnings.warn("Parameter %s is not used by any computation. "
                           "Is this intended?"%unused, stacklevel=4)
 
-        data_indices = []
-        param_indices = []
-        self._cached_op_args = []
-        for i, name in enumerate(input_names):
-            if name in data_names:
-                data_indices.append(i)
-                self._cached_op_args.append((True, data_names[name]))
-            else:
-                param_indices.append(i)
-                self._cached_op_args.append((False, params[name]))
-        flags = [('data_indices', data_indices), ('param_indices', param_indices)] + \
-                self._flags
-        self._cached_op = ndarray.CachedOp(out, flags)
+        used_params = {k: params[k] for k in used_param_names}
+        try:
+            param_dict = {k: v.list_data() for k, v in used_params.items()}
+        except DeferredInitializationError:
+            self._deferred_infer_shape(*args)
+            for i in used_params.values():
+                i._finish_deferred_init()
+            param_dict = {k: v.list_data() for k, v in used_params.items()}
+
+        self._cached_op = ndarray.CachedOp(out, self._flags, input_names, param_dict)
 
     def _deferred_infer_shape(self, *args):
         try:
@@ -703,19 +771,7 @@ def _call_cached_op(self, *args):
 
         args, fmt = _flatten(args, "input")
         assert fmt == self._in_format, "Invalid input format"
-        try:
-            cargs = [args[i] if is_arg else i.data()
-                     for is_arg, i in self._cached_op_args]
-        except DeferredInitializationError:
-            self._deferred_infer_shape(*args)
-            cargs = []
-            for is_arg, i in self._cached_op_args:
-                if is_arg:
-                    cargs.append(args[i])
-                else:
-                    i._finish_deferred_init()
-                    cargs.append(i.data())
-        out = self._cached_op(*cargs)
+        out = self._cached_op(*args)
         if isinstance(out, NDArray):
             out = [out]
         return _regroup(out, self._out_format)[0]
@@ -736,7 +792,7 @@ def register_child(self, block, name=None):
 
     def hybridize(self, active=True, **kwargs):
         self._active = active
-        self._flags = list(kwargs.items())
+        self._flags = kwargs.items()
         self._clear_cached_op()
         if active and self._forward_hooks or self._forward_pre_hooks:
             warnings.warn('"{}" is being hybridized while still having forward hook/pre-hook. '
@@ -771,8 +827,8 @@ def infer_type(self, *args):
         self._infer_attrs('infer_type', 'dtype', *args)
 
     def export(self, path, epoch=0):
-        """Export HybridBlock to json format that can be loaded by `mxnet.mod.Module`
-        or the C++ interface.
+        """Export HybridBlock to json format that can be loaded by
+        `SymbolBlock.imports`, `mxnet.mod.Module` or the C++ interface.
 
     .. note:: When there is only one input, it will be named `data`. When there
               are more than one input, they will be named `data0`, `data1`, etc.
@@ -886,6 +942,50 @@ class SymbolBlock(HybridBlock):
     >>> x = mx.nd.random.normal(shape=(16, 3, 224, 224))
     >>> print(feat_model(x))
     """
+    @staticmethod
+    def imports(symbol_file, input_names, param_file=None, ctx=None):
+        """Import model previously saved by `HybridBlock.export` or
+        `Module.save_checkpoint` as a SymbolBlock for use in Gluon.
+
+        Parameters
+        ----------
+        symbol_file : str
+            Path to symbol file.
+        input_names : list of str
+            List of input variable names
+        param_file : str, optional
+            Path to parameter file.
+        ctx : Context, default None
+            The context to initialize SymbolBlock on.
+
+        Returns
+        -------
+        SymbolBlock
+            SymbolBlock loaded from symbol and parameter files.
+
+        Examples
+        --------
+        >>> net1 = gluon.model_zoo.vision.resnet18_v1(
+        ...     prefix='resnet', pretrained=True)
+        >>> net1.hybridize()
+        >>> x = mx.nd.random.normal(shape=(1, 3, 32, 32))
+        >>> out1 = net1(x)
+        >>> net1.export('net1', epoch=1)
+        >>>
+        >>> net2 = gluon.SymbolBlock.imports(
+        ...     'net1-symbol.json', ['data'], 'net1-0001.params')
+        >>> out2 = net2(x)
+        """
+        sym = symbol.load(symbol_file)
+        if isinstance(input_names, str):
+            input_names = [input_names]
+        inputs = [symbol.var(i) for i in input_names]
+        ret = SymbolBlock(sym, inputs)
+        if param_file is not None:
+            ret.collect_params().load(param_file, ctx=ctx)
+        return ret
+
+
     def __init__(self, outputs, inputs, params=None):
         super(SymbolBlock, self).__init__(prefix=None, params=None)
         self._prefix = ''
diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py
index 7ec1c32d5e3..2e35a404b00 100644
--- a/python/mxnet/gluon/data/vision/transforms.py
+++ b/python/mxnet/gluon/data/vision/transforms.py
@@ -196,7 +196,7 @@ class RandomResizedCrop(Block):
         - **out**: output tensor with (H x W x C) shape.
     """
     def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0),
-                 interpolation=2):
+                 interpolation=1):
         super(RandomResizedCrop, self).__init__()
         if isinstance(size, numeric_types):
             size = (size, size)
@@ -233,7 +233,7 @@ class CenterCrop(Block):
     >>> transformer(image)
     <NDArray 500x1000x3 @cpu(0)>
     """
-    def __init__(self, size, interpolation=2):
+    def __init__(self, size, interpolation=1):
         super(CenterCrop, self).__init__()
         if isinstance(size, numeric_types):
             size = (size, size)
@@ -250,6 +250,9 @@ class Resize(Block):
     ----------
     size : int or tuple of (W, H)
         Size of output image.
+    keep_ratio : bool
+        Whether to resize the short edge or both edges to `size`,
+        if size is given as an integer.
     interpolation : int
         Interpolation method for resizing. By default uses bilinear
         interpolation. See OpenCV's resize function for available choices.
@@ -268,14 +271,28 @@ class Resize(Block):
     >>> transformer(image)
     <NDArray 500x1000x3 @cpu(0)>
     """
-    def __init__(self, size, interpolation=2):
+    def __init__(self, size, keep_ratio=False, interpolation=1):
         super(Resize, self).__init__()
-        if isinstance(size, numeric_types):
-            size = (size, size)
-        self._args = tuple(size) + (interpolation,)
+        self._keep = keep_ratio
+        self._size = size
+        self._interpolation = interpolation
 
     def forward(self, x):
-        return image.imresize(x, *self._args)
+        if isinstance(self._size, numeric_types):
+            if not self._keep:
+                wsize = self._size
+                hsize = self._size
+            else:
+                h, w, _ = x.shape
+                if h > w:
+                    wsize = self._size
+                    hsize = int(h * wsize / w)
+                else:
+                    hsize = self._size
+                    wsize = int(w * hsize / h)
+        else:
+            wsize, hsize = self._size
+        return image.imresize(x, wsize, hsize, self._interpolation)
 
 
 class RandomFlipLeftRight(HybridBlock):
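
A sketch of the new keep_ratio behavior (the input shape is illustrative):

    import mxnet as mx
    from mxnet.gluon.data.vision import transforms

    img = mx.nd.zeros((200, 400, 3), dtype='uint8')  # H x W x C

    print(transforms.Resize(100)(img).shape)
    # (100, 100, 3): both edges forced to size
    print(transforms.Resize(100, keep_ratio=True)(img).shape)
    # (100, 200, 3): short edge -> 100, aspect ratio preserved
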
diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py
index 55499470460..fdb006258c2 100644
--- a/python/mxnet/gluon/model_zoo/vision/alexnet.py
+++ b/python/mxnet/gluon/model_zoo/vision/alexnet.py
@@ -83,5 +83,5 @@ def alexnet(pretrained=False, ctx=cpu(),
     net = AlexNet(**kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('alexnet', root=root), ctx=ctx)
+        net.load_parameters(get_model_file('alexnet', root=root), ctx=ctx)
     return net
diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py
index 835336739a6..b03f5ce8d52 100644
--- a/python/mxnet/gluon/model_zoo/vision/densenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/densenet.py
@@ -141,7 +141,7 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(),
     net = DenseNet(num_init_features, growth_rate, block_config, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('densenet%d'%(num_layers), root=root), ctx=ctx)
+        net.load_parameters(get_model_file('densenet%d'%(num_layers), root=root), ctx=ctx)
     return net
 
 def densenet121(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py
index 6d75050b83f..7c54691f1b5 100644
--- a/python/mxnet/gluon/model_zoo/vision/inception.py
+++ b/python/mxnet/gluon/model_zoo/vision/inception.py
@@ -216,5 +216,5 @@ def inception_v3(pretrained=False, ctx=cpu(),
     net = Inception3(**kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('inceptionv3', root=root), ctx=ctx)
+        net.load_parameters(get_model_file('inceptionv3', root=root), ctx=ctx)
     return net
diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py
index 5b4c9a8e615..1a2c9b94619 100644
--- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py
@@ -213,7 +213,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(),
         version_suffix = '{0:.2f}'.format(multiplier)
         if version_suffix in ('1.00', '0.50'):
             version_suffix = version_suffix[:-1]
-        net.load_params(
+        net.load_parameters(
             get_model_file('mobilenet%s' % version_suffix, root=root), ctx=ctx)
     return net
 
@@ -245,7 +245,7 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(),
         version_suffix = '{0:.2f}'.format(multiplier)
         if version_suffix in ('1.00', '0.50'):
             version_suffix = version_suffix[:-1]
-        net.load_params(
+        net.load_parameters(
             get_model_file('mobilenetv2_%s' % version_suffix, root=root), ctx=ctx)
     return net
 
diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py
index 5ee67b510a8..da279b89583 100644
--- a/python/mxnet/gluon/model_zoo/vision/resnet.py
+++ b/python/mxnet/gluon/model_zoo/vision/resnet.py
@@ -386,8 +386,8 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(),
     net = resnet_class(block_class, layers, channels, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('resnet%d_v%d'%(num_layers, version),
-                                       root=root), ctx=ctx)
+        net.load_parameters(get_model_file('resnet%d_v%d'%(num_layers, version),
+                                           root=root), ctx=ctx)
     return net
 
 def resnet18_v1(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py
index 09f62a52074..aaff4c36dfa 100644
--- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py
+++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py
@@ -132,7 +132,7 @@ def get_squeezenet(version, pretrained=False, ctx=cpu(),
     net = SqueezeNet(version, **kwargs)
     if pretrained:
         from ..model_store import get_model_file
-        net.load_params(get_model_file('squeezenet%s'%version, root=root), ctx=ctx)
+        net.load_parameters(get_model_file('squeezenet%s'%version, root=root), ctx=ctx)
     return net
 
 def squeezenet1_0(**kwargs):
diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py
index dbae5385898..a3b1685b413 100644
--- a/python/mxnet/gluon/model_zoo/vision/vgg.py
+++ b/python/mxnet/gluon/model_zoo/vision/vgg.py
@@ -114,8 +114,8 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(),
     if pretrained:
         from ..model_store import get_model_file
         batch_norm_suffix = '_bn' if kwargs.get('batch_norm') else ''
-        net.load_params(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix),
-                                       root=root), ctx=ctx)
+        net.load_parameters(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix),
+                                           root=root), ctx=ctx)
     return net
 
 def vgg11(**kwargs):
diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py
index 2fbf7d8786d..24f30270ad6 100644
--- a/python/mxnet/gluon/nn/conv_layers.py
+++ b/python/mxnet/gluon/nn/conv_layers.py
@@ -675,7 +675,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0),
 class _Pooling(HybridBlock):
     """Abstract class for different pooling layers."""
     def __init__(self, pool_size, strides, padding, ceil_mode, global_pool,
-                 pool_type, **kwargs):
+                 pool_type, count_include_pad=None, **kwargs):
         super(_Pooling, self).__init__(**kwargs)
         if strides is None:
             strides = pool_size
@@ -687,6 +687,8 @@ def __init__(self, pool_size, strides, padding, ceil_mode, global_pool,
             'kernel': pool_size, 'stride': strides, 'pad': padding,
             'global_pool': global_pool, 'pool_type': pool_type,
             'pooling_convention': 'full' if ceil_mode else 'valid'}
+        if count_include_pad is not None:
+            self._kwargs['count_include_pad'] = count_include_pad
 
     def _alias(self):
         return 'pool'
@@ -863,6 +865,8 @@ class AvgPool1D(_Pooling):
         respectively. padding is applied on 'W' dimension.
     ceil_mode : bool, default False
         When `True`, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', will exclude padding elements when computing the average value.
 
 
     Inputs:
@@ -879,13 +883,13 @@ class AvgPool1D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
-                 ceil_mode=False, **kwargs):
+                 ceil_mode=False, count_include_pad=True, **kwargs):
         assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)
         assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints"
         super(AvgPool1D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class AvgPool2D(_Pooling):
@@ -907,6 +911,8 @@ class AvgPool2D(_Pooling):
         dimensions respectively. padding is applied on 'H' and 'W' dimension.
     ceil_mode : bool, default False
         When True, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', will exclude padding elements when computing the average value.
 
 
     Inputs:
@@ -926,13 +932,13 @@ class AvgPool2D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=(2, 2), strides=None, padding=0,
-                 ceil_mode=False, layout='NCHW', **kwargs):
+                 ceil_mode=False, layout='NCHW', count_include_pad=True, **kwargs):
         assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*2
         assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints"
         super(AvgPool2D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class AvgPool3D(_Pooling):
@@ -955,6 +961,8 @@ class AvgPool3D(_Pooling):
         dimension.
     ceil_mode : bool, default False
         When True, will use ceil instead of floor to compute the output shape.
+    count_include_pad : bool, default True
+        When 'False', will exclude padding elements when computing the average value.
 
 
     Inputs:
@@ -975,13 +983,13 @@ class AvgPool3D(_Pooling):
           equation.
     """
     def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
-                 ceil_mode=False, layout='NCDHW', **kwargs):
+                 ceil_mode=False, layout='NCDHW', count_include_pad=True, **kwargs):
         assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*3
         assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints"
         super(AvgPool3D, self).__init__(
-            pool_size, strides, padding, ceil_mode, False, 'avg', **kwargs)
+            pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs)
 
 
 class GlobalMaxPool1D(_Pooling):
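
For reference, a minimal sketch of what the new `count_include_pad` flag changes, assuming the Gluon AvgPool2D signature from the hunks above (input values are illustrative):

    import mxnet as mx
    from mxnet.gluon import nn

    x = mx.nd.ones((1, 1, 4, 4))

    # With padding=1, border windows contain padded zeros. The default
    # (count_include_pad=True) divides by the full window size, while
    # count_include_pad=False divides only by the number of real elements.
    pool_incl = nn.AvgPool2D(pool_size=3, strides=1, padding=1)
    pool_excl = nn.AvgPool2D(pool_size=3, strides=1, padding=1,
                             count_include_pad=False)
    print(pool_incl(x)[0, 0, 0, 0].asscalar())  # 4/9: four ones in a 3x3 window
    print(pool_excl(x)[0, 0, 0, 0].asscalar())  # 1.0: padded zeros not counted
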
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index c0d89fbd4cc..0c6aae92135 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -310,14 +310,16 @@ def _init_grad(self):
                                 self._grad, self.grad_req)
 
     def _reduce(self):
-        """Reduce data from multiple context."""
+        """Reduce data from multiple contexts to CPU."""
+        ctx = context.cpu()
         if self._stype == 'default':
             block = self.list_data()
-            data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block)
+            data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block)
         else:
             # fetch all rows for 'row_sparse' param
-            all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=context.cpu())
-            data = self.row_sparse_data(all_row_ids)
+            all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx)
+            data = ndarray.zeros(self.shape, stype='row_sparse', ctx=ctx)
+            self._trainer._row_sparse_pull(self, data, all_row_ids)
         return data
 
     def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
@@ -391,6 +393,8 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
     def reset_ctx(self, ctx):
         """Re-assign Parameter to other contexts.
 
+        Parameters
+        ----------
         ctx : Context or list of Context, default ``context.current_context()``.
             Assign Parameter to given context. If ctx is a list of Context, a
             copy will be made for each context.
@@ -587,8 +591,8 @@ def __init__(self, **kwargs):
                 super(Block, self).__init__(**kwargs)
                 self.const = self.params.get_constant('const', [[1,2],[3,4]])
 
-    Parameter
-    ---------
+    Parameters
+    ----------
     name : str
         Name of the parameter.
     value : array-like
@@ -739,7 +743,7 @@ def get_constant(self, name, value=None):
         found, :py:func:`get` will create a new :py:class:`Constant` with key-word
         arguments and insert it to self.
 
-        Constants
+        Parameters
         ----------
         name : str
             Name of the desired Constant. It will be prepended with this dictionary's
@@ -814,6 +818,8 @@ def zero_grad(self):
     def reset_ctx(self, ctx):
         """Re-assign all Parameters to other contexts.
 
+        Parameters
+        ----------
         ctx : Context or list of Context, default :py:meth:`context.current_context()`.
             Assign Parameter to given context. If ctx is a list of Context, a
             copy will be made for each context.
@@ -846,6 +852,8 @@ def setattr(self, name, value):
     def save(self, filename, strip_prefix=''):
         """Save parameters to file.
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         strip_prefix : str, default ''
@@ -870,6 +878,8 @@ def load(self, filename, ctx=None, allow_missing=False,
              ignore_extra=False, restore_prefix=''):
         """Load parameters from file.
 
+        Parameters
+        ----------
         filename : str
             Path to parameter file.
         ctx : Context or list of Context
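
The `strip_prefix`/`restore_prefix` pair documented above lets parameters be saved under one name scope and loaded under another; a minimal sketch, assuming a plain Gluon block (the file name is illustrative):

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.Dense(10)
    net.initialize()
    # Drop the block's name scope on save so the file is scope-independent...
    net.collect_params().save('dense.params', strip_prefix=net.prefix)
    # ...and prepend it again on load.
    net.collect_params().load('dense.params', ctx=mx.cpu(),
                              restore_prefix=net.prefix)
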
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index ef20109021a..02d68f0c39c 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -152,7 +152,6 @@ def _reset_kvstore(self):
 
     def _init_kvstore(self):
         """Create kvstore."""
-        arg_arrays = {}
         config = self._kvstore_params
         if self._contains_sparse:
             kvstore, update_on_kvstore = _create_sparse_kvstore(config['kvstore'])
@@ -162,6 +161,7 @@ def _init_kvstore(self):
                                    "gradients and/or sparse weights are present for "
                                    "Parameter '%s'."%param.name)
         else:
+            arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params}
             kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts),
                                                          arg_arrays)
             if config['update_on_kvstore'] is not None:
diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py
index 06b91fadcee..fcb7c97b980 100644
--- a/python/mxnet/gluon/utils.py
+++ b/python/mxnet/gluon/utils.py
@@ -118,10 +118,14 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True):
 def clip_global_norm(arrays, max_norm):
     """Rescales NDArrays so that the sum of their 2-norm is smaller than `max_norm`.
     """
+    def _norm(array):
+        if array.stype == 'default':
+            x = array.reshape((-1,))
+            return ndarray.dot(x, x)
+        return array.norm().square()
     assert len(arrays) > 0
     ctx = arrays[0].context
-    total_norm = ndarray.add_n(*[ndarray.dot(x, x).as_in_context(ctx)
-                                 for x in (arr.reshape((-1,)) for arr in arrays)])
+    total_norm = ndarray.add_n(*[_norm(arr).as_in_context(ctx) for arr in arrays])
     total_norm = ndarray.sqrt(total_norm).asscalar()
     if not np.isfinite(total_norm):
         warnings.warn(UserWarning('nan or inf is detected. Clipping results will be undefined.'),
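
With the `_norm` helper above, the global norm is accumulated per array (`norm().square()` for sparse storage), so 'row_sparse' gradients no longer need to be flattened into dense vectors. A usage sketch with dense arrays (shapes and values are illustrative):

    import mxnet as mx
    from mxnet.gluon.utils import clip_global_norm

    grads = [mx.nd.ones((2, 3)) * 10, mx.nd.ones((5,)) * 10]
    # Rescales the arrays in place when their global 2-norm exceeds max_norm,
    # returning the norm measured before clipping.
    total_norm = clip_global_norm(grads, max_norm=1.0)
    print(total_norm)
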
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index 1297c3da9a7..ef9026d4507 100755
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -161,7 +161,7 @@ def _legacy_init(self, name, arr):
         Parameters
         ----------
         name : str
-            Name of corrosponding NDArray.
+            Name of corresponding NDArray.
 
         arr : NDArray
             NDArray to be initialized.
@@ -424,12 +424,14 @@ def _init_weight(self, _, arr):
 
 @register
 class Constant(Initializer):
-    """Initializes the weights to a scalar value.
+    """Initializes the weights to a given value.
+    The value passed in can be a scalar or an NDArray that matches the shape
+    of the parameter to be set.
 
     Parameters
     ----------
-    value : float
-        Fill value.
+    value : float or NDArray
+        Value to set.
     """
     def __init__(self, value):
         super(Constant, self).__init__(value=value)
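
A short sketch of the widened Constant initializer (layer size and fill values are illustrative):

    import mxnet as mx
    from mxnet.gluon import nn

    # The weight below is 2x2, so a matching NDArray can now be used as the
    # fill value; a plain scalar such as mx.init.Constant(0.5) still works.
    layer = nn.Dense(2, in_units=2,
                     weight_initializer=mx.init.Constant(
                         mx.nd.array([[1., 2.], [3., 4.]])))
    layer.initialize()
    print(layer.weight.data())
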
@@ -651,7 +653,7 @@ def _init_weight(self, _, arr):
 
 @register
 class LSTMBias(Initializer):
-    """Initialize all bias of an LSTMCell to 0.0 except for
+    """Initialize all biases of an LSTMCell to 0.0 except for
     the forget gate whose bias is set to custom value.
 
     Parameters
diff --git a/python/mxnet/rnn/io.py b/python/mxnet/rnn/io.py
index 8eba9d21e39..a8890c9e7e7 100644
--- a/python/mxnet/rnn/io.py
+++ b/python/mxnet/rnn/io.py
@@ -27,7 +27,8 @@
 from ..io import DataIter, DataBatch, DataDesc
 from .. import ndarray
 
-def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n', start_label=0):
+def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
+                     start_label=0, unknown_token=None):
     """Encode sentences and (optionally) build a mapping
     from string tokens to integer indices. Unknown keys
     will be added to vocabulary.
@@ -46,6 +47,9 @@ def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
         of sentence by default.
     start_label : int
         lowest index.
+    unknown_token : str
+        Symbol used to represent unknown tokens. If not specified, an unknown
+        token raises an error unless a new vocabulary is being built.
 
     Returns
     -------
@@ -65,9 +69,11 @@ def encode_sentences(sentences, vocab=None, invalid_label=-1, invalid_key='\n',
         coded = []
         for word in sent:
             if word not in vocab:
-                assert new_vocab, "Unknown token %s"%word
+                assert (new_vocab or unknown_token), "Unknown token %s"%word
                 if idx == invalid_label:
                     idx += 1
+                if unknown_token:
+                    word = unknown_token
                 vocab[word] = idx
                 idx += 1
             coded.append(vocab[word])
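
A sketch of the new `unknown_token` path (tokens and indices are illustrative):

    from mxnet.rnn.io import encode_sentences

    sentences = [['hello', 'world'], ['hello', 'there']]
    vocab = {'hello': 1, 'world': 2}

    # 'there' is not in the fixed vocabulary: instead of failing the assertion,
    # it is replaced by '<unk>', which is entered into the vocabulary once.
    coded, vocab = encode_sentences(sentences, vocab=vocab, start_label=3,
                                    unknown_token='<unk>')
    print(coded)  # [[1, 2], [1, 3]]
    print(vocab)  # {'hello': 1, 'world': 2, '<unk>': 3}
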
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 686802d3c48..19fe0749598 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1644,16 +1644,20 @@ def discard_stderr():
     with discard_stderr():
         ...
     """
+    with open(os.devnull, 'w') as bit_bucket:
+        try:
+            stderr_fileno = sys.stderr.fileno()
+            old_stderr = os.dup(stderr_fileno)
+            try:
+                os.dup2(bit_bucket.fileno(), stderr_fileno)
+                yield
+            finally:
+                os.dup2(old_stderr, stderr_fileno)
+        except AttributeError:
+            # On some systems stderr is not backed by a file descriptor but by a
+            # virtual pipeline that cannot be duplicated
+            yield
 
-    try:
-        stderr_fileno = sys.stderr.fileno()
-        old_stderr = os.dup(stderr_fileno)
-        bit_bucket = open(os.devnull, 'w')
-        os.dup2(bit_bucket.fileno(), stderr_fileno)
-        yield
-    finally:
-        os.dup2(old_stderr, stderr_fileno)
-        bit_bucket.close()
 
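The rewritten context manager above owns the devnull handle via `with open(...)` and degrades to a no-op on platforms where `sys.stderr` has no real file descriptor. A usage sketch:

    import sys
    from mxnet.test_utils import discard_stderr

    # Anything written to stderr inside the block, including output from the
    # C++ side (the OS-level descriptor itself is redirected), is discarded.
    with discard_stderr():
        sys.stderr.write('this line vanishes\n')
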
 class DummyIter(mx.io.DataIter):
     """A dummy iterator that always returns the same batch of data
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
index 7289df19712..87c9bc72be0 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala
@@ -224,13 +224,24 @@ class FeedForward private(
     var i = 0
     while (data.hasNext && i != numBatch) {
       val batch = data.next()
-      i += 1
-      ExecutorManager.loadData(batch, dataArrays)
-      predExec.forward(isTrain = false)
-      val padded = batch.pad
-      val realSize = batchSize - padded
-      for ((list, nd) <- outputs zip predExec.outputs) {
-        list += nd.slice(0, realSize).copy()
+      try {
+        i += 1
+        ExecutorManager.loadData(batch, dataArrays)
+        predExec.forward(isTrain = false)
+        val padded = batch.pad
+        val realSize = batchSize - padded
+        for ((list, nd) <- outputs zip predExec.outputs) {
+          // The slice is assigned to a val so that dispose can be called after the copy.
+          // The one-liner nd.slice().copy() leaks the memory of the slice.
+          val ndSliced = nd.slice(0, realSize)
+          try {
+            list += ndSliced.copy()
+          } finally {
+            ndSliced.dispose()
+          }
+        }
+      } finally {
+        batch.dispose()
       }
     }
     // TODO(Yizhi): we can use Symbol.concat to do the same thing. Can it be more efficient?
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
index 469107aa58c..49f4d35136f 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
@@ -65,12 +65,12 @@ object NDArray {
     val ndArgs = ArrayBuffer.empty[NDArray]
     val posArgs = ArrayBuffer.empty[String]
     args.foreach {
-      case arr: NDArray =>
-        ndArgs.append(arr)
-      case arrFunRet: NDArrayFuncReturn =>
-        arrFunRet.arr.foreach(ndArgs.append(_))
-      case arg =>
-        posArgs.append(arg.toString)
+        case arr: NDArray =>
+          ndArgs.append(arr)
+        case arrFunRet: NDArrayFuncReturn =>
+          arrFunRet.arr.foreach(ndArgs.append(_))
+        case arg =>
+          posArgs.append(arg.toString)
     }
 
     require(posArgs.length <= function.arguments.length,
@@ -81,6 +81,7 @@ object NDArray {
         ++ function.arguments.slice(0, posArgs.length).zip(posArgs) - "out"
       ).map { case (k, v) => k -> v.toString }
 
+
     val (oriOutputs, outputVars) =
       if (kwargs != null && kwargs.contains("out")) {
         val output = kwargs("out")
@@ -537,6 +538,10 @@ object NDArray {
     new NDArray(handleRef.value)
   }
 
+  private def _crop_assign(kwargs: Map[String, Any] = null)(args: Any*) : NDArrayFuncReturn = {
+    genericNDArrayFunctionInvoke("_crop_assign", args, kwargs)
+  }
+
   // TODO: imdecode
 }
 
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala b/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala
new file mode 100644
index 00000000000..33d1d330979
--- /dev/null
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.annotation
+
+import java.lang.annotation.{ElementType, Retention, Target, _}
+
+@Retention(RetentionPolicy.RUNTIME)
+@Target(Array(ElementType.TYPE, ElementType.FIELD, ElementType.METHOD, ElementType.PARAMETER,
+  ElementType.CONSTRUCTOR, ElementType.LOCAL_VARIABLE, ElementType.PACKAGE))
+class Experimental {}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
index 51089382097..70c64877887 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
@@ -28,7 +28,8 @@ import scala.collection.immutable.ListMap
 /**
  * NDArrayIter object in mxnet. Taking NDArray to get dataiter.
  *
- * @param data NDArrayIter supports single or multiple data and label.
+ * @param data Specify the data together with its name.
+ *             NDArrayIter supports single or multiple data and label.
  * @param label Same as data, but is not fed to the model during testing.
  * @param dataBatchSize Batch Size
  * @param shuffle Whether to shuffle the data
@@ -38,15 +39,35 @@ import scala.collection.immutable.ListMap
  * the size of data does not match batch_size. Roll over is intended
  * for training and can cause problems if used for prediction.
  */
-class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = IndexedSeq.empty,
-                  private val dataBatchSize: Int = 1, shuffle: Boolean = false,
-                  lastBatchHandle: String = "pad",
-                  dataName: String = "data", labelName: String = "label") extends DataIter {
-  private val logger = LoggerFactory.getLogger(classOf[NDArrayIter])
+class NDArrayIter(data: IndexedSeq[(String, NDArray)],
+                  label: IndexedSeq[(String, NDArray)],
+                  private val dataBatchSize: Int, shuffle: Boolean,
+                  lastBatchHandle: String) extends DataIter {
+
+  /**
+   * @param data Specify the data. Data names will be data_0, data_1, ..., etc.
+   * @param label Same as data, but is not fed to the model during testing.
+   *              Label names will be label_0, label_1, ..., etc.
+   * @param dataBatchSize Batch Size
+   * @param shuffle Whether to shuffle the data
+   * @param lastBatchHandle "pad", "discard" or "roll_over". How to handle the last batch
+   *
+   * This iterator will pad, discard or roll over the last batch if
+   * the size of data does not match batch_size. Roll over is intended
+   * for training and can cause problems if used for prediction.
+   */
+  def this(data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = IndexedSeq.empty,
+           dataBatchSize: Int = 1, shuffle: Boolean = false,
+           lastBatchHandle: String = "pad",
+           dataName: String = "data", labelName: String = "label") {
+    this(IO.initData(data, allowEmpty = false, dataName),
+      IO.initData(label, allowEmpty = true, labelName),
+      dataBatchSize, shuffle, lastBatchHandle)
+  }
 
+  private val logger = LoggerFactory.getLogger(classOf[NDArrayIter])
 
-  private val (_dataList: IndexedSeq[NDArray],
-  _labelList: IndexedSeq[NDArray]) = {
+  val (initData: IndexedSeq[(String, NDArray)], initLabel: IndexedSeq[(String, NDArray)]) = {
     // data should not be null and size > 0
     require(data != null && data.size > 0,
       "data should not be null and data.size should not be zero")
@@ -55,17 +76,17 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
       "label should not be null. Use IndexedSeq.empty if there are no labels")
 
     // shuffle is not supported currently
-    require(shuffle == false, "shuffle is not supported currently")
+    require(!shuffle, "shuffle is not supported currently")
 
     // discard final part if lastBatchHandle equals discard
     if (lastBatchHandle.equals("discard")) {
-      val dataSize = data(0).shape(0)
+      val dataSize = data(0)._2.shape(0)
       require(dataBatchSize <= dataSize,
         "batch_size need to be smaller than data size when not padding.")
       val keepSize = dataSize - dataSize % dataBatchSize
-      val dataList = data.map(ndArray => {ndArray.slice(0, keepSize)})
+      val dataList = data.map { case (name, ndArray) => (name, ndArray.slice(0, keepSize)) }
       if (!label.isEmpty) {
-        val labelList = label.map(ndArray => {ndArray.slice(0, keepSize)})
+        val labelList = label.map { case (name, ndArray) => (name, ndArray.slice(0, keepSize)) }
         (dataList, labelList)
       } else {
         (dataList, label)
@@ -75,13 +96,9 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
     }
   }
 
-
-  val initData: IndexedSeq[(String, NDArray)] = IO.initData(_dataList, false, dataName)
-  val initLabel: IndexedSeq[(String, NDArray)] = IO.initData(_labelList, true, labelName)
-  val numData = _dataList(0).shape(0)
-  val numSource = initData.size
-  var cursor = -dataBatchSize
-
+  val numData = initData(0)._2.shape(0)
+  val numSource: MXUint = initData.size
+  private var cursor = -dataBatchSize
 
   private val (_provideData: ListMap[String, Shape],
                _provideLabel: ListMap[String, Shape]) = {
@@ -112,8 +129,8 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * reset the iterator
    */
   override def reset(): Unit = {
-    if (lastBatchHandle.equals("roll_over") && cursor>numData) {
-      cursor = -dataBatchSize + (cursor%numData)%dataBatchSize
+    if (lastBatchHandle.equals("roll_over") && cursor > numData) {
+      cursor = -dataBatchSize + (cursor%numData) % dataBatchSize
     } else {
       cursor = -dataBatchSize
     }
@@ -154,16 +171,16 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
     newArray
   }
 
-  private def _getData(data: IndexedSeq[NDArray]): IndexedSeq[NDArray] = {
+  private def _getData(data: IndexedSeq[(String, NDArray)]): IndexedSeq[NDArray] = {
     require(cursor < numData, "DataIter needs reset.")
     if (data == null) {
       null
     } else {
       if (cursor + dataBatchSize <= numData) {
-        data.map(ndArray => {ndArray.slice(cursor, cursor + dataBatchSize)}).toIndexedSeq
+        data.map { case (_, ndArray) => ndArray.slice(cursor, cursor + dataBatchSize) }
       } else {
         // padding
-        data.map(_padData).toIndexedSeq
+        data.map { case (_, ndArray) => _padData(ndArray) }
       }
     }
   }
@@ -173,7 +190,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return the data of current batch
    */
   override def getData(): IndexedSeq[NDArray] = {
-    _getData(_dataList)
+    _getData(initData)
   }
 
   /**
@@ -181,7 +198,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return the label of current batch
    */
   override def getLabel(): IndexedSeq[NDArray] = {
-    _getData(_labelList)
+    _getData(initLabel)
   }
 
   /**
@@ -189,7 +206,7 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
    * @return
    */
   override def getIndex(): IndexedSeq[Long] = {
-    (cursor.toLong to (cursor + dataBatchSize).toLong).toIndexedSeq
+    cursor.toLong to (cursor + dataBatchSize).toLong
   }
 
   /**
@@ -213,3 +230,66 @@ class NDArrayIter (data: IndexedSeq[NDArray], label: IndexedSeq[NDArray] = Index
 
   override def batchSize: Int = dataBatchSize
 }
+
+object NDArrayIter {
+
+  /**
+   * Builder class for NDArrayIter.
+   */
+  class Builder() {
+    private var data: IndexedSeq[(String, NDArray)] = IndexedSeq.empty
+    private var label: IndexedSeq[(String, NDArray)] = IndexedSeq.empty
+    private var dataBatchSize: Int = 1
+    private var lastBatchHandle: String = "pad"
+
+    /**
+     * Add one data input with its name.
+     * @param name Data name.
+     * @param data Data nd-array.
+     * @return The builder object itself.
+     */
+    def addData(name: String, data: NDArray): Builder = {
+      this.data = this.data ++ IndexedSeq((name, data))
+      this
+    }
+
+    /**
+     * Add one label input with its name.
+     * @param name Label name.
+     * @param label Label nd-array.
+     * @return The builder object itself.
+     */
+    def addLabel(name: String, label: NDArray): Builder = {
+      this.label = this.label ++ IndexedSeq((name, label))
+      this
+    }
+
+    /**
+     * Set the batch size of the iterator.
+     * @param batchSize batch size.
+     * @return The builder object itself.
+     */
+    def setBatchSize(batchSize: Int): Builder = {
+      this.dataBatchSize = batchSize
+      this
+    }
+
+    /**
+     * How to handle the last batch.
+     * @param lastBatchHandle Can be "pad", "discard" or "roll_over".
+     * @return The builder object itself.
+     */
+    def setLastBatchHandle(lastBatchHandle: String): Builder = {
+      this.lastBatchHandle = lastBatchHandle
+      this
+    }
+
+    /**
+     * Build the NDArrayIter object.
+     * @return the built object.
+     */
+    def build(): NDArrayIter = {
+      new NDArrayIter(data, label, dataBatchSize, false, lastBatchHandle)
+    }
+  }
+}
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
index 0f4b7c0e7a3..1b922b3c05b 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
@@ -24,7 +24,7 @@ import scala.sys.process._
 
 class IOSuite extends FunSuite with BeforeAndAfterAll {
 
-  private var tu = new TestUtil
+  private val tu = new TestUtil
 
   test("test MNISTIter & MNISTPack") {
     // get data
@@ -258,7 +258,11 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     assert(batchCount === nBatch0)
 
     // test discard
-    val dataIter1 = new NDArrayIter(data, label, 128, false, "discard")
+    val dataIter1 = new NDArrayIter.Builder()
+      .addData("data0", data(0)).addData("data1", data(1))
+      .addLabel("label", label(0))
+      .setBatchSize(128)
+      .setLastBatchHandle("discard").build()
     val nBatch1 = 7
     batchCount = 0
     while(dataIter1.hasNext) {
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
index 90fe2604e8b..3bbc7fd6a90 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
@@ -52,8 +52,9 @@ private[mxnet] object APIDocGenerator{
     val apacheLicence = "/*\n* Licensed to the Apache Software Foundation (ASF) under one or more\n* contributor license agreements.  See the NOTICE file distributed with\n* this work for additional information regarding copyright ownership.\n* The ASF licenses this file to You under the Apache License, Version 2.0\n* (the \"License\"); you may not use this file except in compliance with\n* the License.  You may obtain a copy of the License at\n*\n*    http://www.apache.org/licenses/LICENSE-2.0\n*\n* Unless required by applicable law or agreed to in writing, software\n* distributed under the License is distributed on an \"AS IS\" BASIS,\n* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n* See the License for the specific language governing permissions and\n* limitations under the License.\n*/\n"
     val scalaStyle = "// scalastyle:off"
     val packageDef = "package org.apache.mxnet"
+    val imports = "import org.apache.mxnet.annotation.Experimental"
     val absClassDef = s"abstract class $packageName"
-    val finalStr = s"$apacheLicence\n$scalaStyle\n$packageDef\n$absClassDef {\n${absFuncs.mkString("\n")}\n}"
+    val finalStr = s"$apacheLicence\n$scalaStyle\n$packageDef\n$imports\n$absClassDef {\n${absFuncs.mkString("\n")}\n}"
     import java.io._
     val pw = new PrintWriter(new File(FILE_PATH + s"$packageName.scala"))
     pw.write(finalStr)
@@ -97,9 +98,11 @@ private[mxnet] object APIDocGenerator{
       argDef += "name : String = null"
       argDef += "attr : Map[String, String] = null"
     } else {
+      argDef += "out : Option[NDArray] = None"
       returnType = "org.apache.mxnet.NDArrayFuncReturn"
     }
-    s"def ${func.name} (${argDef.mkString(", ")}) : ${returnType}"
+    val experimentalTag = "@Experimental"
+    s"$experimentalTag\ndef ${func.name} (${argDef.mkString(", ")}) : $returnType"
   }
 
 
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
index ce5b532bc8b..082c64a609c 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
@@ -21,7 +21,7 @@ import org.apache.mxnet.init.Base._
 import org.apache.mxnet.utils.{CToScalaUtils, OperatorBuildUtils}
 
 import scala.annotation.StaticAnnotation
-import scala.collection.mutable.ListBuffer
+import scala.collection.mutable.{ArrayBuffer, ListBuffer}
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
 
@@ -57,14 +57,13 @@ private[mxnet] object NDArrayMacro {
 
     val newNDArrayFunctions = {
       if (isContrib) ndarrayFunctions.filter(_.name.startsWith("_contrib_"))
-      else ndarrayFunctions.filter(!_.name.startsWith("_contrib_"))
+      else ndarrayFunctions.filterNot(_.name.startsWith("_"))
     }
 
      val functionDefs = newNDArrayFunctions flatMap { NDArrayfunction =>
         val funcName = NDArrayfunction.name
         val termName = TermName(funcName)
-        if (!NDArrayfunction.name.startsWith("_") || NDArrayfunction.name.startsWith("_contrib_")) {
-          Seq(
+       Seq(
             // scalastyle:off
             // (yizhi) We are investigating a way to make these functions type-safe
             // and waiting to see the new approach is stable enough.
@@ -75,16 +74,7 @@ private[mxnet] object NDArrayMacro {
             q"def $termName(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, null)}".asInstanceOf[DefDef]
             // scalastyle:on
           )
-        } else {
-          // Default private
-          Seq(
-            // scalastyle:off
-            q"private def $termName(kwargs: Map[String, Any] = null)(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, kwargs)}".asInstanceOf[DefDef],
-            q"private def $termName(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, null)}".asInstanceOf[DefDef]
-            // scalastyle:on
-          )
         }
-      }
 
     structGeneration(c)(functionDefs, annottees : _*)
   }
@@ -109,6 +99,7 @@ private[mxnet] object NDArrayMacro {
       // Construct Implementation field
       var impl = ListBuffer[String]()
       impl += "val map = scala.collection.mutable.Map[String, Any]()"
+      impl += "val args = scala.collection.mutable.ArrayBuffer.empty[NDArray]"
       ndarrayfunction.listOfArgs.foreach({ ndarrayarg =>
        // var is a reserved word for defining variables in Scala and
        // needs to be changed to something else in order to make it work
@@ -123,14 +114,32 @@ private[mxnet] object NDArrayMacro {
         else {
           argDef += s"${currArgName} : ${ndarrayarg.argType}"
         }
-        var base = "map(\"" + ndarrayarg.argName + "\") = " + currArgName
-        if (ndarrayarg.isOptional) {
-          base = "if (!" + currArgName + ".isEmpty)" + base + ".get"
-        }
-        impl += base
+        // NDArray arg implementation
+        val returnType = "org.apache.mxnet.NDArray"
+
+        // TODO: Currently we do not add a placeholder for NDArray
+        // Example: an NDArray operator of the following format
+        // nd.foo(arg1: NDArray(required), arg2: NDArray(Optional), arg3: NDArray(Optional))
+        // If we write nd.foo(arg1, arg3 = arg3), do we need to add a placeholder for arg2?
+        // What should it be?
+        val base =
+          if (ndarrayarg.argType.equals(returnType)) {
+            s"args += $currArgName"
+          } else if (ndarrayarg.argType.equals(s"Array[$returnType]")){
+            s"args ++= $currArgName"
+          } else {
+            "map(\"" + ndarrayarg.argName + "\") = " + currArgName
+          }
+        impl.append(
+          if (ndarrayarg.isOptional) s"if (!$currArgName.isEmpty) $base.get"
+          else base
+        )
       })
+      // add default out parameter
+      argDef += "out : Option[NDArray] = None"
+      impl += "if (!out.isEmpty) map(\"out\") = out.get"
       // scalastyle:off
-      impl += "org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(\"" + ndarrayfunction.name + "\", null, map.toMap)"
+      impl += "org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(\"" + ndarrayfunction.name + "\", args.toSeq, map.toMap)"
       // scalastyle:on
       // Combine and build the function string
       val returnType = "org.apache.mxnet.NDArrayFuncReturn"
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
index bacbdb2e307..81430c2ab26 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
@@ -41,7 +41,7 @@ private[mxnet] object SymbolImplMacros {
     impl(c)(annottees: _*)
   }
   def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = {
-    newAPIImpl(c)(annottees: _*)
+    typedAPIImpl(c)(annottees: _*)
   }
   // scalastyle:on havetype
 
@@ -82,7 +82,7 @@ private[mxnet] object SymbolImplMacros {
   /**
     * Implementation for Dynamic typed API Symbol.api.<functioname>
     */
-  private def newAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = {
+  private def typedAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = {
     import c.universe._
 
     val isContrib: Boolean = c.prefix.tree match {
@@ -104,6 +104,7 @@ private[mxnet] object SymbolImplMacros {
       // Construct Implementation field
       var impl = ListBuffer[String]()
       impl += "val map = scala.collection.mutable.Map[String, Any]()"
+      impl += "var args = Seq[org.apache.mxnet.Symbol]()"
       symbolfunction.listOfArgs.foreach({ symbolarg =>
        // var is a reserved word for defining variables in Scala and
        // needs to be changed to something else in order to make it work
@@ -118,17 +119,28 @@ private[mxnet] object SymbolImplMacros {
         else {
           argDef += s"${currArgName} : ${symbolarg.argType}"
         }
-        var base = "map(\"" + symbolarg.argName + "\") = " + currArgName
-        if (symbolarg.isOptional) {
-          base = "if (!" + currArgName + ".isEmpty)" + base + ".get"
+        // Symbol arg implementation
+        val returnType = "org.apache.mxnet.Symbol"
+        val base =
+        if (symbolarg.argType.equals(s"Array[$returnType]")) {
+          if (symbolarg.isOptional) s"if (!$currArgName.isEmpty) args = $currArgName.get.toSeq"
+          else s"args = $currArgName.toSeq"
+        } else {
+          if (symbolarg.isOptional) {
+            // scalastyle:off
+            s"if (!$currArgName.isEmpty) map(" + "\"" + symbolarg.argName + "\"" + s") = $currArgName.get"
+            // scalastyle:on
+          }
+          else "map(\"" + symbolarg.argName + "\"" + s") = $currArgName"
         }
+
         impl += base
       })
       argDef += "name : String = null"
       argDef += "attr : Map[String, String] = null"
       // scalastyle:off
       // TODO: Seq() here allows user to place Symbols rather than normal arguments to run, need to fix if old API deprecated
-      impl += "org.apache.mxnet.Symbol.createSymbolGeneral(\"" + symbolfunction.name + "\", name, attr, Seq(), map.toMap)"
+      impl += "org.apache.mxnet.Symbol.createSymbolGeneral(\"" + symbolfunction.name + "\", name, attr, args, map.toMap)"
       // scalastyle:on
       // Combine and build the function string
       val returnType = "org.apache.mxnet.Symbol"
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index 43ff1f78fe1..f2b806094af 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -36,24 +36,6 @@
       </properties>
     </profile>
   </profiles>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.scalatest</groupId>
-        <artifactId>scalatest-maven-plugin</artifactId>
-        <configuration>
-          <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
-            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
-          </argLine>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.scalastyle</groupId>
-        <artifactId>scalastyle-maven-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
index 74bc1dbb71f..72bbbe0fed0 100644
--- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
+++ b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
@@ -46,24 +46,26 @@ class MXNetGeneralSuite extends SharedSparkContext {
       "/dataset/mxnet-spark-test/train.txt" + " -P " + testDataDir + " -q") !
   }
 
-  override def beforeAll(): Unit = {
-    val tempDirFile = Files.createTempDirectory(s"mxnet-spark-test-${System.currentTimeMillis()}").
-      toFile
-    testDataDir = tempDirFile.getPath
-    tempDirFile.deleteOnExit()
-    downloadTestData()
-  }
-
+//  override def beforeAll(): Unit = {
+//  val tempDirFile = Files.createTempDirectory(s"mxnet-spark-test-${System.currentTimeMillis()}").
+//      toFile
+//    testDataDir = tempDirFile.getPath
+//    tempDirFile.deleteOnExit()
+//    downloadTestData()
+//  }
 
-  test("run spark with MLP") {
-    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
-    val model = buildMlp().fit(trainData)
-    assert(model != null)
-  }
+  test("Dummy test on Spark") {
 
-  test("run spark with LeNet") {
-    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
-    val model = buildLeNet().fit(trainData)
-    assert(model != null)
   }
+//  test("run spark with MLP") {
+//    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
+//    val model = buildMlp().fit(trainData)
+//    assert(model != null)
+//  }
+//
+//  test("run spark with LeNet") {
+//    val trainData = parseRawData(sc, s"$testDataDir/train.txt")
+//    val model = buildLeNet().fit(trainData)
+//    assert(model != null)
+//  }
 }
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 34bd4b20aa5..9aabe04656e 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -36,7 +36,6 @@
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
 #include "../imperative/imperative_utils.h"
-#include "../imperative/cached_op.h"
 
 using namespace mxnet;
 
@@ -161,8 +160,12 @@ int MXCreateCachedOp(SymbolHandle handle,
   std::vector<std::string> input_names;
   input_names.reserve(inputs.size());
   for (const auto& i : inputs) input_names.push_back(i->attrs.name);
-  *out = new CachedOpPtr(new CachedOp(
-      *sym, std::vector<std::pair<std::string, std::string> >()));
+  *out = new std::shared_ptr<Imperative::CachedOp>(
+      new Imperative::CachedOp(
+        *sym,
+        std::vector<std::pair<std::string, std::string> >(),
+        input_names,
+        std::unordered_map<std::string, std::vector<NDArray> >()));
   API_END();
 }
 
@@ -170,6 +173,11 @@ int MXCreateCachedOpEx(SymbolHandle handle,
                        int num_flags,
                        const char** keys,
                        const char** vals,
+                       int num_args,
+                       const char** arg_names,
+                       int num_params,
+                       const char** param_names,
+                       NDArrayHandle* params,
                        CachedOpHandle *out) {
   nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
 
@@ -178,7 +186,17 @@ int MXCreateCachedOpEx(SymbolHandle handle,
   for (int i = 0; i < num_flags; ++i) {
     flags.push_back({keys[i], vals[i]});
   }
-  *out = new CachedOpPtr(new CachedOp(*sym, flags));
+  std::vector<std::string> args;
+  for (int i = 0; i < num_args; ++i) {
+    args.push_back(arg_names[i]);
+  }
+  std::unordered_map<std::string, std::vector<NDArray> > param_dict;
+  for (int i = 0; i < num_params; ++i) {
+    param_dict[param_names[i]].emplace_back(
+        *reinterpret_cast<NDArray*>(params[i]));
+  }
+  *out = new std::shared_ptr<Imperative::CachedOp>(
+      new Imperative::CachedOp(*sym, flags, args, param_dict));
   API_END();
 }
 
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 4666b6adf0c..e5e9b522890 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -577,7 +577,8 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
                      const mx_uint num_excluded_symbols,
                      const SymbolHandle *excluded_symbols,
                      const mx_uint num_offline,
-                     const char **offline_params) {
+                     const char **offline_params,
+                     const char *quantized_dtype) {
   nnvm::Symbol *s = new nnvm::Symbol();
   API_BEGIN();
   nnvm::Symbol *sym = static_cast<nnvm::Symbol*>(sym_handle);
@@ -594,7 +595,9 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
   for (size_t i = 0; i < num_offline; ++i) {
     offline.emplace(offline_params[i]);
   }
+  std::string quantized_type(quantized_dtype);
   g.attrs["offline_params"] = std::make_shared<nnvm::any>(std::move(offline));
+  g.attrs["quantized_dtype"] = std::make_shared<nnvm::any>(std::move(quantized_type));
   g = ApplyPass(std::move(g), "QuantizeGraph");
   s->outputs = g.outputs;
   *ret_sym_handle = s;
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index e70cc197c0c..dc0436e02a8 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -278,8 +278,6 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
 }
 
 void ThreadedEngine::Push(OprHandle op, Context exec_ctx, int priority, bool profiling) {
-  BulkFlush();
-
   ThreadedOpr* threaded_opr = ThreadedOpr::CastFromBase(op);
   OprBlock* opr_block = OprBlock::New();
   opr_block->opr = threaded_opr;
@@ -325,6 +323,7 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx,
         << device_count_;
   }
 #endif
+  BulkFlush();
   ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name, wait);
   opr->temporary = true;
   const bool profiling = profiler_->IsProfiling(profiler::Profiler::kImperative);
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index 2f77380baf8..97f258c1061 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -76,6 +76,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     if (is_worker_) return;
     gpu_worker_nthreads_ = common::GetNumThreadsPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+    gpu_copy_nthreads_ = dmlc::GetEnv("MXNET_GPU_COPY_NTHREADS", 2);
     // create CPU task
     int cpu_priority_nthreads = dmlc::GetEnv("MXNET_CPU_PRIORITY_NTHREADS", 4);
     cpu_priority_worker_.reset(new ThreadWorkerBlock<kPriorityQueue>());
@@ -128,8 +129,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
         const FnProperty prop = opr_block->opr->prop;
         const bool is_copy = (prop == FnProperty::kCopyFromGPU ||
                               prop == FnProperty::kCopyToGPU);
-        const size_t nthread = gpu_worker_nthreads_;
         if (is_copy) {
+          const size_t nthread = gpu_copy_nthreads_;
           auto ptr = gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
             // Signify to kernel that GPU is being used, so reserve cores as necessary
             OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -150,6 +151,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
             }
           }
         } else {
+          const size_t nthread = gpu_worker_nthreads_;
           auto ptr = gpu_normal_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
             // Signify to kernel that GPU is being used, so reserve cores as necessary
             OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -194,6 +196,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   size_t cpu_worker_nthreads_;
   /*! \brief number of concurrent thread each gpu worker uses */
   size_t gpu_worker_nthreads_;
+  /*! \brief number of concurrent thread each gpu copy worker uses */
+  size_t gpu_copy_nthreads_;
   // cpu worker
   common::LazyAllocArray<ThreadWorkerBlock<kWorkerQueue> > cpu_normal_workers_;
   // cpu priority worker
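
The copy workers now have their own thread-count knob. Both values are read once via dmlc::GetEnv when the engine starts, so they must be set before mxnet is imported; a sketch (the values are illustrative, not recommendations):

    import os

    os.environ['MXNET_CPU_WORKER_NTHREADS'] = '4'
    os.environ['MXNET_GPU_COPY_NTHREADS'] = '2'  # new knob from this patch

    import mxnet as mx  # the per-device threaded engine reads these at startup
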
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 72919d90c62..697e4869a04 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -134,10 +134,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     return state_.get_var();
   }
 
-  OpStatePtr state() const override {
-    return state_;
-  }
-
   explicit StatefulComputeExecutor(const OpStatePtr& state,
                                    const FStatefulCompute& fcompute,
                                    ExecType exec_type,
@@ -146,6 +142,7 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
         state_(state), fcompute_(fcompute), exec_type_(exec_type) {}
 
  private:
+  friend Graph AttachOpExecs(Graph g);
   OpStatePtr state_;
   FStatefulCompute fcompute_;
   ExecType exec_type_;
@@ -173,16 +170,13 @@ class StatefulComputeExExecutor : public OpExecutor {
     return state_.get_var();
   }
 
-  OpStatePtr state() const override {
-    return state_;
-  }
-
   explicit StatefulComputeExExecutor(const OpStatePtr& state,
                                      const FStatefulComputeEx& fcompute,
                                      ExecType exec_type)
       : state_(state), fcompute_(fcompute), exec_type_(exec_type) {}
 
  private:
+  friend Graph AttachOpExecs(Graph g);
   OpStatePtr state_;
   FStatefulComputeEx fcompute_;
   ExecType exec_type_;
@@ -247,15 +241,16 @@ class FComputeExExecutor : public OpExecutor {
   ExecType exec_type_;
 };
 
-void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) {
+// pass to attach operator executors
+Graph AttachOpExecs(Graph g) {
   using nnvm::DTypeVector;
   using nnvm::ShapeVector;
   using nnvm::FMutateInputs;
 
-  static auto& fcreate_op_state = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& fmutate_inputs = nnvm::Op::GetAttr<FMutateInputs>("FMutateInputs");
-  static auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
-  static auto& is_layer_backward = nnvm::Op::GetAttr<bool>("TIsLayerOpBackward");
+  auto& fcreate_op_state = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
+  auto& fmutate_inputs = nnvm::Op::GetAttr<FMutateInputs>("FMutateInputs");
+  auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
+  auto& is_layer_backward = nnvm::Op::GetAttr<bool>("TIsLayerOpBackward");
 
   const auto& vdtype = g.GetAttr<DTypeVector>("dtype");
   const auto& vshape = g.GetAttr<ShapeVector>("shape");
@@ -264,88 +259,82 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) {
 
   // get the graph
   const auto& idx = g.indexed_graph();
-  OpExecVector& ret = *p_ret;
+  std::vector<std::shared_ptr<OpExecutor> > ret(idx.num_nodes());
 
   // initialize the nodes
-  const auto& inode = idx[i];
-  if (inode.source->is_variable()) return;
-  const nnvm::Op *op = inode.source->op();
-  ExecType exec_type = ExecType::kSync;
-  std::vector<uint32_t> mutate_index;
-  if (fmutate_inputs.count(op)) {
-    mutate_index = fmutate_inputs[op](inode.source->attrs);
-  }
-  if (fexec_type.count(op)) {
-    exec_type = fexec_type[op](inode.source->attrs);
-  }
-  CHECK(dispatch_modes[i] != DispatchMode::kUndefined);
-  if (fcreate_op_state.count(op)) {
-    std::vector<TShape> ishape;
-    std::vector<int> itype;
-    for (const auto& e : inode.inputs) {
-      ishape.emplace_back(vshape[idx.entry_id(e)]);
-      itype.emplace_back(vdtype[idx.entry_id(e)]);
-    }
-
-    OpStatePtr state = fcreate_op_state[op](
-        inode.source->attrs, vctx[i], ishape, itype);
-    FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
-        op, "FStatefulComputeEx", vctx[i]);
-    // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
-    if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<StatefulComputeExExecutor>(state, fcompute_ex, exec_type);
-    } else {
-      FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
-          op, "FStatefulCompute", vctx[i]);
-      CHECK(fcompute != nullptr)
-          << "One of FStatefulCompute and FStatefulComputeEx must be registered "
-          << "for stateful operator " << op->name;
-      ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
-                                                         exec_type, mutate_index);
+  for (size_t i = 0; i < idx.num_nodes(); ++i) {
+    const auto& inode = idx[i];
+    if (inode.source->is_variable()) continue;
+    const nnvm::Op *op = inode.source->op();
+    ExecType exec_type = ExecType::kSync;
+    std::vector<uint32_t> mutate_index;
+    if (fmutate_inputs.count(op)) {
+      mutate_index = fmutate_inputs[op](inode.source->attrs);
     }
-  } else if (is_layer_backward.get(op, false)) {
-    CHECK_GE(inode.control_deps.size(), 1);
-    uint32_t fwd_id = inode.control_deps[0];
-    CHECK(vctx[fwd_id] == vctx[i]);
-    CHECK(ret[fwd_id] != nullptr);
-    FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
-        op, "FStatefulComputeEx", vctx[i]);
-    // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
-    if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<StatefulComputeExExecutor>(
-          ret[fwd_id].get()->state(), fcompute_ex, exec_type);
-    } else {
-      FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
-          op, "FStatefulCompute", vctx[i]);
-      CHECK(fcompute != nullptr)
-          << "One of FStatefulCompute and FStatefulComputeEx must be registered "
-          << "for stateful operator " << op->name;
-      ret[i] = std::make_shared<StatefulComputeExecutor>(
-          ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index);
+    if (fexec_type.count(op)) {
+      exec_type = fexec_type[op](inode.source->attrs);
     }
-  } else {
-    FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
-    FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
-    if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-      ret[i] = std::make_shared<FComputeExExecutor>(
-          inode.source->attrs, fcomp_ex, exec_type);
-    } else if (fcompute != nullptr) {
-      ret[i] = std::make_shared<FComputeExecutor>(
-          inode.source->attrs, fcompute, exec_type, mutate_index);
+    CHECK(dispatch_modes[i] != DispatchMode::kUndefined);
+    if (fcreate_op_state.count(op)) {
+      std::vector<TShape> ishape;
+      std::vector<int> itype;
+      for (const auto& e : inode.inputs) {
+        ishape.emplace_back(vshape[idx.entry_id(e)]);
+        itype.emplace_back(vdtype[idx.entry_id(e)]);
+      }
+
+      OpStatePtr state = fcreate_op_state[op](
+          inode.source->attrs, vctx[i], ishape, itype);
+      FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
+          op, "FStatefulComputeEx", vctx[i]);
+      // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
+      if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(state, fcompute_ex, exec_type);
+      } else {
+        FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
+            op, "FStatefulCompute", vctx[i]);
+        CHECK(fcompute != nullptr)
+            << "One of FStatefulCompute and FStatefulComputeEx must be registered "
+            << "for stateful operator " << op->name;
+        ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
+                                                           exec_type, mutate_index);
+      }
+    } else if (is_layer_backward.get(op, false)) {
+      CHECK_GE(inode.control_deps.size(), 1);
+      uint32_t fwd_id = inode.control_deps[0];
+      CHECK(vctx[fwd_id] == vctx[i]);
+      CHECK(ret[fwd_id] != nullptr);
+      FStatefulComputeEx fcompute_ex = common::GetFCompute<FStatefulComputeEx>(
+          op, "FStatefulComputeEx", vctx[i]);
+      // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
+      if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(
+            dynamic_cast<StatefulComputeExExecutor*>(ret[fwd_id].get())->state_,
+            fcompute_ex, exec_type);
+      } else {
+        FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
+            op, "FStatefulCompute", vctx[i]);
+        CHECK(fcompute != nullptr)
+            << "One of FStatefulCompute and FStatefulComputeEx must be registered "
+            << "for stateful operator " << op->name;
+        ret[i] = std::make_shared<StatefulComputeExecutor>(
+            dynamic_cast<StatefulComputeExecutor*>(ret[fwd_id].get())->state_,
+            fcompute, exec_type, mutate_index);
+      }
     } else {
-      LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
+      FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
+      FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
+      if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
+        ret[i] = std::make_shared<FComputeExExecutor>(
+            inode.source->attrs, fcomp_ex, exec_type);
+      } else if (fcompute != nullptr) {
+        ret[i] = std::make_shared<FComputeExecutor>(
+            inode.source->attrs, fcompute, exec_type, mutate_index);
+      } else {
+        LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
+      }
     }
   }
-}
-
-
-// pass to attach operator executors
-Graph AttachOpExecs(Graph g) {
-  const auto& idx = g.indexed_graph();
-  OpExecVector ret(idx.num_nodes());
-  for (size_t i = 0; i < idx.num_nodes(); ++i) {
-    CreateOpExecs(g, &ret, i);
-  }
   g.attrs["op_execs"] = std::make_shared<nnvm::any>(ret);
   return g;
 }
diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc
index 56122cda6ff..681866296e1 100644
--- a/src/executor/attach_op_resource_pass.cc
+++ b/src/executor/attach_op_resource_pass.cc
@@ -30,15 +30,12 @@
 namespace mxnet {
 namespace exec {
 
-void AttachOpResources(
-    const Graph& g,
-    const OpExecVector& op_execs,
-    size_t start_nid,
-    size_t end_nid) {
+Graph AttachOpResources(Graph g) {
   static auto& fresource =
       nnvm::Op::GetAttr<FResourceRequest>("FResourceRequest");
   static auto& fresource_ex =
       nnvm::Op::GetAttr<FResourceRequestEx>("FResourceRequestEx");
+  auto& op_execs = nnvm::get<OpExecVector>(*g.attrs.at("op_execs"));
   const auto& vctx = g.GetAttr<ContextVector>("context");
   const auto& vdispatch = g.GetAttr<DispatchModeVector>("dispatch_mode");
   const auto& dev_masks = g.GetAttr<DevMaskVector>("dev_mask");
@@ -46,7 +43,7 @@ void AttachOpResources(
   // Use global resource pool for each executor for now.
   std::map<Context, Resource> cached_temp;
   // Resource allocation
-  for (uint32_t nid = start_nid; nid < end_nid; ++nid) {
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     const auto& inode = idx[nid];
     if (inode.source->is_variable()) continue;
     const Context &ctx = vctx[nid];
@@ -87,12 +84,7 @@ void AttachOpResources(
       requested.push_back(ResourceManager::Get()->Request(ctx, ResourceRequest::kTempSpace));
     }
   }
+  return g;
 }
-
-void AttachOpResources(const Graph& g) {
-  const auto& op_execs = g.GetAttr<OpExecVector>("op_execs");
-  AttachOpResources(g, op_execs, 0, g.indexed_graph().num_nodes());
-}
-
 }  // namespace exec
 }  // namespace mxnet
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index 26a24911894..99b1b162eae 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -82,10 +82,6 @@ class OpExecutor {
   virtual engine::VarHandle var() const {
     return nullptr;
   }
-  /*! \return return operator state */
-  virtual OpStatePtr state() const {
-    return OpStatePtr();
-  }
 };
 
 /*!
@@ -106,14 +102,6 @@ using ContextVector = std::vector<Context>;
  */
 using DevMaskVector = std::vector<int>;
 
-/*!
- * \brief create OpExecutor for a node in graph
- *
- * \param g input graph
- * \param p_ret OpExecVector for input and output
- * \param i the id of the node
- */
-void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i);
 /*!
  * \brief Attach OpExecutor to the graph attributes.
  *
@@ -127,20 +115,12 @@ Graph AttachOpExecs(Graph g);
  * \brief Attach Resource to the OpExecVector of the graph.
  *
  * \param g input graph need to contain op_exec attribute.
- */
-void AttachOpResources(const Graph& g);
-/*!
- * \brief Attach Resource to the OpExecVector
  *
- * \param g input graph
- * \param op_execs OpExecutor vector
- * \param start_nid starting node id
- * \param end_nid end node id
+ * \return graph with new attribute "op_execs" of type OpExecVector.
+ *  The fields of the OpExecVector have not yet been set up.
  */
-void AttachOpResources(const Graph& g,
-                       const OpExecVector& op_execs,
-                       size_t start_nid,
-                       size_t end_nid);
+Graph AttachOpResources(Graph g);
+
 /*!
  * \brief Discover chance of inplace addto operators.
  *  i.e. z = plus(z, source_op), and encourage it to become z += source_op.
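
With the ranged AttachOpResources overload removed from exec_pass.h, code that wants the executors back inspects the graph attributes after the passes have run. A hedged sketch, assuming only the attribute name and types visible in this diff:

    const auto& op_execs = g.GetAttr<exec::OpExecVector>("op_execs");
    const auto& idx = g.indexed_graph();
    for (size_t nid = 0; nid < idx.num_nodes(); ++nid) {
      if (op_execs[nid] != nullptr) {
        // an executor was attached for node nid; its requested
        // resources live in op_execs[nid]->op_ctx after AttachOpResources
      }
    }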
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 831b5f90023..e28867d5488 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -912,7 +912,7 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol,
   }
 
   g = AttachOpExecs(g);
-  AttachOpResources(g);
+  g = AttachOpResources(g);
   graph_ = std::move(g);
 
   if (shared_exec != nullptr) {
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index b40605bd25e..140b5a5d81e 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -19,78 +19,16 @@
 #include <unordered_set>
 #include <iostream>
 #include "./imperative_utils.h"
-#include "./cached_op.h"
-#include "../executor/exec_pass.h"
-#include "../profiler/profiler.h"
-
 
 namespace mxnet {
 
 DMLC_REGISTER_PARAMETER(CachedOpConfig);
 
-struct CachedOp::GraphInfo {
-  nnvm::Graph fwd_graph;
-  nnvm::Graph full_graph;
-  std::vector<OpReqType> bwd_output_reqs;
-  std::vector<uint32_t> bwd_input_eid;
-};
-
-struct CachedOp::DynamicRuntime {
-  GraphInfo info;
-  std::vector<NDArray> buff;
-  std::vector<OpStatePtr> op_states;
-};
-
-struct CachedOp::CachedOpState {
-  CachedOpState(const Context& context_,
-                const nnvm::Graph& fwd_graph_,
-                const nnvm::Graph& full_graph_) {
-    context = context_;
-    info.fwd_graph = fwd_graph_;
-    info.full_graph = full_graph_;
-
-    size_t max_nodes = info.full_graph.indexed_graph().num_nodes();
-    size_t max_entries = info.full_graph.indexed_graph().num_node_entries();
-    info.fwd_graph.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(info.fwd_graph.indexed_graph().num_nodes(), context));
-    info.full_graph.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(max_nodes, context));
-
-    buff.resize(max_entries);
-    arrays.resize(max_entries);
-    array_reqs.resize(max_entries);
-    dynamic_entries.resize(max_entries, false);
-    op_states.resize(max_nodes);
-    execs.resize(max_nodes);
-    opr_segs.resize(max_nodes);
-  }
-
-  std::mutex mutex;
-  Context context;
-  GraphInfo info;
-
-  bool recording = false;
-  bool fwd_alloc = false;
-  bool bwd_alloc = false;
-  bool fwd_exec_init = false;
-  bool bwd_exec_init = false;
-
-  std::vector<NDArray> buff;
-  std::vector<NDArray*> arrays;
-  std::vector<OpReqType> array_reqs;
-
-  std::vector<OpStatePtr> op_states;
-  std::vector<std::shared_ptr<exec::OpExecutor> > execs;
-  std::vector<imperative::EngineOprSeg> opr_segs;
-
-  std::vector<bool> dynamic_entries;
-  std::multimap<size_t, NDArray> fwd_reuse_pool;
-  std::multimap<size_t, NDArray> bwd_reuse_pool;
-};
-
-CachedOp::CachedOp(
+Imperative::CachedOp::CachedOp(
     const nnvm::Symbol& sym,
-    const std::vector<std::pair<std::string, std::string> >& flags) {
+    const std::vector<std::pair<std::string, std::string> >& flags,
+    const std::vector<std::string> arg_names,
+    const std::unordered_map<std::string, std::vector<NDArray> >& params) {
   using namespace nnvm;
   using namespace imperative;
   static const std::vector<const Op*> zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")};
@@ -130,22 +68,34 @@ CachedOp::CachedOp(
     fwd_graph_.attrs["forward_ref_count"] =
         std::make_shared<dmlc::any>(std::move(ref_count));
 
-    inlining_ = !config_.static_alloc &&
-        (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit;
+    inlining_ = (idx.num_nodes() - idx.input_nodes().size()) <= config_.inline_limit;
   }
 
   // Set params
   {
     const auto& idx = fwd_graph_.indexed_graph();
-    if (config_.data_indices.ndim() || config_.param_indices.ndim()) {
-      CHECK_EQ(config_.data_indices.ndim() + config_.param_indices.ndim(),
-               idx.input_nodes().size());
-    } else {
-      std::vector<uint32_t> tmp;
-      for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
-        tmp.push_back(i);
+    std::unordered_map<std::string, size_t> arg_name_to_id;
+    for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
+      const auto& name = idx[idx.input_nodes()[i]].source->attrs.name;
+      auto iter = params.find(name);
+      if (iter == params.end()) {
+        arg_name_to_id[name] = i;
+        continue;
+      }
+      fwd_params_idx_.push_back(i);
+      for (const auto& param : iter->second) {
+        params_[param.ctx()].emplace_back(param);
       }
-      config_.data_indices.assign(tmp.begin(), tmp.end());
+    }
+
+    CHECK_EQ(arg_name_to_id.size(), arg_names.size())
+        << "CachedOp expects " << arg_name_to_id.size()
+        << " inputs, given " << arg_names.size();
+
+    for (const auto& name : arg_names) {
+      auto iter = arg_name_to_id.find(name);
+      CHECK(iter != arg_name_to_id.end()) << "Unexpected input name " << name;
+      fwd_args_idx_.push_back(iter->second);
     }
   }
 
@@ -157,14 +107,9 @@ CachedOp::CachedOp(
     }
 
     std::vector<NodeEntry> xs;
-    const auto& idx = fwd_graph_.indexed_graph();
-    for (size_t i = 0; i < idx.input_nodes().size(); ++i) {
-      auto nid = idx.input_nodes()[i];
-      if (idx.mutable_input_nodes().count(nid)) continue;
-      fwd_input_to_grad_output_[i] = xs.size();
-      xs.emplace_back(NodeEntry{idx[nid].weak_ref.lock(), 0, 0});
-    }
-
+    std::vector<NodePtr> args = sym.ListInputs(Symbol::kReadOnlyArgs);
+    xs.reserve(args.size());
+    for (const auto& i : args) xs.emplace_back(NodeEntry{i, 0, 0});
     CHECK_GT(xs.size(), 0)
         << "There are no inputs in computation graph that require gradients.";
 
@@ -180,7 +125,7 @@ CachedOp::CachedOp(
     size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries();
 
     full_graph_.outputs = fwd_graph_.outputs;
-    bwd_output_reqs_ = std::vector<OpReqType>(grad_graph_.outputs.size(), kWriteTo);
+    curr_grad_req_ = std::vector<bool>(grad_graph_.outputs.size(), true);
     for (const auto& i : grad_graph_.outputs) full_graph_.outputs.emplace_back(i);
     const auto& idx = full_graph_.indexed_graph();
 
@@ -224,10 +169,7 @@ CachedOp::CachedOp(
   }
 }
 
-CachedOp::~CachedOp() {
-}
-
-std::vector<nnvm::NodeEntry> CachedOp::Gradient(
+std::vector<nnvm::NodeEntry> Imperative::CachedOp::Gradient(
     const nnvm::NodePtr& node,
     const std::vector<nnvm::NodeEntry>& ograds) {
   using namespace nnvm;
@@ -264,15 +206,13 @@ std::vector<nnvm::NodeEntry> CachedOp::Gradient(
   return ret;
 }
 
-
-bool CachedOp::SetForwardGraph(
-    GraphInfo* info,
-    const bool recording,
-    const std::vector<NDArray*>& inputs) {
+nnvm::Graph Imperative::CachedOp::GetForwardGraph(
+    const bool recording, const std::vector<NDArray*>& inputs) {
   using namespace nnvm;
   using namespace imperative;
+  std::lock_guard<std::mutex> lock(mutex_);
   CHECK_EQ(inputs.size(), num_inputs());
-  nnvm::Graph& g = info->fwd_graph;
+  nnvm::Graph& g = fwd_graph_;
 
   ShapeVector shape_inputs;
   DTypeVector dtype_inputs;
@@ -297,22 +237,18 @@ bool CachedOp::SetForwardGraph(
     g.attrs.erase("forward_mem_plan");
     g.attrs.erase("full_mem_plan");
   } else if (g.attrs.count(recording ? "full_mem_plan" : "forward_mem_plan")) {
-    return true;
+    return g;
   }
 
   const auto& idx = g.indexed_graph();
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
+  for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
   CHECK_EQ(stypes.size(), storage.size());
   for (size_t i = 0; i < stypes.size(); i++) {
-    if (stypes[i] != kDefaultStorage) storage[i] = exec::kDynamicStorageID;
-  }
-  for (const auto i : idx.input_nodes()) {
-    storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
-  }
-  for (size_t i = 0; i < idx.outputs().size(); ++i) {
-    storage[idx.entry_id(idx.outputs()[i])] = exec::kExternalStorageID;
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
   }
 
   auto mem_plan = PlanMemory(
@@ -321,50 +257,51 @@ bool CachedOp::SetForwardGraph(
   g.attrs[recording ? "full_mem_plan" : "forward_mem_plan"] =
       std::make_shared<dmlc::any>(std::move(mem_plan));
 
-  return false;
+  return g;
 }
 
-bool CachedOp::SetBackwardGraph(
-    GraphInfo* info,
+nnvm::Graph Imperative::CachedOp::GetBackwardGraph(
+    const OpStatePtr& op_state,
     const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& inputs,
-    bool detect_inplace_addto) {
+    const std::vector<NDArray*>& inputs) {
   using namespace nnvm;
   using namespace imperative;
   std::lock_guard<std::mutex> lock(mutex_);
-  Context default_ctx = inputs[0]->ctx();
-  nnvm::Graph& g = info->full_graph;
-
-  if (info->bwd_output_reqs != reqs) {
-    info->bwd_output_reqs = reqs;
-    info->bwd_input_eid.clear();
+  nnvm::Graph& g = full_graph_;
+  auto& state = op_state.get_state<CachedOpState>();
+  bool req_match = true;
+  for (size_t i = 0; i < reqs.size(); ++i) {
+    if (curr_grad_req_[i] != (reqs[i] != kNullOp)) {
+      curr_grad_req_[i] = reqs[i] != kNullOp;
+      req_match = false;
+    }
+  }
+  if (!req_match) {
     g = nnvm::Graph();
     g.outputs = fwd_graph_.outputs;
     for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) {
-      if (info->bwd_output_reqs[i] == kNullOp) continue;
-      g.outputs.emplace_back(grad_graph_.outputs[i]);
+      if (curr_grad_req_[i]) g.outputs.emplace_back(grad_graph_.outputs[i]);
     }
-    g.attrs["context"] = std::make_shared<dmlc::any>(
-        std::vector<Context>(g.indexed_graph().num_nodes(), default_ctx));
+    bwd_input_eid_.clear();
   }
 
   const auto& idx = g.indexed_graph();
 
-  if (info->bwd_input_eid.size() != inputs.size()) {
-    info->bwd_input_eid.clear();
+  if (bwd_input_eid_.size() != inputs.size()) {
+    bwd_input_eid_.clear();
     for (const auto& i : bwd_ograd_dep_) {
       auto eid = idx.entry_id(ograd_entries_[i]);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
     for (const auto& i : bwd_in_dep_) {
       auto eid = idx.entry_id(idx.input_nodes()[i], 0);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
     for (const auto& i : bwd_out_dep_) {
       auto eid = idx.entry_id(idx.outputs()[i]);
-      info->bwd_input_eid.push_back(eid);
+      bwd_input_eid_.push_back(eid);
     }
-    CHECK_EQ(inputs.size(), info->bwd_input_eid.size());
+    CHECK_EQ(inputs.size(), bwd_input_eid_.size());
   }
 
   size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes();
@@ -375,22 +312,25 @@ bool CachedOp::SetBackwardGraph(
     for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) {
       for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)];
     }
-    for (size_t i = 0; i < inputs.size(); ++i) ++ref_count[info->bwd_input_eid[i]];
+    for (size_t i = 0; i < inputs.size(); ++i) ++ref_count[bwd_input_eid_[i]];
     for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)];
     g.attrs["backward_ref_count"] = std::make_shared<dmlc::any>(std::move(ref_count));
   }
 
-  auto shapes = info->fwd_graph.GetAttr<ShapeVector>("shape");
-  shapes.resize(idx.num_node_entries(), TShape());
-  auto dtypes = info->fwd_graph.GetAttr<DTypeVector>("dtype");
-  dtypes.resize(idx.num_node_entries(), -1);
-  auto stypes = info->fwd_graph.GetAttr<StorageTypeVector>("storage_type");
-  stypes.resize(idx.num_node_entries(), -1);
+  ShapeVector shapes(idx.num_node_entries(), TShape());
+  DTypeVector dtypes(idx.num_node_entries(), -1);
+  StorageTypeVector stypes(idx.num_node_entries(), -1);
+
+  for (size_t i = 0; i < num_forward_entries; ++i) {
+    shapes[i] = state.buff[i].shape();
+    dtypes[i] = state.buff[i].dtype();
+    stypes[i] = state.buff[i].storage_type();
+  }
 
   for (size_t i = 0; i < inputs.size(); ++i) {
-    shapes[info->bwd_input_eid[i]] = inputs[i]->shape();
-    dtypes[info->bwd_input_eid[i]] = inputs[i]->dtype();
-    stypes[info->bwd_input_eid[i]] = inputs[i]->storage_type();
+    shapes[bwd_input_eid_[i]] = inputs[i]->shape();
+    dtypes[bwd_input_eid_[i]] = inputs[i]->dtype();
+    stypes[bwd_input_eid_[i]] = inputs[i]->storage_type();
   }
 
   std::pair<uint32_t, uint32_t> node_range, entry_range;
@@ -402,353 +342,79 @@ bool CachedOp::SetBackwardGraph(
                               node_range, entry_range);
   match &= CheckAndInferType(&g, std::move(dtypes), false,
                              node_range, entry_range);
-  exec::DevMaskVector dev_mask(idx.num_nodes(), default_ctx.dev_mask());
+  exec::DevMaskVector dev_mask(idx.num_nodes(), inputs[0]->ctx().dev_mask());
   match &= CheckAndInferStorageType(&g, std::move(dev_mask), std::move(stypes),
                                     false, node_range, entry_range);
 
   if (!match) {
     g.attrs.erase("backward_mem_plan");
   } else if (g.attrs.count("backward_mem_plan")) {
-    return true;
+    return g;
   }
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
-  const auto& bwd_stypes = g.GetAttr<StorageTypeVector>("storage_type");
-  for (size_t i = 0; i < bwd_stypes.size(); i++) {
-    if (bwd_stypes[i] != kDefaultStorage) storage[i] = exec::kDynamicStorageID;
-  }
   for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID;
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID;
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >("backward_ref_count"),
-      {num_forward_nodes, idx.num_nodes()},
-      {num_forward_entries, idx.num_node_entries()},
-      detect_inplace_addto);
+      {num_forward_nodes, idx.num_nodes()}, {num_forward_entries, idx.num_node_entries()});
   g.attrs["backward_mem_plan"] = std::make_shared<dmlc::any>(std::move(mem_plan));
 
-  return false;
-}
-
-OpStatePtr CachedOp::GetCachedOpState(
-    const Context& ctx) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  for (const auto& i : cached_op_states_[ctx]) {
-    // only create one state per device when not using static memory
-    if (!config_.static_alloc || i.unique()) {
-      return i;
-    }
-  }
-  auto state_ptr = OpStatePtr::Create<CachedOpState>(ctx, fwd_graph_, full_graph_);
-
-  cached_op_states_[ctx].push_back(state_ptr);
-  return state_ptr;
-}
-
-void CachedOp::StaticAllocMemory(
-    const OpStatePtr& state_ptr,
-    bool recording,
-    bool keep_fwd) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& default_ctx = state.context;
-  nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  const auto& vstorage_inplace = g.GetAttr<std::vector<int> >("storage_inplace_index");
-  const auto& mem_plan = g.GetAttr<MemoryPlanVector>(
-      keep_fwd ? "backward_mem_plan" : (recording ? "full_mem_plan" : "forward_mem_plan"));
-  std::vector<int> addto_entry;
-  if (g.attrs.count("addto_entry")) {
-    addto_entry = g.GetAttr<std::vector<int> >("addto_entry");
-  }
-  size_t start_eid =
-      keep_fwd ? state.info.fwd_graph.indexed_graph().num_node_entries() : 0;
-  size_t end_eid = idx.num_node_entries();
-
-  if (!keep_fwd) state.fwd_alloc = false;
-  state.bwd_alloc = false;
-  for (size_t i = start_eid; i < state.buff.size(); ++i) {
-    state.buff[i] = NDArray();
-    state.arrays[i] = &state.buff[i];
-    state.array_reqs[i] = kNullOp;
-    state.dynamic_entries[i] = false;
-  }
-
-  for (auto i : idx.input_nodes()) {
-    auto eid = idx.entry_id(i, 0);
-    if (eid >= start_eid) state.dynamic_entries[eid] = true;
-  }
-  for (auto i : idx.outputs()) {
-    auto eid = idx.entry_id(i);
-    if (eid >= start_eid) state.dynamic_entries[eid] = true;
-  }
-
-  for (size_t i = start_eid; i < end_eid; ++i) {
-    if (addto_entry.size() && addto_entry[i]) {
-      state.array_reqs[i] = kAddTo;
-    } else if (vstorage_inplace[i] >= 0) {
-      state.array_reqs[i] = kWriteInplace;
-    } else if (vstorage_inplace[i] == -2) {
-      // -2 indicates that the entry is never referenced.
-      state.array_reqs[i] = kNullOp;
-    } else {
-      state.array_reqs[i] = kWriteTo;
-    }
-  }
-
-  auto& reuse_pool = keep_fwd ? state.bwd_reuse_pool : state.fwd_reuse_pool;
-  reuse_pool = imperative::AllocateMemory(
-      g, idx, default_ctx, start_eid, end_eid, mem_plan,
-      state.arrays, &state.array_reqs, std::move(reuse_pool));
-
-  state.recording = recording;
-  if (keep_fwd) {
-    state.bwd_alloc = true;
-  } else {
-    state.fwd_alloc = true;
-  }
+  return g;
 }
 
-void CachedOp::StaticInitExec(
-    const OpStatePtr& state_ptr,
-    bool recording,
-    bool keep_fwd) {
+void Imperative::CachedOp::Forward(
+    const std::shared_ptr<CachedOp>& op_ptr,
+    const std::vector<NDArray*>& args,
+    const std::vector<NDArray*>& outputs) {
   using namespace nnvm;
   using namespace imperative;
+  static const auto cached_op = nnvm::Op::Get("_CachedOp");
 
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& default_ctx = state.context;
-  nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  std::vector<int> skip_plus_node;
-  if (g.attrs.count("skip_plus_node")) {
-    skip_plus_node = g.GetAttr<std::vector<int> >("skip_plus_node");
-  }
-  size_t start_nid =
-      keep_fwd ? state.info.fwd_graph.indexed_graph().num_nodes() : 0;
-  size_t end_nid = idx.num_nodes();
-
-  if (!keep_fwd) state.fwd_exec_init = false;
-  state.bwd_exec_init = false;
-
-  for (size_t i = start_nid; i < state.execs.size(); ++i) {
-    state.execs[i].reset();
-    state.opr_segs[i] = EngineOprSeg();
-  }
-
-  if (!config_.static_shape) {
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      state.opr_segs[i].next_nid = i + 1;
-      state.opr_segs[i].skip = skip_plus_node.size() && skip_plus_node[i];
-    }
-  } else {
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      exec::CreateOpExecs(g, &state.execs, i);
-    }
-    exec::AttachOpResources(g, state.execs, start_nid, end_nid);
-
-    for (size_t i = start_nid; i < end_nid; ++i) {
-      bool skip = idx[i].source->is_variable();
-      for (size_t j = 0; !skip && j < idx[i].inputs.size(); ++j) {
-        skip = state.dynamic_entries[idx.entry_id(idx[i].inputs[j])];
-      }
-      for (size_t j = 0; !skip && j < idx[i].source->num_outputs(); ++j) {
-        skip = state.dynamic_entries[idx.entry_id(i, j)];
-      }
-      if (skip) continue;
-      SetupOpExec(g, i, state.execs[i], state.arrays, state.array_reqs);
-    }
+  CHECK_EQ(args.size(), fwd_args_idx_.size())
+      << "CachedOp requires " << fwd_args_idx_.size()
+      << " inputs but got " << args.size();
 
-    size_t bulk_size = idx.num_nodes();
-    std::unordered_set<uint32_t> excludes;
-    if (recording || keep_fwd) {
-      bulk_size = keep_fwd ? config_.backward_bulk_size : config_.forward_bulk_size;
-      for (const auto& i : idx.outputs()) excludes.insert(idx.entry_id(i));
-      for (const auto& i : idx.input_nodes()) excludes.insert(idx.entry_id(i, 0));
-    }
+  Context default_ctx = args[0]->ctx();
 
-    CreateEngineOpSeg(idx, default_ctx, start_nid, end_nid, bulk_size, excludes,
-                      state.execs, skip_plus_node, &state.opr_segs);
-  }
 
-  if (keep_fwd) {
-    state.bwd_exec_init = true;
-  } else {
-    state.fwd_exec_init = true;
+  std::vector<NDArray*> inputs(num_inputs());
+  for (index_t i = 0; i < fwd_args_idx_.size(); ++i) {
+    inputs[fwd_args_idx_[i]] = args[i];
   }
-}
-
-void CachedOp::StaticRunOps(
-    const Context& default_ctx,
-    const nnvm::Graph& g,
-    const OpStatePtr& state_ptr,
-    size_t start_nid,
-    size_t end_nid) {
-  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
-
-  bool profiling = profiler::Profiler::Get()->GetState() == profiler::Profiler::kRunning;
-  bool is_training = Imperative::Get()->is_training();
-  auto& state = state_ptr.get_state<CachedOpState>();
-  const auto& idx = g.indexed_graph();
-  const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
-  const auto& op_execs = state.execs;
-
-  std::vector<NDArray*> ndinputs, ndoutputs;
-  nnvm::ShapeVector arg_shapes;
-  nnvm::DTypeVector arg_dtypes;
-  std::vector<OpReqType> req;
+  if (fwd_params_idx_.size()) {
+    CHECK(params_.find(default_ctx) != params_.end())
+        << "CachedOp is not initialized on context " << default_ctx;
 
-  for (size_t i = start_nid; config_.static_shape && i < end_nid; ++i) {
-    if (op_execs[i]) op_execs[i]->op_ctx.is_train = is_training;
-  }
-
-  for (size_t i = start_nid; i < end_nid; i = state.opr_segs[i].next_nid) {
-    const auto& opr_seg = state.opr_segs[i];
-    if (opr_seg.skip) continue;
-    if (opr_seg.opr != nullptr) {
-      Engine::Get()->Push(opr_seg.opr.get(), default_ctx, 0, profiling);
-    } else {
-      const nnvm::IndexedGraph::Node& node = idx[i];
-      if (node.source->is_variable()) continue;
-      auto num_outputs = node.source->num_outputs();
-      ndinputs.clear();
-      ndinputs.reserve(node.inputs.size());
-      for (const auto& j : node.inputs) {
-        ndinputs.emplace_back(state.arrays[idx.entry_id(j)]);
-        CHECK(!ndinputs.back()->is_none());
-      }
-      ndoutputs.clear();
-      ndoutputs.reserve(num_outputs);
-      req.clear();
-      req.reserve(num_outputs);
-      for (size_t j = 0; j < num_outputs; ++j) {
-        size_t eid = idx.entry_id(i, j);
-        ndoutputs.emplace_back(state.arrays[eid]);
-        req.push_back(state.array_reqs[eid]);
-        CHECK(req.back() == kNullOp || !ndoutputs.back()->is_none());
-      }
-      const DispatchMode dispatch_mode = dispatch_modes[i];
-      if (createop.count(node.source->op())) {
-        arg_shapes.clear();
-        arg_dtypes.clear();
-        arg_shapes.reserve(ndinputs.size());
-        arg_dtypes.reserve(ndinputs.size());
-        for (size_t i = 0; i < ndinputs.size(); ++i) {
-          arg_shapes.emplace_back(ndinputs[i]->shape());
-          arg_dtypes.emplace_back(ndinputs[i]->dtype());
-        }
-        state.op_states[i] = createop[node.source->op()](
-            node.source->attrs, default_ctx, arg_shapes, arg_dtypes);
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs, req,
-            dispatch_mode, state.op_states[i]);
-      } else if (is_layer_backward.get(node.source->op(), false)) {
-        nnvm::Node* fwd_node = node.source->control_deps[0].get();
-        auto fwd_node_id = idx.node_id(fwd_node);
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs,
-            req, dispatch_mode, state.op_states[fwd_node_id]);
-      } else {
-        Imperative::Get()->InvokeOp(
-            default_ctx, node.source->attrs, ndinputs, ndoutputs, req,
-            dispatch_mode);
-      }
+    for (size_t i = 0; i < fwd_params_idx_.size(); ++i) {
+      inputs[fwd_params_idx_[i]] = &params_[default_ctx][i];
     }
   }
-}
-
-OpStatePtr CachedOp::StaticForward(
-    const Context& default_ctx,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
 
+  // Initialize
   bool recording = Imperative::Get()->is_recording();
-  auto state_ptr = GetCachedOpState(default_ctx);
-  auto& state = state_ptr.get_state<CachedOpState>();
-  std::lock_guard<std::mutex> lock(state.mutex);
-
-  bool match = SetForwardGraph(&state.info, recording, inputs);
-  match = match && state.recording != recording;
-
-  nnvm::Graph& g = state.info.fwd_graph;
+  nnvm::Graph g = GetForwardGraph(recording, inputs);
   const auto& idx = g.indexed_graph();
-  if (!state.fwd_alloc || !match)  {
-    StaticAllocMemory(state_ptr, recording, false);
-  }
-
-  if (config_.static_shape) {
-    for (auto i : config_.param_indices) {
-      auto nid = idx.input_nodes()[i];
-      if (!state.arrays[idx.entry_id(nid, 0)]->IsSame(*inputs[i])) {
-        match = false;
-        auto ptr = &state.buff[idx.entry_id(nid, 0)];
-        CHECK_EQ(state.arrays[idx.entry_id(nid, 0)], ptr);
-        *state.arrays[idx.entry_id(nid, 0)] = *inputs[i];
-        state.dynamic_entries[idx.entry_id(nid, 0)] = false;
-      }
-    }
-    for (auto i : config_.data_indices) {
-      auto eid = idx.entry_id(idx.input_nodes()[i], 0);
-      state.arrays[eid] = inputs[i];
-    }
-  } else {
-    for (size_t i = 0; i < num_inputs(); ++i) {
-      auto nid = idx.input_nodes()[i];
-      state.arrays[idx.entry_id(nid, 0)] = inputs[i];
-    }
-  }
-
-  if (!state.fwd_exec_init || !match) {
-    StaticInitExec(state_ptr, recording, false);
-  }
-
-  const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
-  const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  size_t num_inputs = idx.input_nodes().size();
 
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    auto eid = idx.entry_id(idx.outputs()[i]);
-    state.arrays[eid] = outputs[i];
-    if (!outputs[i]->is_none()) continue;
-    *outputs[i] = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
-                          shapes[eid], default_ctx, true, dtypes[eid]);
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i]->ctx(), default_ctx)
+        << "CachedOp requires all inputs to live on the same context. But "
+        << idx[idx.input_nodes()[0]].source->attrs.name << " is on " << default_ctx
+        << " while " << idx[idx.input_nodes()[i]].source->attrs.name << " is on "
+        << inputs[i]->ctx();
   }
 
-  StaticRunOps(default_ctx, g, state_ptr, 0, idx.num_nodes());
-
-  return recording ? state_ptr : OpStatePtr();
-}
-
-
-OpStatePtr CachedOp::DynamicForward(
-    const Context& default_ctx,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  // Initialize
-  bool recording = Imperative::Get()->is_recording();
-  auto op_state = OpStatePtr::Create<DynamicRuntime>();
-  auto& runtime = op_state.get_state<DynamicRuntime>();
-  {
-    auto state_ptr = GetCachedOpState(default_ctx);
-    auto& state = state_ptr.get_state<CachedOpState>();
-    std::lock_guard<std::mutex> lock(state.mutex);
-    SetForwardGraph(&state.info, recording, inputs);
-    runtime.info.fwd_graph = state.info.fwd_graph;
-  }
-  nnvm::Graph& g = runtime.info.fwd_graph;
-  const auto& idx = g.indexed_graph();
-  size_t num_inputs = idx.input_nodes().size();
-  auto& buff = runtime.buff;
-  auto& states = runtime.op_states;
+  auto op_state_ptr = OpStatePtr::Create<CachedOpState>();
+  auto& cached_op_state = op_state_ptr.get_state<CachedOpState>();
+  auto& buff = cached_op_state.buff;
+  auto& states = cached_op_state.states;
 
   // Allocate entries
   states.resize(idx.num_nodes());
@@ -780,98 +446,57 @@ OpStatePtr CachedOp::DynamicForward(
   AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(),
                  mem_plan, arrays, &array_reqs);
 
-  const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
-  const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
-
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    auto eid = idx.entry_id(idx.outputs()[i]);
-    arrays[eid] = outputs[i];
-    if (!outputs[i]->is_none()) continue;
-    *outputs[i] = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
-                          shapes[eid], default_ctx, true, dtypes[eid]);
-  }
-
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
 
   if (recording && !inlining_) Imperative::Get()->set_is_recording(false);
+  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size);
 
-  RunGraph(false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs),
-           std::move(ref_count), &states, dispatch_modes);
+  Imperative::Get()->RunGraph(
+      false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs),
+      std::move(ref_count), &states, dispatch_modes);
 
+  Engine::Get()->set_bulk_size(prev_bulk_size);
   Imperative::Get()->set_is_recording(recording);
 
-  return op_state;
-}
-
-void CachedOp::Forward(
-    const std::shared_ptr<CachedOp>& op_ptr,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<NDArray*>& outputs) {
-  static const auto cached_op = nnvm::Op::Get("_CachedOp");
-
-  CHECK_EQ(inputs.size(), num_inputs());
-
-  Context default_ctx = inputs[0]->ctx();
-
-  const auto& idx = fwd_graph_.indexed_graph();
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i]->ctx(), default_ctx)
-        << "CachedOp requires all inputs to live on the same context. But "
-        << idx[idx.input_nodes()[0]].source->attrs.name
-        << " is on " << default_ctx << " while "
-        << idx[idx.input_nodes()[i]].source->attrs.name
-        << " is on " << inputs[i]->ctx();
-  }
-
-  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size);
-
-  OpStatePtr op_state;
-  if (config_.static_alloc) {
-    op_state = StaticForward(default_ctx, inputs, outputs);
-  } else {
-    op_state = DynamicForward(default_ctx, inputs, outputs);
+  for (size_t i = 0; i < idx.num_node_entries(); ++i) {
+    if (arrays[i] == &buff[i]) continue;
+    buff[i].shape_ = arrays[i]->shape_;
+    buff[i].dtype_ = arrays[i]->dtype_;
+    buff[i].storage_type_ = arrays[i]->storage_type_;
   }
 
-  Engine::Get()->set_bulk_size(prev_bulk_size);
-
-  if (Imperative::Get()->is_recording() && !inlining_) {
+  if (recording && !inlining_) {
     nnvm::NodeAttrs attrs;
     attrs.op = cached_op;
     attrs.name = "_cachedop";
     attrs.parsed = op_ptr;
     Imperative::Get()->RecordOp(
-        std::move(attrs), inputs, outputs, op_state,
+        std::move(attrs), inputs, outputs, op_state_ptr,
         &save_inputs(), &save_outputs());
   }
 }
 
 
-void CachedOp::DynamicBackward(
+void Imperative::CachedOp::Backward(
     const bool retain_graph,
-    const OpStatePtr& op_state,
+    const OpStatePtr& state,
     const std::vector<NDArray*>& inputs,
     const std::vector<OpReqType>& reqs,
     const std::vector<NDArray*>& outputs) {
   using namespace nnvm;
   using namespace imperative;
+  CHECK(!Imperative::Get()->is_recording())
+      << "CachedOp does not support higher order gradients. "
+      << "If you want to do backward with create_graph=True please "
+      << "do not use hybridize.";
 
   // Initialize
-  Context default_ctx = outputs[0]->ctx();
-  auto& runtime = op_state.get_state<DynamicRuntime>();
-  {
-    auto state_ptr = GetCachedOpState(default_ctx);
-    auto& state = state_ptr.get_state<CachedOpState>();
-    std::lock_guard<std::mutex> lock(state.mutex);
-    state.info.fwd_graph = runtime.info.fwd_graph;
-    SetBackwardGraph(&state.info, reqs, inputs);
-    runtime.info.full_graph = state.info.full_graph;
-    runtime.info.bwd_input_eid = state.info.bwd_input_eid;
-  }
-  nnvm::Graph& g = runtime.info.full_graph;
+  nnvm::Graph g = GetBackwardGraph(state, reqs, inputs);
   const auto& idx = g.indexed_graph();
-  auto& buff = runtime.buff;
-  auto& states = runtime.op_states;
+
+  auto& cached_op_state = state.get_state<CachedOpState>();
+  auto& buff = cached_op_state.buff;
+  auto& states = cached_op_state.states;
 
   size_t num_forward_outputs = fwd_graph_.outputs.size();
   size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes();
@@ -881,7 +506,7 @@ void CachedOp::DynamicBackward(
   arrays.reserve(buff.size());
   for (size_t i = 0; i < buff.size(); ++i) arrays.push_back(&buff[i]);
   for (size_t i = 0; i < inputs.size(); ++i) {
-    arrays[runtime.info.bwd_input_eid[i]] = inputs[i];
+    arrays[bwd_input_eid_[i]] = inputs[i];
   }
   for (size_t i = 0, j = num_forward_outputs; i < reqs.size(); ++i) {
     if (reqs[i] == kNullOp) continue;
@@ -905,14 +530,20 @@ void CachedOp::DynamicBackward(
     if (ref_count[i] == 0) array_reqs[i] = kNullOp;
   }
 
+  Context default_ctx = outputs[0]->ctx();
   const auto& mem_plan = g.GetAttr<MemoryPlanVector >("backward_mem_plan");
   AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(),
                  mem_plan, arrays, &array_reqs);
 
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
 
-  RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(),
-           std::move(array_reqs), std::move(ref_count), &states, dispatch_modes);
+  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size);
+
+  Imperative::Get()->RunGraph(
+      retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(),
+      std::move(array_reqs), std::move(ref_count), &states, dispatch_modes);
+
+  Engine::Get()->set_bulk_size(prev_bulk_size);
 
   if (retain_graph) {
     buff.resize(num_forward_entries);
@@ -922,99 +553,6 @@ void CachedOp::DynamicBackward(
   }
 }
 
-void CachedOp::StaticBackward(
-    const bool retain_graph,
-    const OpStatePtr& state_ptr,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace nnvm;
-  using namespace imperative;
-
-  Context default_ctx = outputs[0]->ctx();
-
-  auto& state = state_ptr.get_state<CachedOpState>();
-  std::lock_guard<std::mutex> lock(state.mutex);
-
-  bool match = SetBackwardGraph(&state.info, reqs, inputs, true);
-
-  nnvm::Graph& g = state.info.full_graph;
-  const auto& idx = g.indexed_graph();
-  auto num_forward_nodes = state.info.fwd_graph.indexed_graph().num_nodes();
-
-  if (!state.bwd_alloc || !match) {
-    StaticAllocMemory(state_ptr, true, true);
-  }
-
-  if (config_.static_shape) {
-    for (auto i : config_.param_indices) {
-      const auto iter = fwd_input_to_grad_output_.find(i);
-      if (iter == fwd_input_to_grad_output_.end()) continue;
-      auto entry = grad_graph_.outputs[iter->second];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      if (!state.arrays[eid]->IsSame(*outputs[iter->second]) ||
-          !(state.array_reqs[eid] == reqs[iter->second])) {
-        match = false;
-        state.array_reqs[eid] = reqs[iter->second];
-        *state.arrays[eid] = *outputs[iter->second];
-        state.dynamic_entries[eid] = false;
-      }
-    }
-    for (auto i : config_.data_indices) {
-      const auto iter = fwd_input_to_grad_output_.find(i);
-      if (iter == fwd_input_to_grad_output_.end()) continue;
-      auto entry = grad_graph_.outputs[iter->second];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      state.array_reqs[eid] = reqs[iter->second];
-      state.arrays[eid] = outputs[iter->second];
-    }
-  } else {
-    for (size_t i = 0; i < grad_graph_.outputs.size(); ++i) {
-      auto entry = grad_graph_.outputs[i];
-      if (!idx.exist(entry.node.get())) continue;
-      auto eid = idx.entry_id(entry);
-      state.array_reqs[eid] = reqs[i];
-      state.arrays[eid] = outputs[i];
-    }
-  }
-
-  if (!state.bwd_exec_init || !match) {
-    StaticInitExec(state_ptr, true, true);
-  }
-
-  for (size_t i = 0; i < state.info.bwd_input_eid.size(); ++i) {
-    auto eid = state.info.bwd_input_eid[i];
-    if (state.dynamic_entries[eid]) state.arrays[eid] = inputs[i];
-  }
-
-  StaticRunOps(default_ctx, g, state_ptr, num_forward_nodes, idx.num_nodes());
-}
-
-void CachedOp::Backward(
-    const bool retain_graph,
-    const OpStatePtr& state,
-    const std::vector<NDArray*>& inputs,
-    const std::vector<OpReqType>& reqs,
-    const std::vector<NDArray*>& outputs) {
-  using namespace imperative;
-  CHECK(!Imperative::Get()->is_recording())
-      << "CachedOp does not support higher order gradients. "
-      << "If you want to do backward with create_graph=True please "
-      << "do not use hybridize.";
-
-  int prev_bulk_size = Engine::Get()->set_bulk_size(config_.backward_bulk_size);
-
-  if (config_.static_alloc) {
-    StaticBackward(retain_graph, state, inputs, reqs, outputs);
-  } else {
-    DynamicBackward(retain_graph, state, inputs, reqs, outputs);
-  }
-
-  Engine::Get()->set_bulk_size(prev_bulk_size);
-}
-
 
 NNVM_REGISTER_OP(_CachedOp)
 .set_num_inputs([](const NodeAttrs& attrs) {
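
Under the constructor signature restored above, parameter NDArrays are handed over once and bucketed by context, so Forward receives only the user-supplied argument arrays. A rough sketch of the calling convention implied by the diff (sym, flags, arg_names, params, args, and outputs are assumed to be prepared by the caller; none of the values are real):

    auto op = std::make_shared<Imperative::CachedOp>(sym, flags, arg_names, params);
    // All args must live on one context, and the op must hold parameter
    // copies for that context, or the CHECKs in Forward above will fire.
    op->Forward(op, args, outputs);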
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h
deleted file mode 100644
index 60a40c5e4a5..00000000000
--- a/src/imperative/cached_op.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef MXNET_IMPERATIVE_CACHED_OP_H_
-#define MXNET_IMPERATIVE_CACHED_OP_H_
-
-#include <mxnet/imperative.h>
-#include <vector>
-#include <atomic>
-#include <utility>
-#include <string>
-#include <unordered_map>
-
-namespace mxnet {
-/*! \brief CachedOp Parameters */
-struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
-  uint32_t inline_limit;
-  uint32_t forward_bulk_size;
-  uint32_t backward_bulk_size;
-  bool static_alloc;
-  bool static_shape;
-  nnvm::Tuple<uint32_t> data_indices;
-  nnvm::Tuple<uint32_t> param_indices;
-  DMLC_DECLARE_PARAMETER(CachedOpConfig) {
-    DMLC_DECLARE_FIELD(static_alloc)
-    .set_default(false)
-    .describe("Statically allocate memory to improve speed. "
-              "Memory usage may increase.");
-    DMLC_DECLARE_FIELD(static_shape)
-    .set_default(false)
-    .describe("Optimize for invariant input shapes between iterations. "
-              "Must also set static_alloc to True. "
-              "Change of input shapes is still allowed but slower.");
-    DMLC_DECLARE_FIELD(inline_limit)
-    .set_default(2)
-    .describe("Maximum number of operators that can be inlined.");
-    DMLC_DECLARE_FIELD(forward_bulk_size)
-    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
-    .describe("Segment size of bulk execution during forward pass.");
-    DMLC_DECLARE_FIELD(backward_bulk_size)
-    .set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
-    .describe("Segment size of bulk execution during backward pass.");
-    DMLC_DECLARE_FIELD(data_indices)
-    .set_default(nnvm::Tuple<uint32_t>())
-    .describe("Position of argument variables.");
-    DMLC_DECLARE_FIELD(param_indices)
-    .set_default(nnvm::Tuple<uint32_t>())
-    .describe("Position of parameters.");
-  }
-};
-
-class CachedOp {
- public:
-  CachedOp(
-      const nnvm::Symbol& sym,
-      const std::vector<std::pair<std::string, std::string> >& flags);
-  ~CachedOp();
-  uint32_t num_inputs() {
-    return fwd_graph_.indexed_graph().input_nodes().size();
-  }
-  uint32_t num_outputs() {
-    return fwd_graph_.outputs.size();
-  }
-  uint32_t num_backward_inputs() {
-    return bwd_ograd_dep_.size() + bwd_in_dep_.size() + bwd_out_dep_.size();
-  }
-  std::vector<bool>& save_inputs() {
-    return save_inputs_;
-  }
-  std::vector<bool>& save_outputs() {
-    return save_outputs_;
-  }
-  const std::unordered_set<uint32_t>& mutable_input_nodes() {
-    return fwd_graph_.indexed_graph().mutable_input_nodes();
-  }
-  std::vector<nnvm::NodeEntry> Gradient(
-      const nnvm::NodePtr& node,
-      const std::vector<nnvm::NodeEntry>& ograds);
-  void Forward(
-      const std::shared_ptr<CachedOp>& op_ptr,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void Backward(
-      const bool retain_graph,
-      const OpStatePtr& state,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-
- private:
-  struct GraphInfo;
-  struct DynamicRuntime;
-  struct CachedOpState;
-
-  OpStatePtr GetCachedOpState(const Context& ctx);
-  bool SetForwardGraph(
-      GraphInfo* info,
-      const bool recording,
-      const std::vector<NDArray*>& inputs);
-  bool SetBackwardGraph(
-      GraphInfo* info,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& inputs,
-      bool detect_inplace_addto = false);
-  OpStatePtr DynamicForward(
-      const Context& default_ctx,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void DynamicBackward(
-      const bool retain_graph,
-      const OpStatePtr& op_state,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-  void StaticAllocMemory(
-      const OpStatePtr& state_ptr,
-      bool recording,
-      bool keep_fwd);
-  void StaticInitExec(
-      const OpStatePtr& state_ptr,
-      bool recording,
-      bool keep_fwd);
-  void StaticRunOps(
-      const Context& default_ctx,
-      const nnvm::Graph& g,
-      const OpStatePtr& state_ptr,
-      size_t start_nid,
-      size_t end_nid);
-  OpStatePtr StaticForward(
-      const Context& default_ctx,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<NDArray*>& outputs);
-  void StaticBackward(
-      const bool retain_graph,
-      const OpStatePtr& state_ptr,
-      const std::vector<NDArray*>& inputs,
-      const std::vector<OpReqType>& reqs,
-      const std::vector<NDArray*>& outputs);
-
-  CachedOpConfig config_;
-  nnvm::Graph fwd_graph_;
-  nnvm::Graph grad_graph_;
-  nnvm::Graph full_graph_;
-  bool inlining_;
-  std::vector<nnvm::NodeEntry> ograd_entries_;
-  std::vector<uint32_t> bwd_in_dep_, bwd_out_dep_, bwd_ograd_dep_;
-  std::unordered_map<uint32_t, uint32_t> fwd_input_to_grad_output_;
-  std::vector<bool> save_inputs_, save_outputs_;
-  std::vector<OpReqType> bwd_output_reqs_;
-
-  std::mutex mutex_;
-  std::unordered_map<Context, std::vector<OpStatePtr> > cached_op_states_;
-};
-
-using CachedOpPtr = std::shared_ptr<CachedOp>;
-
-}  // namespace mxnet
-#endif  // MXNET_IMPERATIVE_CACHED_OP_H_
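
The header deleted above declared CachedOpConfig as a dmlc parameter struct, so the string flag pairs passed to the CachedOp constructor were parsed into typed fields with the defaults shown. Purely illustrative values (hypothetical, matching the field names declared above):

    std::vector<std::pair<std::string, std::string>> flags = {
      {"inline_limit",      "2"},    // default declared above
      {"forward_bulk_size", "15"},   // default read from MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN
    };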
diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc
index e1654259a2f..7caf305eac7 100644
--- a/src/imperative/imperative.cc
+++ b/src/imperative/imperative.cc
@@ -19,7 +19,6 @@
 #include <unordered_set>
 #include <iostream>
 #include "./imperative_utils.h"
-#include "./cached_op.h"
 
 namespace mxnet {
 #if DMLC_CXX11_THREAD_LOCAL
@@ -267,6 +266,95 @@ void Imperative::RecordOp(
   }
 }
 
+void Imperative::RunGraph(
+    const bool retain_graph,
+    const nnvm::IndexedGraph& idx,
+    const std::vector<NDArray*> arrays,
+    size_t node_start, size_t node_end,
+    std::vector<OpReqType>&& array_reqs,
+    std::vector<uint32_t>&& ref_count,
+    std::vector<OpStatePtr> *p_states,
+    const DispatchModeVector &dispatch_modes) {
+  using namespace nnvm;
+  using namespace imperative;
+  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
+  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
+  static const auto bwd_cached_op = Op::Get("_backward_CachedOp");
+
+  std::vector<OpStatePtr>& states = *p_states;
+  bool recording = is_recording();
+
+  std::vector<NDArray*> ndinputs, ndoutputs;
+  ShapeVector arg_shapes;
+  DTypeVector arg_dtypes;
+  std::vector<OpReqType> req;
+
+  for (size_t i = node_start; i < node_end; ++i) {
+    const nnvm::IndexedGraph::Node& node = idx[i];
+    if (node.source->op() == nullptr) continue;
+    auto num_outputs = node.source->num_outputs();
+    ndinputs.clear();
+    ndinputs.reserve(node.inputs.size());
+    for (const auto& j : node.inputs) {
+      ndinputs.emplace_back(arrays[idx.entry_id(j)]);
+      CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index;
+    }
+    ndoutputs.clear();
+    ndoutputs.reserve(num_outputs);
+    req.clear();
+    req.reserve(num_outputs);
+    for (size_t j = 0; j < num_outputs; ++j) {
+      size_t eid = idx.entry_id(i, j);
+      ndoutputs.emplace_back(arrays[eid]);
+      req.push_back(array_reqs[eid]);
+      CHECK(!ndoutputs.back()->is_none());
+    }
+    const Context& ctx = ndoutputs[0]->ctx();
+    const DispatchMode dispatch_mode = dispatch_modes[i];
+    if (node.source->op() == bwd_cached_op) {
+      const auto& cached_op = dmlc::get<CachedOpPtr>(node.source->attrs.parsed);
+      nnvm::Node* fwd_node = node.source->control_deps[0].get();
+      auto fwd_node_id = idx.node_id(fwd_node);
+      cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs);
+    } else if (createop.count(node.source->op())) {
+      arg_shapes.clear();
+      arg_dtypes.clear();
+      arg_shapes.reserve(ndinputs.size());
+      arg_dtypes.reserve(ndinputs.size());
+      for (size_t i = 0; i < ndinputs.size(); ++i) {
+        arg_shapes.emplace_back(ndinputs[i]->shape());
+        arg_dtypes.emplace_back(ndinputs[i]->dtype());
+      }
+      states[i] = createop[node.source->op()](
+          node.source->attrs, ctx, arg_shapes, arg_dtypes);
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[i]);
+      if (recording) RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[i]);
+    } else if (is_layer_backward.get(node.source->op(), false)) {
+      nnvm::Node* fwd_node = node.source->control_deps[0].get();
+      auto fwd_node_id = idx.node_id(fwd_node);
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs,
+               req, dispatch_mode, states[fwd_node_id]);
+      if (recording) {
+        RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[fwd_node_id]);
+      }
+    } else {
+      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode);
+      if (recording) RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs);
+    }
+
+    for (const auto& j : node.inputs) {
+      size_t eid = idx.entry_id(j);
+      --ref_count[eid];
+      if (ref_count[eid] == 0) arrays[eid]->ptr_.reset();
+    }
+    for (size_t j = 0; j < ndoutputs.size(); ++j) {
+      size_t eid = idx.entry_id(i, j);
+      if (ref_count[eid] == 0) arrays[eid]->ptr_.reset();
+    }
+  }
+}
+
+
 std::vector<NDArray*> Imperative::Backward(
     const std::vector<NDArray*>& outputs,
     const std::vector<NDArray*>& ograds,
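
The Imperative::RunGraph added above dispatches each node one of three ways: a _backward_CachedOp node calls back into the cached op's Backward with the state created by its forward node, a stateful op gets a fresh state from FCreateOpState before InvokeOp, and a layer-backward op reuses its forward node's state. A condensed outline (all names taken from the diff; bookkeeping elided):

    if (node.source->op() == bwd_cached_op) {
      cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs);
    } else if (createop.count(node.source->op())) {
      states[i] = createop[node.source->op()](node.source->attrs, ctx, arg_shapes, arg_dtypes);
      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[i]);
    } else if (is_layer_backward.get(node.source->op(), false)) {
      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[fwd_node_id]);
    } else {
      InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode);
    }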
diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc
deleted file mode 100644
index 464aefc220d..00000000000
--- a/src/imperative/imperative_utils.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "./imperative_utils.h"
-#include "./cached_op.h"
-
-namespace mxnet {
-namespace imperative {
-void RunGraph(
-    const bool retain_graph,
-    const nnvm::IndexedGraph& idx,
-    const std::vector<NDArray*> arrays,
-    size_t node_start, size_t node_end,
-    std::vector<OpReqType>&& array_reqs,
-    std::vector<uint32_t>&& ref_count,
-    std::vector<OpStatePtr> *p_states,
-    const DispatchModeVector &dispatch_modes) {
-  using namespace nnvm;
-  using namespace imperative;
-  static auto& createop = nnvm::Op::GetAttr<FCreateOpState>("FCreateOpState");
-  static auto& is_layer_backward = Op::GetAttr<bool>("TIsLayerOpBackward");
-  static const auto bwd_cached_op = Op::Get("_backward_CachedOp");
-
-  const auto imp = Imperative::Get();
-
-  std::vector<OpStatePtr>& states = *p_states;
-  bool recording = imp->is_recording();
-
-  std::vector<NDArray*> ndinputs, ndoutputs;
-  ShapeVector arg_shapes;
-  DTypeVector arg_dtypes;
-  std::vector<OpReqType> req;
-
-  for (size_t i = node_start; i < node_end; ++i) {
-    const nnvm::IndexedGraph::Node& node = idx[i];
-    if (node.source->op() == nullptr) continue;
-    auto num_outputs = node.source->num_outputs();
-    ndinputs.clear();
-    ndinputs.reserve(node.inputs.size());
-    for (const auto& j : node.inputs) {
-      ndinputs.emplace_back(arrays[idx.entry_id(j)]);
-      CHECK(!ndinputs.back()->is_none()) << idx[j.node_id].source->attrs.name << " " << j.index;
-    }
-    ndoutputs.clear();
-    ndoutputs.reserve(num_outputs);
-    req.clear();
-    req.reserve(num_outputs);
-    for (size_t j = 0; j < num_outputs; ++j) {
-      size_t eid = idx.entry_id(i, j);
-      ndoutputs.emplace_back(arrays[eid]);
-      req.push_back(array_reqs[eid]);
-      CHECK(array_reqs[eid] == kNullOp || !ndoutputs.back()->is_none());
-    }
-    const Context& ctx = ndoutputs[0]->ctx();
-    const DispatchMode dispatch_mode = dispatch_modes[i];
-    if (node.source->op() == bwd_cached_op) {
-      const auto& cached_op = dmlc::get<CachedOpPtr>(node.source->attrs.parsed);
-      nnvm::Node* fwd_node = node.source->control_deps[0].get();
-      auto fwd_node_id = idx.node_id(fwd_node);
-      cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs);
-    } else if (createop.count(node.source->op())) {
-      arg_shapes.clear();
-      arg_dtypes.clear();
-      arg_shapes.reserve(ndinputs.size());
-      arg_dtypes.reserve(ndinputs.size());
-      for (size_t i = 0; i < ndinputs.size(); ++i) {
-        arg_shapes.emplace_back(ndinputs[i]->shape());
-        arg_dtypes.emplace_back(ndinputs[i]->dtype());
-      }
-      states[i] = createop[node.source->op()](
-          node.source->attrs, ctx, arg_shapes, arg_dtypes);
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, states[i]);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[i]);
-      }
-    } else if (is_layer_backward.get(node.source->op(), false)) {
-      nnvm::Node* fwd_node = node.source->control_deps[0].get();
-      auto fwd_node_id = idx.node_id(fwd_node);
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs,
-               req, dispatch_mode, states[fwd_node_id]);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, states[fwd_node_id]);
-      }
-    } else {
-      imp->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode);
-      if (recording) {
-        imp->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs);
-      }
-    }
-
-    for (const auto& j : node.inputs) {
-      size_t eid = idx.entry_id(j);
-      --ref_count[eid];
-      if (ref_count[eid] == 0) *arrays[eid] = NDArray();
-    }
-    for (size_t j = 0; j < ndoutputs.size(); ++j) {
-      size_t eid = idx.entry_id(i, j);
-      if (ref_count[eid] == 0) *arrays[eid] = NDArray();
-    }
-  }
-}
-
-}  // namespace imperative
-}  // namespace mxnet
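
Reading the deleted file against the Imperative::RunGraph added to imperative.cc earlier in this diff, the visible behavioral delta in entry cleanup is how a dead entry is freed once its reference count reaches zero: the old code replaced the whole NDArray, while the new code resets only the storage pointer, keeping shape/dtype metadata intact. Side by side (both lines appear verbatim in this diff):

    *arrays[eid] = NDArray();      // deleted version: swap in a fresh empty array
    arrays[eid]->ptr_.reset();     // new version: drop the storage chunk, keep metadata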
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 726531d0299..06b7e058dd1 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -23,7 +23,6 @@
 #include <utility>
 #include <algorithm>
 #include <vector>
-#include <map>
 #include <string>
 #include "../executor/graph_executor.h"
 #include "../executor/exec_pass.h"
@@ -39,24 +38,11 @@ namespace mxnet {
 namespace imperative {
 
 struct MemoryPlanInfo {
-  int storage_id;
-  uint32_t root;
+  uint32_t sid;
   size_t size;
   bool inplace;
 };
 
-struct EngineOprDeleter {
-  void operator()(engine::Opr* handle) {
-    Engine::Get()->DeleteOperator(handle);
-  }
-};
-
-struct EngineOprSeg {
-  bool skip;
-  size_t next_nid;
-  std::unique_ptr<engine::Opr, EngineOprDeleter> opr;
-};
-
 using MemoryPlanVector = std::vector<MemoryPlanInfo>;
 
 inline Context GetContext(const nnvm::NodeAttrs& attrs,
@@ -729,12 +715,10 @@ inline std::vector<Context> PlaceDevice(const nnvm::IndexedGraph& idx) {
 
 
 inline MemoryPlanVector PlanMemory(
-    nnvm::Graph* p_g,
-    nnvm::StorageVector&& storage,
+    nnvm::Graph* p_g, nnvm::StorageVector&& storage,
     const std::vector<uint32_t>& ref_count,
     const std::pair<uint32_t, uint32_t>& node_range = {0, 0},
-    const std::pair<uint32_t, uint32_t>& entry_range = {0, 0},
-    bool detect_inplace_addto = false) {
+    const std::pair<uint32_t, uint32_t>& entry_range = {0, 0}) {
   using namespace nnvm;
   nnvm::Graph& g = *p_g;
   const auto& idx = g.indexed_graph();
@@ -744,31 +728,31 @@ inline MemoryPlanVector PlanMemory(
   g.attrs["ref_count"] = std::make_shared<dmlc::any>(ref_count);
   g.attrs["storage"] = std::make_shared<dmlc::any>(std::move(storage));
   g = nnvm::ApplyPass(g, "PlanMemory");
-  if (detect_inplace_addto) g = exec::DetectInplaceAddTo(g);
 
   const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
   const auto& shapes = g.GetAttr<ShapeVector>("shape");
-  const auto& storage_inplace = g.GetAttr<std::vector<int> >("storage_inplace_index");
-  const auto& storage_ids = g.GetAttr<StorageVector>("storage_id");
+  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  auto storage_ids = g.MoveCopyAttr<StorageVector>("storage_id");
+  auto storage_inplace = g.MoveCopyAttr<std::vector<int> >("storage_inplace_index");
   uint32_t entry_start = entry_range.first;
   uint32_t entry_end =
       entry_range.second > entry_start ? entry_range.second : idx.num_node_entries();
   MemoryPlanVector mem_plan(idx.num_node_entries());
-  std::unordered_map<int, uint32_t> sid_to_root;
+  std::unordered_map<int, uint32_t> sid_to_loc;
 
   for (uint32_t i = entry_start; i < entry_end; ++i) {
+    if (stypes[i] != kDefaultStorage) continue;
     if (storage_ids[i] < 0) {
-      mem_plan[i] = {storage_ids[i], i, 0, false};
-    } else if (!sid_to_root.count(storage_ids[i])) {
+      mem_plan[i] = {i, mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size(), false};
+    } else if (!sid_to_loc.count(storage_ids[i])) {
       CHECK_LT(storage_inplace[i], 0);
-      sid_to_root[storage_ids[i]] = i;
-      mem_plan[i] = {storage_ids[i], i,
-                     mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size(),
-                     false};
+      sid_to_loc[storage_ids[i]] = i;
+      mem_plan[i].sid = i;
+      mem_plan[i].size = mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size();
     } else {
-      uint32_t root = sid_to_root[storage_ids[i]];
-      mem_plan[i] = {storage_ids[i], root, 0, storage_inplace[i] >= 0};
-      mem_plan[root].size = std::max(mem_plan[root].size,
+      uint32_t loc = sid_to_loc[storage_ids[i]];
+      mem_plan[i] = {loc, 0, storage_inplace[i] >= 0};
+      mem_plan[loc].size = std::max(mem_plan[loc].size,
           mshadow::mshadow_sizeof(dtypes[i]) * shapes[i].Size());
     }
   }
@@ -777,213 +761,39 @@ inline MemoryPlanVector PlanMemory(
 }
 
 
-inline std::multimap<size_t, NDArray> AllocateMemory(
-    const nnvm::Graph& g,
-    const nnvm::IndexedGraph& idx,
-    const Context& default_ctx,
-    const uint32_t entry_start, const uint32_t entry_end,
-    const MemoryPlanVector& mem_plan,
-    const std::vector<NDArray*>& arrays,
-    std::vector<OpReqType> *array_reqs,
-    std::multimap<size_t, NDArray>&& pool = std::multimap<size_t, NDArray>()) {
+inline void AllocateMemory(const nnvm::Graph& g,
+                    const nnvm::IndexedGraph& idx,
+                    const Context& default_ctx,
+                    const uint32_t entry_start, const uint32_t entry_end,
+                    const MemoryPlanVector& mem_plan,
+                    const std::vector<NDArray*>& arrays,
+                    std::vector<OpReqType> *array_reqs) {
   using namespace nnvm;
   const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
   const auto& shapes = g.GetAttr<ShapeVector>("shape");
   const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
 
-  std::multimap<size_t, NDArray> new_pool;
-
   for (uint32_t i = entry_start; i < entry_end; ++i) {
-    if (mem_plan[i].storage_id == exec::kExternalStorageID) continue;
-    CHECK(arrays[i]->is_none());
-    if (mem_plan[i].storage_id == exec::kDynamicStorageID) {
-      *arrays[i] = NDArray(static_cast<NDArrayStorageType>(stypes[i]),
-                           shapes[i], default_ctx, true, dtypes[i]);
-      continue;
-    }
-    CHECK_EQ(stypes[i], kDefaultStorage);
-    if (mem_plan[i].root == i) {
-      CHECK_GT(mem_plan[i].size, 0);
-      auto iter = pool.lower_bound(mem_plan[i].size);
-      if (iter != pool.end()) {
-        *arrays[i] = iter->second.AsArray(shapes[i], dtypes[i]);
-        new_pool.insert(*iter);
-        pool.erase(iter);
-      } else {
+    if (!arrays[i]->is_none()) continue;
+    if (stypes[i] == kDefaultStorage) {
+      if (mem_plan[i].sid == i) {
+        CHECK_GT(mem_plan[i].size, 0);
         NDArray buff(TShape({static_cast<nnvm::dim_t>(mem_plan[i].size)}),
                      default_ctx, true, mshadow::kUint8);
         *arrays[i] = buff.AsArray(shapes[i], dtypes[i]);
-        new_pool.insert({mem_plan[i].size, buff});
-      }
-    } else {
-      CHECK_GE(mem_plan[mem_plan[i].root].storage_id, 0);
-      *arrays[i] = arrays[mem_plan[i].root]->AsArray(shapes[i], dtypes[i]);
-      if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) {
-        array_reqs->at(i) = kWriteInplace;
-      }
-    }
-  }
-
-  return new_pool;
-}
-
-inline void SetupOpExec(
-    const nnvm::Graph& g,
-    size_t nid,
-    const std::shared_ptr<exec::OpExecutor>& exec,
-    const std::vector<NDArray*> arrays,
-    const std::vector<OpReqType> array_reqs) {
-  const auto& idx = g.indexed_graph();
-  const auto& inode = idx[nid];
-  CHECK_EQ(exec->in_array.size(), 0U);
-  CHECK_EQ(exec->out_array.size(), 0U);
-  for (const auto& e : inode.inputs) {
-    CHECK(!arrays[idx.entry_id(e)]->is_none()) << inode.source->attrs.name;
-    exec->in_array.push_back(*arrays[idx.entry_id(e)]);
-  }
-  for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
-    uint32_t eid = idx.entry_id(nid, index);
-    CHECK(!arrays[eid]->is_none()) << inode.source->attrs.name;
-    exec->out_array.push_back(*arrays[eid]);
-    exec->req.push_back(array_reqs[eid]);
-  }
-
-  exec->Setup();
-}
-
-inline Engine::OprHandle CreateEngineOp(
-    const Context& default_ctx,
-    const std::vector<std::shared_ptr<exec::OpExecutor> >& execs) {
-  CHECK_GT(execs.size(), 0);
-  std::vector<Engine::VarHandle> use_vars, mutate_vars;
-
-  for (const auto& exec : execs) {
-    CHECK_GT(exec->out_array.size(), 0);
-    CHECK(execs.size() == 1 || exec->exec_type() == ExecType::kSync);
-
-    // the variables
-    for (const auto& nd : exec->in_array) {
-      use_vars.push_back(nd.var());
-    }
-    for (auto& r : exec->op_ctx.requested) {
-      mutate_vars.push_back(r.var);
-    }
-    for (auto& nd : exec->out_array) {
-      mutate_vars.push_back(nd.var());
-    }
-    if (exec->var() != nullptr) {
-      mutate_vars.push_back(exec->var());
-    }
-  }
-
-  // dedup vars
-  Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars);
-  bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask;
-  bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync;
-
-  auto exec_fun = [execs, is_async, is_gpu] (
-      RunContext ctx, Engine::CallbackOnComplete on_complete) {
-    if (is_async) {
-      execs[0]->op_ctx.async_on_complete = on_complete;
-    }
-    for (const auto& exec : execs) exec->Run(ctx, is_gpu);
-    // call on complete only if it is async op
-    if (!is_async) {
-      if (is_gpu) {
-      #if MXNET_USE_CUDA
-        // Wait GPU kernel to finish.
-        ctx.get_stream<gpu>()->Wait();
-      #else
-        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
-      #endif
-      }
-      on_complete();
-    }
-  };
-
-  return Engine::Get()->NewOperator(
-      exec_fun, use_vars, mutate_vars, FnProperty::kNormal);
-}
-
-inline void CreateEngineOpSeg(
-    const nnvm::IndexedGraph& idx,
-    const Context default_ctx,
-    const size_t start_nid,
-    const size_t end_nid,
-    const size_t bulk_size,
-    const std::unordered_set<uint32_t>& excludes,
-    const std::vector<std::shared_ptr<exec::OpExecutor> >& execs,
-    const std::vector<int> skip_plus_node,
-    std::vector<EngineOprSeg> *opr_segs) {
-  size_t seg_start = start_nid;
-  std::vector<std::shared_ptr<exec::OpExecutor> > seg_execs;
-  for (size_t nid = start_nid; nid < end_nid; ++nid) {
-    const auto& node = idx[nid];
-    if (node.source->is_variable()) continue;
-    if (skip_plus_node.size() && skip_plus_node[nid]) continue;
-    auto& exec = execs[nid];
-    bool is_async = exec->exec_type() != ExecType::kSync;
-    bool valid = exec->out_array.size() > 0;
-
-    // Stop at async nodes and invalid node (due to input/output is not allocated)
-    bool stop = is_async || !valid || seg_execs.size() >= bulk_size;
-    for (size_t i = 0; i < node.inputs.size() && !stop; ++i) {
-      if (excludes.count(idx.entry_id(node.inputs[i]))) stop = true;
-    }
-    auto num_outputs = node.source->num_outputs();
-    for (size_t i = 0; i < num_outputs && !stop; ++i) {
-      if (excludes.count(idx.entry_id(nid, i))) stop = true;
-    }
-
-    // Create opr segment for previous nodes.
-    if (stop && nid > seg_start) {
-      auto& seg = (*opr_segs)[seg_start];
-      if (seg_execs.size()) {
-        seg = EngineOprSeg{false, nid};
-        seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
       } else {
-        seg = EngineOprSeg{true, nid, nullptr};
+        *arrays[i] = arrays[mem_plan[i].sid]->AsArray(shapes[i], dtypes[i]);
+        if (mem_plan[i].inplace && array_reqs->at(i) == kWriteTo) {
+          array_reqs->at(i) = kWriteInplace;
+        }
       }
-      seg_start = nid;
-      seg_execs.clear();
-    }
-
-    seg_execs.push_back(exec);
-
-    auto& seg = (*opr_segs)[nid];
-    if (is_async) {
-      seg = EngineOprSeg{false, nid + 1};
-      seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
-      seg_execs.clear();
-      seg_start = nid + 1;
-    } else if (!valid) {
-      seg = EngineOprSeg{false, nid + 1, nullptr};
-      seg_execs.clear();
-      seg_start = nid + 1;
-    }
-  }
-  // The last segment
-  if (end_nid > seg_start) {
-    auto& seg = (*opr_segs)[seg_start];
-    if (seg_execs.size()) {
-      seg = EngineOprSeg{false, end_nid};
-      seg.opr.reset(CreateEngineOp(default_ctx, seg_execs));
     } else {
-      seg = EngineOprSeg{true, end_nid, nullptr};
+      *arrays[i] = NDArray(static_cast<NDArrayStorageType>(stypes[i]),
+                           shapes[i], default_ctx, true, dtypes[i]);
     }
   }
 }
 
-
-void RunGraph(const bool retain_graph,
-              const nnvm::IndexedGraph& idx,
-              const std::vector<NDArray*> arrays,
-              size_t node_start, size_t node_end,
-              std::vector<OpReqType>&& array_reqs,
-              std::vector<uint32_t>&& ref_count,
-              std::vector<OpStatePtr> *p_states,
-              const DispatchModeVector &dispatch_modes);
-
 }  // namespace imperative
 }  // namespace mxnet
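
As an aside on the PlanMemory rewrite above: entries that share a storage id are now all pointed at the first entry (the "loc") that uses that id, and that entry's buffer is grown to the byte size of the largest sharer. A minimal standalone sketch of just that sizing rule, with hypothetical names (the real code also special-cases negative storage ids and non-default storage types):

    #include <algorithm>
    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    struct Plan { size_t loc; size_t size; };

    std::vector<Plan> PlanSizes(const std::vector<int>& sid,
                                const std::vector<size_t>& bytes) {
      std::vector<Plan> plan(sid.size());
      std::unordered_map<int, size_t> sid_to_loc;
      for (size_t i = 0; i < sid.size(); ++i) {
        auto it = sid_to_loc.find(sid[i]);
        if (it == sid_to_loc.end()) {
          sid_to_loc[sid[i]] = i;       // first user becomes the location
          plan[i] = {i, bytes[i]};
        } else {
          plan[i] = {it->second, 0};    // later users alias that location
          plan[it->second].size =       // shared buffer fits the largest user
              std::max(plan[it->second].size, bytes[i]);
        }
      }
      return plan;
    }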
 
diff --git a/src/io/image_aug_default.cc b/src/io/image_aug_default.cc
index 22af7d92750..ce9c79c403a 100644
--- a/src/io/image_aug_default.cc
+++ b/src/io/image_aug_default.cc
@@ -23,6 +23,7 @@
  * \brief Default augmenter.
  */
 #include <mxnet/base.h>
+#include <dmlc/optional.h>
 #include <utility>
 #include <string>
 #include <algorithm>
@@ -46,10 +47,14 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentPara
   int resize;
   /*! \brief whether we do random cropping */
   bool rand_crop;
+  /*! \brief whether we do random resized cropping */
+  bool random_resized_crop;
   /*! \brief [-max_rotate_angle, max_rotate_angle] */
   int max_rotate_angle;
   /*! \brief max aspect ratio */
   float max_aspect_ratio;
+  /*! \brief min aspect ratio */
+  dmlc::optional<float> min_aspect_ratio;
   /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
   float max_shear_ratio;
   /*! \brief max crop size */
@@ -58,12 +63,24 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentPara
   int min_crop_size;
   /*! \brief max scale ratio */
   float max_random_scale;
-  /*! \brief min scale_ratio */
+  /*! \brief min scale ratio */
   float min_random_scale;
+  /*! \brief max area */
+  float max_random_area;
+  /*! \brief min area */
+  float min_random_area;
   /*! \brief min image size */
   float min_img_size;
   /*! \brief max image size */
   float max_img_size;
+  /*! \brief max random brightness */
+  float brightness;
+  /*! \brief max random contrast */
+  float contrast;
+  /*! \brief max random saturation */
+  float saturation;
+  /*! \brief pca noise level */
+  float pca_noise;
   /*! \brief max random in H channel */
   int random_h;
   /*! \brief max random in S channel */
@@ -80,6 +97,9 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentPara
   int pad;
   /*! \brief shape of the image data*/
   TShape data_shape;
+  /*! \brief random seed for augmentations */
+  dmlc::optional<int> seed_aug;
+
   // declare parameters
   DMLC_DECLARE_PARAMETER(DefaultImageAugmentParam) {
     DMLC_DECLARE_FIELD(resize).set_default(-1)
@@ -87,33 +107,65 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentPara
                   "before applying other augmentations.");
     DMLC_DECLARE_FIELD(rand_crop).set_default(false)
         .describe("If or not randomly crop the image");
+    DMLC_DECLARE_FIELD(random_resized_crop).set_default(false)
+        .describe("If or not perform random resized cropping "
+                  "on the image, as a standard preprocessing "
+                  "for resnet training on ImageNet data.");
     DMLC_DECLARE_FIELD(max_rotate_angle).set_default(0.0f)
         .describe("Rotate by a random degree in ``[-v, v]``");
     DMLC_DECLARE_FIELD(max_aspect_ratio).set_default(0.0f)
+        .describe("Change the aspect (namely width/height) to a random value. "
+                  "If min_aspect_ratio is None then the aspect ratio ins sampled from "
+                  "[1 - max_aspect_ratio, 1 + max_aspect_ratio], "
+                  "else it is in ``[min_aspect_ratio, max_aspect_ratio]``");
+    DMLC_DECLARE_FIELD(min_aspect_ratio).set_default(dmlc::optional<float>())
         .describe("Change the aspect (namely width/height) to a random value "
-                  "in ``[1 - max_aspect_ratio, 1 + max_aspect_ratio]``");
+                  "in ``[min_aspect_ratio, max_aspect_ratio]``");
     DMLC_DECLARE_FIELD(max_shear_ratio).set_default(0.0f)
         .describe("Apply a shear transformation (namely ``(x,y)->(x+my,y)``) "
                   "with ``m`` randomly chose from "
                   "``[-max_shear_ratio, max_shear_ratio]``");
     DMLC_DECLARE_FIELD(max_crop_size).set_default(-1)
         .describe("Crop both width and height into a random size in "
-                  "``[min_crop_size, max_crop_size]``");
+                  "``[min_crop_size, max_crop_size].``"
+                  "Ignored if ``random_resized_crop`` is True.");
     DMLC_DECLARE_FIELD(min_crop_size).set_default(-1)
         .describe("Crop both width and height into a random size in "
-                  "``[min_crop_size, max_crop_size]``");
+                  "``[min_crop_size, max_crop_size].``"
+                  "Ignored if ``random_resized_crop`` is True.");
     DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
         .describe("Resize into ``[width*s, height*s]`` with ``s`` randomly"
-                  " chosen from ``[min_random_scale, max_random_scale]``");
+                  " chosen from ``[min_random_scale, max_random_scale]``. "
+                  "Ignored if ``random_resized_crop`` is True.");
     DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
         .describe("Resize into ``[width*s, height*s]`` with ``s`` randomly"
-                  " chosen from ``[min_random_scale, max_random_scale]``");
+                  " chosen from ``[min_random_scale, max_random_scale]``"
+                  "Ignored if ``random_resized_crop`` is True.");
+    DMLC_DECLARE_FIELD(max_random_area).set_default(1.0f)
+        .describe("Change the area (namely width * height) to a random value "
+                  "in ``[min_random_area, max_random_area]``. "
+                  "Ignored if ``random_resized_crop`` is False.");
+    DMLC_DECLARE_FIELD(min_random_area).set_default(1.0f)
+        .describe("Change the area (namely width * height) to a random value "
+                  "in ``[min_random_area, max_random_area]``. "
+                  "Ignored if ``random_resized_crop`` is False.");
     DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
         .describe("Set the maximal width and height after all resize and"
                   " rotate argumentation  are applied");
     DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
         .describe("Set the minimal width and height after all resize and"
                   " rotate argumentation  are applied");
+    DMLC_DECLARE_FIELD(brightness).set_default(0.0f)
+        .describe("Add a random value in ``[-brightness, brightness]`` to "
+                  "the brightness of image.");
+    DMLC_DECLARE_FIELD(contrast).set_default(0.0f)
+        .describe("Add a random value in ``[-contrast, contrast]`` to "
+                  "the contrast of image.");
+    DMLC_DECLARE_FIELD(saturation).set_default(0.0f)
+        .describe("Add a random value in ``[-saturation, saturation]`` to "
+                  "the saturation of image.");
+    DMLC_DECLARE_FIELD(pca_noise).set_default(0.0f)
+        .describe("Add PCA-based noise to the image.");
     DMLC_DECLARE_FIELD(random_h).set_default(0)
         .describe("Add a random value in ``[-random_h, random_h]`` to "
                   "the H channel in HSL color space.");
@@ -136,6 +188,8 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentPara
     DMLC_DECLARE_FIELD(pad).set_default(0)
         .describe("Change size from ``[width, height]`` into "
                   "``[pad + width + pad, pad + height + pad]`` by padding pixes");
+    DMLC_DECLARE_FIELD(seed_aug).set_default(dmlc::optional<int>())
+        .describe("Random seed for augmentations.");
   }
 };
 
@@ -156,6 +210,7 @@ class DefaultImageAugmenter : public ImageAugmenter {
   // constructor
   DefaultImageAugmenter() {
     rotateM_ = cv::Mat(2, 3, CV_32F);
+    seed_init_state = false;
   }
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
@@ -196,7 +251,23 @@ class DefaultImageAugmenter : public ImageAugmenter {
   }
   cv::Mat Process(const cv::Mat &src, std::vector<float> *label,
                   common::RANDOM_ENGINE *prnd) override {
+    if (!seed_init_state && param_.seed_aug.has_value()) {
+      prnd->seed(param_.seed_aug.value());
+      seed_init_state = true;
+    }
     using mshadow::index_t;
+    bool is_cropped = false;
+
+    float max_aspect_ratio = 1.0f;
+    float min_aspect_ratio = 1.0f;
+    if (param_.min_aspect_ratio.has_value()) {
+      max_aspect_ratio = param_.max_aspect_ratio;
+      min_aspect_ratio = param_.min_aspect_ratio.value();
+    } else {
+      max_aspect_ratio = 1 + param_.max_aspect_ratio;
+      min_aspect_ratio = 1 - param_.max_aspect_ratio;
+    }
+
     cv::Mat res;
     if (param_.resize != -1) {
       int new_height, new_width;
@@ -220,8 +291,9 @@ class DefaultImageAugmenter : public ImageAugmenter {
 
     // normal augmentation by affine transformation.
     if (param_.max_rotate_angle > 0 || param_.max_shear_ratio > 0.0f
-        || param_.rotate > 0 || rotate_list_.size() > 0 || param_.max_random_scale != 1.0
-        || param_.min_random_scale != 1.0 || param_.max_aspect_ratio != 0.0f
+        || param_.rotate > 0 || rotate_list_.size() > 0
+        || param_.max_random_scale != 1.0f || param_.min_random_scale != 1.0f
+        || min_aspect_ratio != 1.0f || max_aspect_ratio != 1.0f
         || param_.max_img_size != 1e10f || param_.min_img_size != 0.0f) {
       std::uniform_real_distribution<float> rand_uniform(0, 1);
       // shear
@@ -236,11 +308,17 @@ class DefaultImageAugmenter : public ImageAugmenter {
       float a = cos(angle / 180.0 * M_PI);
       float b = sin(angle / 180.0 * M_PI);
       // scale
-      float scale = rand_uniform(*prnd) *
-          (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
+      float scale = 1.0f;
+      if (!param_.random_resized_crop) {
+        scale = rand_uniform(*prnd) *
+            (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
+      }
       // aspect ratio
-      float ratio = rand_uniform(*prnd) *
-          param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
+      float ratio = 1.0f;
+      if (!param_.random_resized_crop) {
+        ratio = rand_uniform(*prnd) *
+            (max_aspect_ratio - min_aspect_ratio) + min_aspect_ratio;
+      }
       float hs = 2 * scale / (1 + ratio);
       float ws = ratio * hs;
       // new width and height
@@ -276,8 +354,59 @@ class DefaultImageAugmenter : public ImageAugmenter {
                          cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
     }
 
-    // crop logic
-    if (param_.max_crop_size != -1 || param_.min_crop_size != -1) {
+    if (param_.random_resized_crop) {
+      // random resize crop
+      CHECK(param_.min_random_scale == 1.0f &&
+        param_.max_random_scale == 1.0f &&
+        param_.min_crop_size == -1 &&
+        param_.max_crop_size == -1 &&
+        !param_.rand_crop) <<
+        "\nSetting random_resized_crop to true conflicts with "
+        "min_random_scale, max_random_scale, "
+        "min_crop_size, max_crop_size, "
+        "and rand_crop.";
+
+      if (param_.max_random_area != 1.0f || param_.min_random_area != 1.0f
+          || max_aspect_ratio != 1.0f || min_aspect_ratio != 1.0f) {
+            CHECK(min_aspect_ratio > 0.0f);
+            CHECK(param_.min_random_area <= param_.max_random_area);
+            CHECK(min_aspect_ratio <= max_aspect_ratio);
+            std::uniform_real_distribution<float> rand_uniform_area(param_.min_random_area,
+                                                                    param_.max_random_area);
+            std::uniform_real_distribution<float> rand_uniform_ratio(min_aspect_ratio,
+                                                                     max_aspect_ratio);
+            std::uniform_real_distribution<float> rand_uniform(0, 1);
+            float area = res.rows * res.cols;
+            for (int i = 0; i < 10; ++i) {
+              float rand_area = rand_uniform_area(*prnd);
+              float ratio = rand_uniform_ratio(*prnd);
+              float target_area = area * rand_area;
+              int y_area = std::round(std::sqrt(target_area / ratio));
+              int x_area = std::round(std::sqrt(target_area * ratio));
+              if (rand_uniform(*prnd) > 0.5) {
+                int temp_y_area = y_area;
+                y_area = x_area;
+                x_area = temp_y_area;
+              }
+              if (y_area <= res.rows && x_area <= res.cols) {
+                index_t rand_y_area =
+                    std::uniform_int_distribution<index_t>(0, res.rows - y_area)(*prnd);
+                index_t rand_x_area =
+                    std::uniform_int_distribution<index_t>(0, res.cols - x_area)(*prnd);
+                cv::Rect roi(rand_x_area, rand_y_area, x_area, y_area);
+                int interpolation_method = GetInterMethod(param_.inter_method, x_area, y_area,
+                                                          param_.data_shape[2],
+                                                          param_.data_shape[1], prnd);
+                cv::resize(res(roi), res, cv::Size(param_.data_shape[2], param_.data_shape[1]),
+                           0, 0, interpolation_method);
+                is_cropped = true;
+                break;
+              }
+            }
+      }
+    } else if (!param_.random_resized_crop &&
+        (param_.max_crop_size != -1 || param_.min_crop_size != -1)) {
+      // random_crop
       CHECK(res.cols >= param_.max_crop_size && res.rows >= \
               param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
           << "input image size smaller than max_crop_size";
@@ -296,7 +425,28 @@ class DefaultImageAugmenter : public ImageAugmenter {
                                                 param_.data_shape[2], param_.data_shape[1], prnd);
       cv::resize(res(roi), res, cv::Size(param_.data_shape[2], param_.data_shape[1])
                 , 0, 0, interpolation_method);
-    } else {
+      is_cropped = true;
+    }
+
+    if (!is_cropped) {
+      // center crop
+      int interpolation_method = GetInterMethod(param_.inter_method, res.cols, res.rows,
+                                                param_.data_shape[2],
+                                                param_.data_shape[1], prnd);
+      if (res.rows < param_.data_shape[1]) {
+        index_t new_cols = static_cast<index_t>(static_cast<float>(param_.data_shape[1]) /
+                                                static_cast<float>(res.rows) *
+                                                static_cast<float>(res.cols));
+        cv::resize(res, res, cv::Size(new_cols, param_.data_shape[1]),
+                   0, 0, interpolation_method);
+      }
+      if (res.cols < param_.data_shape[2]) {
+        index_t new_rows = static_cast<index_t>(static_cast<float>(param_.data_shape[2]) /
+                                                static_cast<float>(res.cols) *
+                                                static_cast<float>(res.rows));
+        cv::resize(res, res, cv::Size(param_.data_shape[2], new_rows),
+                   0, 0, interpolation_method);
+      }
       CHECK(static_cast<index_t>(res.rows) >= param_.data_shape[1]
             && static_cast<index_t>(res.cols) >= param_.data_shape[2])
           << "input image size smaller than input shape";
@@ -312,13 +462,48 @@ class DefaultImageAugmenter : public ImageAugmenter {
       res = res(roi);
     }
 
+    // color jitter
+    if (param_.brightness > 0.0f || param_.contrast > 0.0f || param_.saturation > 0.0f) {
+      std::uniform_real_distribution<float> rand_uniform(0, 1);
+      float alpha_b = 1.0 + std::uniform_real_distribution<float>(-param_.brightness,
+                                                                  param_.brightness)(*prnd);
+      float alpha_c = 1.0 + std::uniform_real_distribution<float>(-param_.contrast,
+                                                                  param_.contrast)(*prnd);
+      float alpha_s = 1.0 + std::uniform_real_distribution<float>(-param_.saturation,
+                                                                  param_.saturation)(*prnd);
+      int rand_order[3] = {0, 1, 2};
+      std::shuffle(std::begin(rand_order), std::end(rand_order), *prnd);
+      for (int i = 0; i < 3; ++i) {
+        if (rand_order[i] == 0) {
+          // brightness
+          res.convertTo(res, -1, alpha_b, 0);
+        }
+        if (rand_order[i] == 1) {
+          // contrast
+          cvtColor(res, temp_, CV_RGB2GRAY);
+          float gray_mean = cv::mean(temp_)[0];
+          res.convertTo(res, -1, alpha_c, (1 - alpha_c) * gray_mean);
+        }
+        if (rand_order[i] == 2) {
+          // saturation
+          cvtColor(res, temp_, CV_RGB2GRAY);
+          cvtColor(temp_, temp_, CV_GRAY2BGR);
+          cv::addWeighted(res, alpha_s, temp_, 1 - alpha_s, 0.0, res);
+        }
+      }
+    }
+
     // color space augmentation
     if (param_.random_h != 0 || param_.random_s != 0 || param_.random_l != 0) {
       std::uniform_real_distribution<float> rand_uniform(0, 1);
       cvtColor(res, res, CV_BGR2HLS);
-      int h = rand_uniform(*prnd) * param_.random_h * 2 - param_.random_h;
-      int s = rand_uniform(*prnd) * param_.random_s * 2 - param_.random_s;
-      int l = rand_uniform(*prnd) * param_.random_l * 2 - param_.random_l;
+      // use an approximation of a Gaussian distribution to reduce extreme values
+      float rh = rand_uniform(*prnd); rh += 4 * rand_uniform(*prnd); rh = rh / 5;
+      float rs = rand_uniform(*prnd); rs += 4 * rand_uniform(*prnd); rs = rs / 5;
+      float rl = rand_uniform(*prnd); rl += 4 * rand_uniform(*prnd); rl = rl / 5;
+      int h = rh * param_.random_h * 2 - param_.random_h;
+      int s = rs * param_.random_s * 2 - param_.random_s;
+      int l = rl * param_.random_l * 2 - param_.random_l;
       int temp[3] = {h, l, s};
       int limit[3] = {180, 255, 255};
       for (int i = 0; i < res.rows; ++i) {
@@ -333,18 +518,50 @@ class DefaultImageAugmenter : public ImageAugmenter {
       }
       cvtColor(res, res, CV_HLS2BGR);
     }
+
+    // pca noise
+    if (param_.pca_noise > 0.0f) {
+      std::normal_distribution<float> rand_normal(0, param_.pca_noise);
+      float pca_alpha_r = rand_normal(*prnd);
+      float pca_alpha_g = rand_normal(*prnd);
+      float pca_alpha_b = rand_normal(*prnd);
+      float pca_r = eigvec[0][0] * pca_alpha_r + eigvec[0][1] * pca_alpha_g +
+           eigvec[0][2] * pca_alpha_b;
+      float pca_g = eigvec[1][0] * pca_alpha_r + eigvec[1][1] * pca_alpha_g +
+           eigvec[1][2] * pca_alpha_b;
+      float pca_b = eigvec[2][0] * pca_alpha_r + eigvec[2][1] * pca_alpha_g +
+           eigvec[2][2] * pca_alpha_b;
+      float pca[3] = { pca_b, pca_g, pca_r };
+      for (int i = 0; i < res.rows; ++i) {
+        for (int j = 0; j < res.cols; ++j) {
+          for (int k = 0; k < 3; ++k) {
+            int vp = res.at<cv::Vec3b>(i, j)[k];
+            vp += pca[k];
+            vp = std::max(0, std::min(255, vp));
+            res.at<cv::Vec3b>(i, j)[k] = vp;
+          }
+        }
+      }
+    }
     return res;
   }
 
+
  private:
   // temporal space
   cv::Mat temp_;
   // rotation param
   cv::Mat rotateM_;
+  // eigval and eigvec for adding pca noise
+  // store eigval * eigvec as eigvec
+  float eigvec[3][3] = { { 55.46f * -0.5675f, 4.794f * 0.7192f,  1.148f * 0.4009f },
+                         { 55.46f * -0.5808f, 4.794f * -0.0045f, 1.148f * -0.8140f },
+                         { 55.46f * -0.5836f, 4.794f * -0.6948f, 1.148f * 0.4203f } };
   // parameters
   DefaultImageAugmentParam param_;
   /*! \brief list of possible rotate angle */
   std::vector<int> rotate_list_;
+  bool seed_init_state;
 };
 
 ImageAugmenter* ImageAugmenter::Create(const std::string& name) {
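
The seed_aug parameter added above works by seeding the iterator's random engine once, on the first Process() call, so the whole augmentation stream becomes reproducible across runs (and replays identically across epochs of the same run). A minimal sketch of the pattern, with hypothetical names rather than the actual augmenter class:

    #include <random>

    class SeededAugmenter {
     public:
      explicit SeededAugmenter(int seed) : seed_(seed) {}
      void Process(std::mt19937* prnd) {
        if (!seeded_) {        // mirrors seed_init_state in the diff above
          prnd->seed(seed_);
          seeded_ = true;
        }
        // ... draw augmentation parameters from *prnd ...
      }
     private:
      int seed_;
      bool seeded_ = false;
    };
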
diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index c99280ac7ea..8b93e8358d5 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -39,6 +39,7 @@
 #include "./mshadow_op.h"
 #include "./random/sampler.h"
 #include "./random/sample_op.h"
+#include "./tensor/elemwise_binary_broadcast_op.h"
 
 namespace mxnet {
 namespace op {
@@ -72,12 +73,6 @@ struct LeakyReLUParam : public dmlc::Parameter<LeakyReLUParam> {
   }
 };
 
-struct prelu_grad {
-  MSHADOW_XINLINE static real_t Map(real_t a) {
-    return a > 0.0f ? 0.0f : a;
-  }
-};
-
 template<typename xpu, typename DType>
 class LeakyReLUOp : public Operator {
  public:
@@ -98,28 +93,51 @@ class LeakyReLUOp : public Operator {
     Tensor<xpu, 3, DType> data;
     Tensor<xpu, 3, DType> out;
     Tensor<xpu, 3, DType> mask;
-    Tensor<xpu, 1, DType> weight;
     int n = in_data[leakyrelu::kData].shape_[0];
     int k = in_data[leakyrelu::kData].shape_[1];
     Shape<3> dshape = Shape3(n, k, in_data[leakyrelu::kData].Size()/n/k);
     data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
     out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
+    if (req[leakyrelu::kOut] == kNullOp) {
+      return;
+    }
     switch (param_.act_type) {
       case leakyrelu::kLeakyReLU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
             s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(param_.slope));
         });
         break;
       }
       case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        if (weight.shape_.Size() == 1) {
-          Assign(out, req[leakyrelu::kOut],
-                 F<mshadow_op::xelu>(data, mshadow::expr::broadcast_scalar(weight, out.shape_)));
+        TShape gshape = expand_shape(in_data[leakyrelu::kGamma].shape_,
+                                     in_data[leakyrelu::kData].shape_);
+        TShape new_lshape, new_rshape, new_oshape;
+        const int ndim = op::BinaryBroadcastShapeCompact(in_data[leakyrelu::kData].shape_,
+                                                         gshape,
+                                                         out_data[leakyrelu::kOut].shape_,
+                                                         &new_lshape, &new_rshape, &new_oshape);
+        if (!ndim) {
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+            const size_t size = (minthree(out_data[leakyrelu::kOut].Size(),
+                                          in_data[leakyrelu::kData].Size(),
+                                          in_data[leakyrelu::kGamma].Size())
+            + DataType<DType>::kLanes - 1) / DataType<DType>::kLanes;
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
+                s, size, out_data[leakyrelu::kOut].dptr<DType>(),
+            in_data[leakyrelu::kData].dptr<DType>(), in_data[leakyrelu::kGamma].dptr<DType>());
+          });
         } else {
-          Assign(out, req[leakyrelu::kOut],
-                 F<mshadow_op::xelu>(data, mshadow::expr::broadcast<1>(weight, out.shape_)));
+          BROADCAST_NDIM_SWITCH(ndim, NDim, {
+            mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
+            mshadow::Shape<NDim> lstride = mxnet_op::calc_stride(new_lshape.get<NDim>());
+            mshadow::Shape<NDim> rstride = mxnet_op::calc_stride(new_rshape.get<NDim>());
+            mxnet_op::Kernel<mxnet_op::binary_broadcast_kernel<NDim, DType,
+                                                               mshadow_op::xelu>, xpu>::
+            template LaunchEx(s, new_oshape.Size(), req[leakyrelu::kOut], lstride, rstride, oshape,
+            in_data[leakyrelu::kData].dptr<DType>(), in_data[leakyrelu::kGamma].dptr<DType>(),
+            out_data[leakyrelu::kOut].dptr<DType>());
+          });
         }
         break;
       }
@@ -134,23 +152,23 @@ class LeakyReLUOp : public Operator {
           Tensor<xpu, 1, DType> out = mask.FlatTo1D();
           sampler.Sample(low, high, out, pgen, s);
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::mul, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
               DType(param_.upper_bound - param_.lower_bound));
           });
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
               DType(param_.lower_bound));
           });
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
               s, mask.size(0) * mask.size(1) * mask.size(2), out.dptr_, data.dptr_, mask.dptr_);
           });
         } else {
           const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f;
           MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
               s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(slope));
           });
         }
@@ -158,7 +176,7 @@ class LeakyReLUOp : public Operator {
       }
       case leakyrelu::kELU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::elu, Req>, xpu>::Launch(
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::elu, Req>, xpu>::Launch(
             s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_,
             DType(param_.slope));
         });
@@ -188,8 +206,6 @@ class LeakyReLUOp : public Operator {
     Tensor<xpu, 3, DType> gdata;
     Tensor<xpu, 3, DType> grad;
     Tensor<xpu, 3, DType> mask;
-    Tensor<xpu, 1, DType> weight;
-    Tensor<xpu, 1, DType> grad_weight;
     int n = out_grad[leakyrelu::kOut].shape_[0];
     int k = out_grad[leakyrelu::kOut].shape_[1];
     Shape<3> dshape = Shape3(n, k, out_grad[leakyrelu::kOut].Size()/n/k);
@@ -206,29 +222,38 @@ class LeakyReLUOp : public Operator {
       case leakyrelu::kLeakyReLU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
           mxnet_op::Kernel<mxnet_op::op_with_req<
-            mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::xelu_grad>, Req>, xpu>::Launch(
+            mxnet_op::backward_grad_tuned<mshadow_op::xelu_grad>, Req>, xpu>::Launch(
               s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
               output.dptr_, DType(param_.slope));
         });
         break;
       }
       case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, DType>(s);
-        if (weight.shape_.Size() == 1) {
-          Shape<4> gshape = Shape4(1, grad.shape_[0], grad.shape_[1], grad.shape_[2]);
-          Assign(grad_weight, req[leakyrelu::kGamma],
-                 sumall_except_dim<0>(reshape(F<prelu_grad>(data) * grad, gshape)));
-          Assign(gdata, req[leakyrelu::kData],
-                 F<mshadow_op::xelu_grad>(data,
-                                          mshadow::expr::broadcast_scalar(weight, data.shape_))
-                 * grad);
+        TShape gshape = expand_shape(in_grad[leakyrelu::kGamma].shape_,
+                                     in_grad[leakyrelu::kData].shape_);
+        TShape new_lshape, new_rshape, new_oshape;
+        const bool need_bc = BinaryBroadcastShapeCompact(in_grad[leakyrelu::kData].shape_,
+                                                         gshape,
+                                                         out_grad[leakyrelu::kOut].shape_,
+                                                         &new_lshape,
+                                                         &new_rshape,
+                                                         &new_oshape) != 0;
+        if (!need_bc) {
+          ElemwiseBinaryOp::BackwardUseIn<xpu,
+                                          mshadow_op::xelu_grad,
+                                          mshadow_op::prelu_grad>(
+            nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut],
+                                     in_data[leakyrelu::kData],
+                                     in_data[leakyrelu::kGamma]}, req, in_grad);
         } else {
-          Assign(grad_weight, req[leakyrelu::kGamma],
-                 sumall_except_dim<1>(F<prelu_grad>(data) * grad));
-          Assign(gdata, req[leakyrelu::kData],
-                 F<mshadow_op::xelu_grad>(data, mshadow::expr::broadcast<1>(weight, data.shape_))
-                 * grad);
+          BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, {
+            BinaryBroadcastBackwardUseInImpl<xpu, NDim, DType,
+              mshadow_op::xelu_grad, mshadow_op::prelu_grad>(
+                ctx, {out_grad[leakyrelu::kOut],
+                      in_data[leakyrelu::kData],
+                      in_data[leakyrelu::kGamma]}, req, in_grad,
+                new_lshape, new_rshape, new_oshape);
+          });
         }
         break;
       }
@@ -239,7 +264,7 @@ class LeakyReLUOp : public Operator {
       case leakyrelu::kELU: {
         MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
           mxnet_op::Kernel<mxnet_op::op_with_req<
-            mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::elu_grad>, Req>, xpu>::Launch(
+            mxnet_op::backward_grad_tuned<mshadow_op::elu_grad>, Req>, xpu>::Launch(
               s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
               output.dptr_, DType(param_.slope));
         });
@@ -251,6 +276,24 @@ class LeakyReLUOp : public Operator {
   }
 
  private:
+  /*! \brief Minimum of three */
+  static MSHADOW_XINLINE size_t minthree(const size_t a, const size_t b, const size_t c) {
+    return a < b ? (a < c ? a : c) : (b < c ? b : c);
+  }
+  static inline TShape expand_shape(const TShape& src, const TShape& dst) {
+    TShape result(dst.ndim());
+    int s = src.ndim() - 1;
+    for (int i = dst.ndim() - 1; i >= 0; i--) {
+      if (s >= 0 && (dst[i] == src[s] || src[s] == 1)) {
+        result[i] = src[s];
+        s--;
+      } else {
+        result[i] = 1;
+      }
+    }
+    CHECK(s == -1) << "Cannot broadcast gamma to data. gamma: " << src << ", data: " << dst;
+    return result;
+  }
   LeakyReLUParam param_;
 };  // class LeakyReLUOp
 
@@ -281,10 +324,12 @@ class LeakyReLUProp : public OperatorProperty {
     if (dshape.ndim() == 0) return false;
     if (param_.act_type == leakyrelu::kPReLU) {
       const TShape &gshape = in_shape->at(leakyrelu::kGamma);
-      if (gshape.ndim() == 1 && gshape.Size() == 1)
-        in_shape->at(leakyrelu::kGamma) = TShape(Shape1(1));
-      else
+      if (gshape.ndim() == 0) {
         in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1]));
+      }
+      if (dshape == gshape) {
+        SHAPE_ASSIGN_CHECK(*out_shape, 0, dshape);
+      }
     }
     out_shape->clear();
     out_shape->push_back(dshape);
@@ -396,6 +441,11 @@ class LeakyReLUProp : public OperatorProperty {
     }
   }
 
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
   Operator* CreateOperator(Context ctx) const override {
     LOG(FATAL) << "Not Implemented.";
     return NULL;
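
The broadcast-aware PReLU path above hinges on expand_shape, which right-aligns gamma's shape against the data shape, NumPy-style, so a per-channel gamma broadcasts over the remaining axes. A simplified re-derivation with std::vector in place of TShape:

    #include <cassert>
    #include <vector>

    std::vector<int> ExpandShape(const std::vector<int>& src,
                                 const std::vector<int>& dst) {
      std::vector<int> result(dst.size(), 1);
      int s = static_cast<int>(src.size()) - 1;
      for (int i = static_cast<int>(dst.size()) - 1; i >= 0; --i) {
        if (s >= 0 && (dst[i] == src[s] || src[s] == 1)) {
          result[i] = src[s--];  // dimension matches or is broadcastable
        }                        // otherwise keep 1 and try src[s] further left
      }
      assert(s == -1 && "gamma cannot be broadcast to data");
      return result;
    }

    // Example: gamma of shape (3,) against NCHW data (2, 3, 4, 5)
    // expands to (1, 3, 1, 1), i.e. one slope per channel.
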
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 19fa4f8ead8..5953568c7fa 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -126,6 +126,8 @@ MXNET_UNARY_MATH_OP_NC(relu, a > DType(0) ? a : DType(0));
 
 MXNET_UNARY_MATH_OP_NC(relu_grad, a > DType(0) ? DType(1) : DType(0));
 
+MXNET_BINARY_MATH_OP_NC(prelu_grad, a > DType(0) ? DType(0) : a);
+
 MXNET_BINARY_MATH_OP_NC(xelu, a > DType(0) ? a :
                         DType(static_cast<float>(a) * static_cast<float>(b)));
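
The new prelu_grad pairs with xelu in the PReLU backward path: xelu(a, g) is a for a > 0 and a * g otherwise, so its partial derivative with respect to g is 0 for a > 0 and a otherwise. A small numeric check of that identity, using plain-float stand-ins for the mshadow ops:

    #include <cassert>
    #include <cmath>

    float xelu(float a, float g)      { return a > 0 ? a : a * g; }
    float prelu_grad(float a, float)  { return a > 0 ? 0.0f : a; }  // d xelu / d g

    int main() {
      const float a = -2.0f, g = 0.25f, eps = 1e-3f;
      const float numeric = (xelu(a, g + eps) - xelu(a, g - eps)) / (2 * eps);
      assert(std::fabs(numeric - prelu_grad(a, g)) < 1e-2f);
      return 0;
    }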
 
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 5632d73c261..d40abaf1fd6 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -125,6 +125,8 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
   }
 };
 
+void ConvolutionParamParser(nnvm::NodeAttrs* attrs);
+
 typedef ParamOpSign<ConvolutionParam> ConvSignature;
 
 }  // namespace op
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 0e8a929e1ba..ef70ccd6ec1 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -331,7 +331,7 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
                              dispatch_mode, wanted_mode);
 }
 
-static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
+void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
   using namespace mshadow;
   ConvolutionParam param_;
   try {
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 84cf6403043..bc3ee366007 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -51,7 +51,11 @@ class CuDNNPoolingOp {
         mode_ = CUDNN_POOLING_MAX;
         break;
       case pool_enum::kAvgPooling:
-        mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        if (param_.count_include_pad.has_value() && !param_.count_include_pad.value()) {
+          mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+        } else {
+          mode_ = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        }
         break;
       default:
         LOG(FATAL) << "Not implmented";
@@ -263,7 +267,7 @@ class CuDNNPoolingOp {
                                              &(pad_vec[0]),
                                              &(stride_vec[0])));
       #else
-      LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
+      LOG(FATAL) << "3D pooling only support CUDNN v5 and above";
       #endif
     }
   }
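
For context on count_include_pad above: the two cuDNN modes differ only in the divisor used for windows that overlap the padding. A worked example, assuming a 3x3 average-pool window hanging over an image corner with one ring of zero padding, so it covers 4 real pixels (say each of value 8) and 5 pads:

    #include <cstdio>

    int main() {
      const float sum = 4 * 8.0f;                      // four real pixels of value 8
      std::printf("include pad: %.2f\n", sum / 9.0f);  // divide by window area (3x3)
      std::printf("exclude pad: %.2f\n", sum / 4.0f);  // divide by valid pixels only
      return 0;                                        // prints 3.56 and 8.00
    }
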
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index a278456ea26..fae72bd9221 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -161,15 +161,15 @@ void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 
   NDArray in_buffer = in_data;
+  MKLDNNStream *stream = MKLDNNStream::Get();
+
   if (in_data.IsView() && in_data.IsMKLDNNData())
     in_buffer = in_data.Reorder2Default();
 
   auto input_mem = in_buffer.GetMKLDNNData();
   MKLDNNActForward &fwd = GetActForward(param, ctx, in_buffer, *input_mem);
-  auto out_mem = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_primitive_desc(),
-                                 req);
+  auto out_mem = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_primitive_desc(), req, &in_buffer);
   fwd.SetNewMem(*input_mem, *out_mem.second);
-  MKLDNNStream *stream = MKLDNNStream::Get();
   stream->RegisterPrim(fwd.GetFwd());
   CommitOutput(out_data, out_mem);
   stream->Submit();
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index bd2faf5775a..6a7c58f2991 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -324,13 +324,16 @@ typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
  * The difference is that the first function can create MKLDNN memory with
  * special layouts in an NDArray, while the second one can only create MKLDNN
  * memory with default layouts.
+ * Also, an optional in_arr parameter can be passed to the first function
+ * with the kWriteInplace req to check whether MKLDNN supports writing in
+ * place; otherwise new memory is written to and then copied back onto out_arr.
  * If these two functions are used, we have to call CommitOutput to write
  * the output back to the output NDArray.
  */
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
                                 const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                OpReqType req, const NDArray* in_arr = nullptr);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
                                        const mkldnn::memory::primitive_desc &desc,
                                        OpReqType req);
 /* This function has to be used with one of the functions above. */
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 1bd1581dbc2..b182aa0b68d 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -77,29 +77,42 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
   }
 }
 
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+bool CanWriteTo(const NDArray &out_arr,
+                const NDArray &in_arr,
+                const mkldnn::memory::primitive_desc &desc) {
+  auto in_mem = in_arr.GetMKLDNNData();
+  bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
+  bool pdesc_same = out_arr.GetMKLDNNData()->get_primitive_desc() == desc &&
+      in_mem->get_primitive_desc() == desc;
+  return add_same && pdesc_same;
+}
+
+mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
                                 const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req) {
+                                OpReqType req,
+                                const NDArray* in_arr) {
   if (kAddTo == req) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::AddBack, tmp);
-  } else if (kWriteInplace == req) {
-    // MKLDNN ops may not support the case that the input and the output uses
-    // the same memory. Let's use an extra copy to make sure it always works.
+  } else if (req == kWriteInplace && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
+    mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+    // mem is nullptr if out_arr is a view and desc is an MKLDNN format;
+    // Reorder2Default must be called before CreateMKLDNNMem in that case.
+    CHECK(mem != nullptr);
+    return mkldnn_output_t(OutDataOp::Noop, mem);
+  } else if (req == kWriteInplace) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  }
+  mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+  if (nullptr == mem) {
     auto tmp = TmpMemMgr::Get()->Alloc(desc);
     return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-  } else {
-    mkldnn::memory *mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
-    if (mem == nullptr) {
-      auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-    } else {
-      return mkldnn_output_t(OutDataOp::Noop, mem);
-    }
   }
+  return mkldnn_output_t(OutDataOp::Noop, mem);
 }
 
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
                                        const mkldnn::memory::primitive_desc &desc,
                                        OpReqType req) {
   if (kAddTo == req) {
@@ -113,7 +126,7 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
     auto def_format = GetDefaultFormat(_desc.desc());
     mkldnn::memory *mem = nullptr;
     if (def_format == _desc.desc().data.format) {
-      mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+      mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
     }
     if (mem == nullptr) {
       auto tmp = TmpMemMgr::Get()->Alloc(desc);
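
The reworked CreateMKLDNNMem above picks one of three commit strategies per request type. A standalone restatement of that dispatch with hypothetical enums (the real function returns MKLDNN memory handles and also probes CreateMKLDNNData for a usable layout):

    enum class Req { kAddTo, kWriteInplace, kWriteTo };
    enum class Op  { AddBack, CopyBack, Noop };

    Op PickOutputOp(Req req, bool can_write_in_place, bool out_arr_usable) {
      if (req == Req::kAddTo) return Op::AddBack;        // accumulate via temp buffer
      if (req == Req::kWriteInplace)
        return can_write_in_place ? Op::Noop : Op::CopyBack;
      return out_arr_usable ? Op::Noop : Op::CopyBack;   // plain write path
    }
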
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
new file mode 100644
index 00000000000..23f2fe69463
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_convolution-inl.h
+ * \brief MKLDNN convolution forward declarations
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <utility>
+#include "../convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, const bool is_train, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output);
+
+class MKLDNNConvForward {
+ public:
+  mkldnn::convolution_forward::primitive_desc fwd_pd;
+
+  MKLDNNConvForward(const ConvolutionParam& param, const bool is_train,
+                    const NDArray &data, const NDArray &weights,
+                    const NDArray *bias, const NDArray &output): fwd_pd(
+                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
+                 const mkldnn::memory *bias, const mkldnn::memory &output);
+
+  const mkldnn::convolution_forward &GetFwd() const {
+    return *fwd_;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::convolution_forward> fwd_;
+  std::shared_ptr<mkldnn::memory> data_;
+  std::shared_ptr<mkldnn::memory> weight_;
+  std::shared_ptr<mkldnn::memory> bias_;
+  std::shared_ptr<mkldnn::memory> out_;
+};
+
+typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
+
+MKLDNNConvForward &GetConvFwd(const nnvm::NodeAttrs& attrs,
+    const bool is_train, const NDArray &data, const NDArray &weights,
+    const NDArray *bias, const NDArray &output);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index f851a6d2535..cf04ea8da3d 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -23,11 +23,14 @@
  * \author Da Zheng
 */
 
+
+#if MXNET_USE_MKLDNN == 1
+
 #include "../convolution-inl.h"
 #include "./mkldnn_ops-inl.h"
 #include "./mkldnn_base-inl.h"
+#include "./mkldnn_convolution-inl.h"
 
-#if MXNET_USE_MKLDNN == 1
 namespace mxnet {
 namespace op {
 
@@ -37,8 +40,8 @@ bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
   return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
 }
 
-static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
-    const ConvolutionParam& param, bool is_train, const NDArray &data,
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, const bool is_train, const NDArray &data,
     const NDArray &weights, const NDArray *bias, const NDArray &output) {
   auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
   auto data_md = GetMemDesc(data);
@@ -162,73 +165,51 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   }
 }
 
-class MKLDNNConvForward {
-  std::shared_ptr<mkldnn::convolution_forward> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> weight;
-  std::shared_ptr<mkldnn::memory> bias;
-  std::shared_ptr<mkldnn::memory> out;
+void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data,
+                                  const mkldnn::memory &weight,
+                                  const mkldnn::memory *bias,
+                                  const mkldnn::memory &output) {
+  if (this->data_ == nullptr)
+    this->data_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.src_primitive_desc(), data.get_data_handle()));
+  else
+    this->data_->set_data_handle(data.get_data_handle());
 
- public:
-  mkldnn::convolution_forward::primitive_desc fwd_pd;
+  if (this->weight_ == nullptr)
+    this->weight_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+  else
+    this->weight_->set_data_handle(weight.get_data_handle());
 
-  MKLDNNConvForward(const ConvolutionParam& param, bool is_train,
-                    const NDArray &data, const NDArray &weights,
-                    const NDArray *bias, const NDArray &output): fwd_pd(
-                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
-  }
-
-  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
-                 const mkldnn::memory *bias, const mkldnn::memory &output) {
-    if (this->data == nullptr)
-      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.src_primitive_desc(), data.get_data_handle()));
-    else
-      this->data->set_data_handle(data.get_data_handle());
+  if (this->out_ == nullptr)
+    this->out_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+            fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+  else
+    this->out_->set_data_handle(output.get_data_handle());
 
-    if (this->weight == nullptr)
-      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+  if (bias != nullptr) {
+    if (this->bias_ == nullptr)
+      this->bias_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
     else
-      this->weight->set_data_handle(weight.get_data_handle());
-
-    if (this->out == nullptr)
-      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-    else
-      this->out->set_data_handle(output.get_data_handle());
-
-    if (bias != nullptr) {
-      if (this->bias == nullptr)
-        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-                fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
-      else
-        this->bias->set_data_handle(bias->get_data_handle());
-      if (this->fwd == nullptr)
-        this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-            new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                            mkldnn::primitive::at(*this->weight),
-                                            mkldnn::primitive::at(*this->bias),
-                                            *this->out));
-    } else if (this->fwd == nullptr) {
-      this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                          mkldnn::primitive::at(*this->weight),
-                                          *this->out));
-    }
+      this->bias_->set_data_handle(bias->get_data_handle());
+    if (this->fwd_ == nullptr)
+      this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
+          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_),
+                                          mkldnn::primitive::at(*this->weight_),
+                                          mkldnn::primitive::at(*this->bias_),
+                                          *this->out_));
+  } else if (this->fwd_ == nullptr) {
+    this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
+        new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_),
+                                        mkldnn::primitive::at(*this->weight_),
+                                        *this->out_));
   }
+}
 
-  const mkldnn::convolution_forward &GetFwd() const {
-    return *fwd;
-  }
-};
-
-typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-
-static inline MKLDNNConvForward &GetConvFwd(
-    const nnvm::NodeAttrs& attrs, bool is_train,
-    const NDArray &data, const NDArray &weights,
-    const NDArray *bias, const NDArray &output) {
+MKLDNNConvForward &GetConvFwd(const nnvm::NodeAttrs& attrs, const bool is_train,
+                              const NDArray &data, const NDArray &weights,
+                              const NDArray *bias, const NDArray &output) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash> fwds;
 #else
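
[Editor's note] In the refactor above, SetNewMem allocates each mkldnn::memory wrapper once and afterwards only repoints it at the caller's buffer, so the cached convolution primitive (fwd_) built from those wrappers never has to be recreated. A minimal sketch of that idiom, assuming the mkldnn 0.x API used throughout this patch (UpdateMem is a hypothetical helper, not part of the change):

    #include <memory>
    #include <mkldnn.hpp>

    // First call allocates the wrapper; later calls only swap the data pointer,
    // so primitives holding a reference to *cached stay valid.
    static void UpdateMem(std::shared_ptr<mkldnn::memory> *cached,
                          const mkldnn::memory::primitive_desc &pd,
                          void *new_handle) {
      if (*cached == nullptr)
        cached->reset(new mkldnn::memory(pd, new_handle));
      else
        (*cached)->set_data_handle(new_handle);
    }
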
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
index 4b6235ec446..691e1d371b5 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
@@ -119,6 +119,10 @@ void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
                               const NDArray &out_grad, const NDArray &in_data,
                               const NDArray *workspace, const OpReqType req,
                               const NDArray &in_grad);
+MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
+                                const bool is_train,
+                                const NDArray &data,
+                                const NDArray &output);
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc
index 259af2b9402..9fd88a13c46 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling.cc
+++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc
@@ -121,7 +121,11 @@ mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
       return mkldnn::algorithm::pooling_max;
       break;
     case pool_enum::kAvgPooling:
-      return mkldnn::algorithm::pooling_avg_include_padding;
+      if (param.count_include_pad.has_value() && !param.count_include_pad.value()) {
+        return mkldnn::algorithm::pooling_avg_exclude_padding;
+      } else {
+        return mkldnn::algorithm::pooling_avg_include_padding;
+      }
       break;
     default:
       LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
index aa59f13d06d..acfa358a796 100644
--- a/src/operator/nn/mkldnn/mkldnn_softmax.cc
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -26,6 +26,7 @@
 #include "../softmax-inl.h"
 #include "./mkldnn_ops-inl.h"
 #include "./mkldnn_base-inl.h"
+#include "../../tensor/broadcast_reduce_op.h"
 
 #if MXNET_USE_MKLDNN == 1
 namespace mxnet {
@@ -38,11 +39,13 @@ void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   auto input_mem = in_data.GetMKLDNNData();
   mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
   mkldnn::memory::desc data_md = data_mpd.desc();
+  int axis = CheckAxis(param.axis, in_data.shape().ndim());
+
   auto cpu_engine = data_mpd.get_engine();
   auto prop = ctx.is_train
     ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
   mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop,
-      data_md, param.axis);
+      data_md, axis);
   mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine);
 
   auto output_memory = out_data.GetMKLDNNData();
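
[Editor's note] The CheckAxis call added above matters because MKLDNN's softmax descriptor only accepts a non-negative axis, while the operator also allows Python-style negative axes; previously such inputs fell back to the non-MKLDNN path (see the softmax.cc hunk further below, which drops the param.axis >= 0 guard). A simplified sketch of the normalization, assuming CheckAxis behaves as at its other call sites:

    // Maps a possibly negative axis into [0, ndim); the real helper also
    // CHECK-fails on out-of-range values.
    inline int NormalizeAxis(int axis, int ndim) {
      return axis < 0 ? axis + ndim : axis;   // e.g. axis = -1, ndim = 4 -> 3
    }
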
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
index fdbfb1558f6..c51e1081d69 100644
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -58,7 +58,6 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
   std::vector<float> scales(inputs.size(), 1);
   in_prims.reserve(inputs.size());
-  bool pd_same = true;
   std::vector<NDArray> in_bufs(inputs.size());
   for (size_t i = 0; i < inputs.size(); i++) {
     const mkldnn::memory *in_mem;
@@ -73,31 +72,11 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   }
 
   mkldnn::sum::primitive_desc pdesc(scales, in_pds);
-  pd_same = pd_same && (pdesc.dst_primitive_desc() == in_pds[0]);
-  auto out_mem = const_cast<NDArray&>(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc());
-  bool addr_same = false;
-  const void *first_data_handle;
-  if (in_bufs[0].is_none())
-    first_data_handle = inputs[0].GetMKLDNNData()->get_data_handle();
-  else
-    first_data_handle = in_bufs[0].GetMKLDNNData()->get_data_handle();
-  if (out_mem)
-    addr_same = out_mem->get_data_handle() == first_data_handle;
-  if (((req == kWriteTo) || (req == kWriteInplace && pd_same && addr_same))
-      && out_mem) {
-    // do sum computation directly on output NDArray
-    MKLDNNStream *stream = MKLDNNStream::Get();
-    stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem));
-    stream->Submit();
-  } else {
-    // req == kWriteInplace but cannot be handled by mkldnn and
-    // req == kAddTo will run into this branch
-    auto mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
-    MKLDNNStream *stream = MKLDNNStream::Get();
-    stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second));
-    CommitOutput(out_data, mem);
-    stream->Submit();
-  }
+  auto mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req, &inputs[0]);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second));
+  CommitOutput(out_data, mem);
+  stream->Submit();
 }
 
 }  // namespace op
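
[Editor's note] The simplification above works because CreateMKLDNNMem, given the first input as a hint, now decides internally whether the primitive may write straight into the output (covering the deleted pd_same/addr_same checks), and CommitOutput finishes the cases that needed a temporary. A conceptual sketch of the contract, as inferred from the calls in this diff:

    // mkldnn_output_t pairs a finalize-action tag with the destination memory.
    mkldnn_output_t out = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(),
                                          req, &inputs[0]);
    // ... register primitives that write into *out.second ...
    CommitOutput(out_data, out);  // no-op when out.second is the output itself;
                                  // copy-back or accumulate (kAddTo) otherwise
    MKLDNNStream::Get()->Submit();
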
diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh
index 9d004d295be..976aacf63a5 100644
--- a/src/operator/nn/pool.cuh
+++ b/src/operator/nn/pool.cuh
@@ -214,16 +214,19 @@ template <typename DType, int p = 1>
 __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
                                        const int width, const int pooled_width, const int kernel_w,
                                        const int stride_w, const int pad_w, DType* out_data,
-                                       const bool getAvg = false) {
+                                       const bool get_avg = false, const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int c = (index / pooled_width) % channels;
     const int n = index / pooled_width / channels;
     int wstart = pw * stride_w - pad_w;
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (wend - wstart) : 1);
+    int pool_size = (get_avg? (wend - wstart) : 1);
     wstart = max(wstart, 0);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * width;
     for (int w = wstart; w < wend; ++w) {
@@ -244,7 +247,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_h, const int stride_w,
                                        const int pad_h, const int pad_w, DType* out_data,
-                                       const bool getAvg = false) {
+                                       const bool get_avg = false,
+                                       const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int ph = (index / pooled_width) % pooled_height;
@@ -254,11 +258,14 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
     int wstart = pw * stride_w - pad_w;
     int hend = min(hstart + kernel_h, height + pad_h);
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
+    int pool_size = (get_avg? (hend - hstart) * (wend - wstart) : 1);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     hend = min(hend, height);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (hend - hstart) * (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
@@ -282,7 +289,8 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_d, const int stride_h, const int stride_w,
                                        const int pad_d, const int pad_h, const int pad_w,
-                                       DType* out_data, const bool getAvg = false) {
+                                       DType* out_data, const bool get_avg = false,
+                                       const bool count_include_pad = true) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int pw = index % pooled_width;
     const int ph = (index / pooled_width) % pooled_height;
@@ -295,13 +303,16 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
     int dend = min(dstart + kernel_d, depth + pad_d);
     int hend = min(hstart + kernel_h, height + pad_h);
     int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+    int pool_size = (get_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
     dstart = max(dstart, 0);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     dend = min(dend, depth);
     hend = min(hend, height);
     wend = min(wend, width);
+    if (get_avg && !count_include_pad) {
+      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    }
     DType sum = 0;
     const DType* out_slice = in_data + (n * channels + c) * depth * height * width;
     for (int d = dstart; d < dend; ++d) {
@@ -311,7 +322,9 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
         }
       }
     }
-    out_data[index] = a_root_p<DType, p>::Map(sum);
+    out_data[index] = (pool_size == 0) ?
+                      DType(nanf("")) :
+                      a_root_p<DType, p>::Map(sum);
   }
 }
 
@@ -487,7 +500,8 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int channels, const int width,
                                          const int pooled_width, const int kernel_w,
                                          const int stride_w, const int pad_w, DType* in_grad,
-                                         const bool isAvg = false) {
+                                         const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -506,7 +520,12 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
       // figure out the pooling size
       int wstart = pw * stride_w - pad_w;
       int wend = min(wstart + kernel_w, width + pad_w);
-      int pool_size = (isAvg? (wend - wstart) : 1);
+      int pool_size = (is_avg? (wend - wstart) : 1);
+      if (is_avg && !count_include_pad) {
+        wstart = max(wstart, 0);
+        wend = min(wend, width);
+        pool_size = (wend - wstart);
+      }
       gradient +=
         lp_grad<DType, p>::Map(out_grad_slice[pw], in_data[index], out_data_slice[pw]) / pool_size;
     }
@@ -528,7 +547,8 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int kernel_h, const int kernel_w,
                                          const int stride_h, const int stride_w,
                                          const int pad_h, const int pad_w, DType* in_grad,
-                                         const bool isAvg = false) {
+                                         const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCHW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -553,8 +573,15 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
         int wstart = pw * stride_w - pad_w;
         int hend = min(hstart + kernel_h, height + pad_h);
         int wend = min(wstart + kernel_w, width + pad_w);
-        int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
+        int pool_size = (is_avg? (hend - hstart) * (wend - wstart) : 1);
         int out_index = ph * pooled_width + pw;
+        if (is_avg && !count_include_pad) {
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, height);
+          wend = min(wend, width);
+          pool_size = (hend - hstart) * (wend - wstart);
+        }
         gradient +=
           lp_grad<DType, p>::Map(out_grad_slice[out_index],
                                  in_data[index],
@@ -580,7 +607,8 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
                                          const int kernel_d, const int kernel_h,
                                          const int kernel_w, const int stride_d, const int stride_h,
                                          const int stride_w, const int pad_d, const int pad_h,
-                                         const int pad_w, DType* in_grad, const bool isAvg = false) {
+                                         const int pad_w, DType* in_grad, const bool is_avg = false,
+                                         const bool count_include_pad = true) {
   // index is the input image index in NCDHW
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
@@ -611,8 +639,17 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
           int dend = min(dstart + kernel_d, depth + pad_d);
           int hend = min(hstart + kernel_h, height + pad_h);
           int wend = min(wstart + kernel_w, width + pad_w);
-          int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (is_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
           int out_index = (pd * pooled_height + ph) * pooled_width + pw;
+          if (is_avg && !count_include_pad) {
+            dstart = max(dstart, 0);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            dend = min(dend, depth);
+            hend = min(hend, height);
+            wend = min(wend, width);
+            pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          }
           gradient += lp_grad<DType, p>::Map(out_grad_slice[out_index],
                                              in_data[index],
                                              out_data_slice[out_index]) / pool_size;
@@ -643,7 +680,7 @@ template<typename DType, int p>
 inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
-                 DType* out_data) {
+                 DType* out_data, const bool count_include_pad) {
   CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
   using namespace mxnet_op;
   if (kernel.ndim() == 1) {
@@ -659,7 +696,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
       pool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
                                0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                    oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
-                                   kernel[0], stride[0], pad[0], out_data, true);
+                                   kernel[0], stride[0], pad[0], out_data,
+                                   true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -693,7 +731,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                    oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
                                    oshape[2], oshape[3], kernel[0], kernel[1],
-                                   stride[0], stride[1], pad[0], pad[1], out_data, true);
+                                   stride[0], stride[1], pad[0], pad[1], out_data,
+                                   true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -731,7 +770,7 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                    oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
                                    ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
                                    kernel[1], kernel[2], stride[0], stride[1], stride[2],
-                                   pad[0], pad[1], pad[2], out_data, true);
+                                   pad[0], pad[1], pad[2], out_data, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -777,7 +816,8 @@ template<typename DType, int p>
 inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
-                   const int pool_type, OpReqType req_type, DType* in_grad) {
+                   const int pool_type, OpReqType req_type, DType* in_grad,
+                   const bool count_include_pad) {
   if (mxnet::kNullOp == req_type) return;
   if (mxnet::kAddTo != req_type) {
     mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, ishape.Size(), in_grad);
@@ -798,7 +838,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
                                      ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], oshape[2], kernel[0],
-                                     stride[0], pad[0], in_grad, true);
+                                     stride[0], pad[0], in_grad, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -836,7 +876,8 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                      ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3],
                                      oshape[2], oshape[3], kernel[0], kernel[1],
-                                     stride[0], stride[1], pad[0], pad[1], in_grad, true);
+                                     stride[0], stride[1], pad[0], pad[1], in_grad,
+                                     true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
@@ -878,7 +919,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
                                      ishape[1], ishape[2], ishape[3], ishape[4],
                                      oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
                                      kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
-                                     pad[2], in_grad, true);
+                                     pad[2], in_grad, true, count_include_pad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
     } else if (pool_enum::kSumPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
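
[Editor's note] A numeric walk-through of the pool_size arithmetic threaded through the kernels above (standalone, with invented sizes). Note also the new guard: with count_include_pad = false, a window lying entirely in padding has pool_size == 0, and the kernel now emits NaN instead of dividing by zero.

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int width = 3, kernel_w = 5, pad_w = 2, stride_w = 1;
      int wstart = 0 * stride_w - pad_w;                      // -2
      int wend = std::min(wstart + kernel_w, width + pad_w);  //  3
      int include_pad = wend - wstart;                        //  5 cells, pad counted
      wstart = std::max(wstart, 0);
      wend = std::min(wend, width);
      int exclude_pad = wend - wstart;                        //  3 valid cells
      std::printf("divisor: %d (include) vs %d (exclude)\n",
                  include_pad, exclude_pad);
      return 0;
    }
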
diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h
index 9fe43b2bd46..8f7a5edc832 100644
--- a/src/operator/nn/pool.h
+++ b/src/operator/nn/pool.h
@@ -216,7 +216,8 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -229,9 +230,12 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = (getAvg ? (wend - wstart) : 1);
+        int pool_size = (get_avg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
+        if (get_avg && !count_include_pad) {
+          pool_size = (wend - wstart);
+        }
         DType sum = 0;
         for (int w = wstart; w < wend; ++w) {
           sum += a_pow_p<DType, p>::Map(in_data[w]) / pool_size;
@@ -251,7 +255,8 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -267,11 +272,14 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = (getAvg ? (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (get_avg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
           wend = std::min(wend, width);
+          if (get_avg && !count_include_pad) {
+            pool_size = (hend - hstart) * (wend - wstart);
+          }
           DType sum = 0;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -294,7 +302,8 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
 template<typename DType, int p = 1>
 inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, const bool getAvg = false) {
+                            DType* out_data,
+                            const bool get_avg = false, const bool count_include_pad = true) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -313,13 +322,16 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = (getAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+            int pool_size = (get_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
             dend = std::min(dend, depth);
             hend = std::min(hend, height);
             wend = std::min(wend, width);
+            if (get_avg && !count_include_pad) {
+              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            }
             DType sum = 0;
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
@@ -328,7 +340,9 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
                 }
               }
             }
-            out_data[(pd*pooled_height+ph)*pooled_width+pw] = a_root_p<DType, p>::Map(sum);
+            out_data[(pd*pooled_height+ph)*pooled_width+pw] = (pool_size == 0) ?
+                                                              DType(nanf("")) :
+                                                              a_root_p<DType, p>::Map(sum);
           }
         }
       }
@@ -509,8 +523,8 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data,
 template<typename DType, int p = 1>
 inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -523,9 +537,12 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = (isAvg ? (wend - wstart) : 1);
+        int pool_size = (is_avg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
+        if (is_avg && !count_include_pad) {
+          pool_size = (wend - wstart);
+        }
         for (int w = wstart; w < wend; ++w) {
           in_grad[w] += lp_grad<DType, p>::Map(out_grad[pw], in_data[w], out_data[pw]) / pool_size;
         }
@@ -545,8 +562,8 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const
 template<typename DType, int p = 1>
 inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -562,11 +579,14 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = (isAvg ? (hend - hstart) * (wend - wstart) : 1);
+          int pool_size = (is_avg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
           wend = std::min(wend, width);
+          if (is_avg && !count_include_pad) {
+            pool_size = (hend - hstart) * (wend - wstart);
+          }
           const int pool_index = ph * pooled_width + pw;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -593,8 +613,8 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const
 template<typename DType, int p = 1>
 inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
                               const TShape& ishape, const TShape& oshape, const TShape& kernel,
-                              const TShape& pad, const TShape& stride,
-                              DType* in_grad, const bool isAvg = false) {
+                              const TShape& pad, const TShape& stride, DType* in_grad,
+                              const bool is_avg = false, const bool count_include_pad = true) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -613,13 +633,16 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = (isAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+            int pool_size = (is_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
             dend = std::min(dend, depth);
             hend = std::min(hend, height);
             wend = std::min(wend, width);
+            if (is_avg && !count_include_pad) {
+              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            }
             const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
@@ -660,13 +683,14 @@ template<typename DType, int p>
 inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
-                 DType* out_data) {
+                 DType* out_data, const bool count_include_pad) {
   CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
   if (kernel.ndim() == 1) {
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -678,7 +702,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -690,7 +715,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
     if (pool_enum::kMaxPooling == pool_type) {
       pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data,
+                      true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -723,7 +749,8 @@ template<typename DType, int p>
 inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
-                   const int pool_type, OpReqType req_type, DType* in_grad, const int p_value = 2) {
+                   const int pool_type, OpReqType req_type, DType* in_grad,
+                   const bool count_include_pad) {
   if (mxnet::kNullOp == req_type) return;
   if (mxnet::kAddTo != req_type) {
     mxnet_op::Kernel<mxnet_op::set_zero, cpu>::Launch(s, ishape.Size(), in_grad);
@@ -733,7 +760,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -747,7 +774,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
@@ -761,7 +788,7 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
       unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
       unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
-                        true);
+                        true, count_include_pad);
     } else if (pool_enum::kSumPooling == pool_type) {
       unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kLpPooling == pool_type) {
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index a4770b49e85..ad74a8feae3 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -41,6 +41,8 @@
 namespace mxnet {
 namespace op {
 
+void PoolingParamParser(nnvm::NodeAttrs *attrs);
+
 struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   TShape kernel;
   TShape stride;
@@ -50,6 +52,7 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   bool global_pool;
   bool cudnn_off;
   dmlc::optional<int> p_value;
+  dmlc::optional<bool> count_include_pad;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     DMLC_DECLARE_FIELD(kernel).set_default(TShape())  // add default value here
     .enforce_nonzero()
@@ -81,7 +84,13 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding.");
 
     DMLC_DECLARE_FIELD(p_value).set_default(dmlc::optional<int>())
-    .describe("Value of p for Lp pooling, can be 1 or 2, required for Lp Pooling");
+    .describe("Value of p for Lp pooling, can be 1 or 2, required for Lp Pooling.");
+
+    DMLC_DECLARE_FIELD(count_include_pad).set_default(dmlc::optional<bool>())
+    .describe("Only used for AvgPool, specify whether to count padding elements for average"
+              "calculation. For example, with a 5*5 kernel on a 3*3 corner of a image,"
+              "the sum of the 9 valid elements will be divided by 25 if this is set to true,"
+              "or it will be divided by 9 if this is set to false. Defaults to true.");
   }
 
   bool operator==(const PoolingParam& other) const {
@@ -92,7 +101,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
            this->pooling_convention == other.pooling_convention &&
            this->global_pool        == other.global_pool &&
            this->cudnn_off          == other.cudnn_off &&
-           this->p_value            == other.p_value;
+           this->p_value            == other.p_value &&
+           this->count_include_pad  == other.count_include_pad;
   }
 };
 
@@ -112,6 +122,7 @@ struct hash<mxnet::op::PoolingParam> {
     ret = dmlc::HashCombine(ret, val.global_pool);
     ret = dmlc::HashCombine(ret, val.cudnn_off);
     ret = dmlc::HashCombine(ret, val.p_value);
+    ret = dmlc::HashCombine(ret, val.count_include_pad);
     return ret;
   }
 };
@@ -153,27 +164,29 @@ class PoolingOp {
     }
     const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
                         param_.p_value.value() : 1;
+    const bool count_include_pad = (param_.count_include_pad.has_value()) ?
+                                   param_.count_include_pad.value() : true;
     switch (p_value) {
       case 1:
         pool<DType, 1>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       case 2:
         pool<DType, 2>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       case 3:
         pool<DType, 3>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
           kernel,
           padding,
           stride,
-          param_.pool_type, req, out_data.dptr<DType>());
+          param_.pool_type, req, out_data.dptr<DType>(), count_include_pad);
         break;
       default:
         LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
@@ -201,6 +214,8 @@ class PoolingOp {
 
     const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
                         param_.p_value.value() : 1;
+    const bool count_include_pad = (param_.count_include_pad.has_value()) ?
+                                   param_.count_include_pad.value() : true;
     switch (p_value) {
       case 1:
         unpool<DType, 1>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -208,7 +223,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       case 2:
         unpool<DType, 2>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -216,7 +231,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       case 3:
         unpool<DType, 3>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
@@ -224,7 +239,7 @@ class PoolingOp {
            kernel,
            padding,
            stride,
-           param_.pool_type, req, in_grad.dptr<DType>());
+           param_.pool_type, req, in_grad.dptr<DType>(), count_include_pad);
         break;
       default:
         LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 3ff94da3c2d..3200a515d6f 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -35,7 +35,7 @@
 namespace mxnet {
 namespace op {
 
-static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
+void PoolingParamParser(nnvm::NodeAttrs *attrs) {
   using namespace mshadow;
   PoolingParam param;
   param.Init(attrs->dict);
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index f8cc6fee9a2..e9b104f1286 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -38,10 +38,8 @@ static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<NDArray>& inputs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
-  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   // It seems MKLDNN softmax doesn't support training.
-  // and it only supports non-negative axis.
-  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) {
+  if (SupportMKLDNN(inputs[0]) && !ctx.is_train) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]);
     auto fn = SoftmaxCompute<cpu, mxnet_op::softmax_fwd>;
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index de3c7422c5f..0953cbaf519 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -322,6 +322,7 @@ IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::xelu_grad); // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::prelu_grad); // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::elu_grad); // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum);  // NOLINT()
diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc
index 92b808dd460..bbd79417676 100644
--- a/src/operator/quantization/dequantize.cc
+++ b/src/operator/quantization/dequantize.cc
@@ -23,11 +23,31 @@
  * \brief
  */
 #include "./dequantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_dequantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(DequantizeParam);
 
+bool DequantizeStorageType(const nnvm::NodeAttrs& attrs,
+                           const int dev_mask,
+                           DispatchMode* dispatch_mode,
+                           std::vector<int> *in_attrs,
+                           std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_dequantize)
 .describe(R"code(Dequantize the input tensor into a float tensor.
 min_range and max_range are scalar floats that specify the range for
@@ -50,6 +70,10 @@ by keep zero centered for the quantized value:
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", DequantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", DequantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", DequantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNDequantizeCompute)
+#endif
 .set_attr<FCompute>("FCompute<cpu>", DequantizeCompute<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`")
 .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
new file mode 100644
index 00000000000..89c3c199488
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_dequantize-inl.h
+ * \author Wenting Jiang, Xinyu Chen
+ * \brief
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename SrcType, typename DstType>
+static void MKLDNNDequantizeComputeKer(const std::vector<NDArray> &inputs,
+                                       const std::vector<NDArray> &outputs,
+                                       const std::vector<OpReqType> &req) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  float real_range = 0.0;
+  float quantized_range = 0.0;
+  if (inputs[0].dtype() == mshadow::kUint8) {
+    quantized_range = MaxAbs(MaxValue<SrcType>(), MinValue<SrcType>());
+    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
+  } else if (inputs[0].dtype() == mshadow::kInt8) {
+    quantized_range = MinAbs(MaxValue<SrcType>(), MinValue<SrcType>());
+    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
+  } else {
+    LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type";
+  }
+  float scale = real_range / quantized_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  size_t i_ndim = in_buffer.shape().ndim();
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+  for (size_t i = 0; i < i_ndim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNDequantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                    const std::vector<NDArray> &inputs,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &outputs) {
+  if (inputs[0].dtype() == mshadow::kUint8) {
+    MKLDNNDequantizeComputeKer<uint8_t, float>(inputs, outputs, req);
+  } else if (inputs[0].dtype() == mshadow::kInt8) {
+    MKLDNNDequantizeComputeKer<int8_t, float>(inputs, outputs, req);
+  } else {
+    LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as input type";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
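
[Editor's note] For orientation, MKLDNNDequantizeComputeKer reduces dequantization to a single reorder with an output scale. An illustrative numeric example (int8 case, values invented):

    float real_range      = 0.5f;    // MaxAbs(*min_range, *max_range)
    float quantized_range = 127.0f;  // MinAbs(MaxValue<int8_t>(), MinValue<int8_t>())
    float scale = real_range / quantized_range;   // ~0.0039370
    // the reorder then maps each int8 value q to q * scale,
    // e.g. q = 127 -> ~0.5f, q = -127 -> ~-0.5f
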
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
new file mode 100644
index 00000000000..f7709319d6a
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantize-inl.h
+ * \brief
+ * \author Wenting Jiang, Xinyu Chen
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../quantize-inl.h"
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename SrcType, typename DstType>
+static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
+                                     const std::vector<NDArray>& outputs,
+                                     const QuantizeParam& param,
+                                     const std::vector<OpReqType> &req) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  float real_range = 0.0;
+  float quantized_range = 0.0;
+  if (param.out_type == mshadow::kUint8) {
+    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    quantized_range = MaxAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = *inputs[1].data().dptr<float>();
+    *outputs[2].data().dptr<float>() = *inputs[2].data().dptr<float>();
+  } else if (param.out_type == mshadow::kInt8) {
+    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = -real_range;
+    *outputs[2].data().dptr<float>() = real_range;
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
+  }
+  float scale = quantized_range / real_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  size_t i_ndim = in_buffer.shape().ndim();
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+  for (size_t i = 0; i < i_ndim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNQuantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                  const std::vector<NDArray> &inputs,
+                                  const std::vector<OpReqType> &req,
+                                  const std::vector<NDArray> &outputs) {
+  const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+  if (param.out_type == mshadow::kUint8) {
+    MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
+  } else if (param.out_type == mshadow::kInt8) {
+    MKLDNNQuantizeComputeKer<float, int8_t>(inputs, outputs, param, req);
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
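
[Editor's note] Quantize is the inverse of the dequantize example above: the reorder's output scale is quantized_range / real_range, and for int8 the recorded min/max are forced symmetric (see outputs[1]/outputs[2] in the kernel). Continuing the same invented numbers:

    float real_range      = 0.5f;
    float quantized_range = 127.0f;                // int8
    float scale = quantized_range / real_range;    // 254
    // the reorder maps x to round(x * scale), e.g. x = 0.5f -> 127,
    // while outputs[1]/outputs[2] are set to -0.5f / +0.5f
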
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
new file mode 100644
index 00000000000..fa6a32a4739
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantized_conv.cc
+ * \brief
+ * \author Wenting Jiang, Xinyu Chen
+*/
+
+#if MXNET_USE_MKLDNN == 1
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+#include "../../nn/mkldnn/mkldnn_convolution-inl.h"
+#include "../../nn/convolution-inl.h"
+#include "../quantization_utils.h"
+#include "../../tensor/matrix_op-inl.h"
+#include "../../elemwise_op_common.h"
+namespace mxnet {
+namespace op {
+
+static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs,
+                                       const OpContext &ctx,
+                                       const std::vector<NDArray> &in_data,
+                                       const std::vector<OpReqType> &req,
+                                       const std::vector<NDArray> &out_data) {
+  CHECK_EQ(in_data[0].dtype(), mshadow::kUint8)
+    << "mkldnn_quantized_conv op only supports uint8 as input type";
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  NDArray weight = in_data[conv::kWeight];
+  MKLDNNConvForward &fwd = GetConvFwd(attrs, ctx.is_train,
+      in_data[conv::kData], weight,
+      param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+
+  auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  // For inference, we want to reorder the weight array so we don't need to
+  // reorder data every time.
+  if (weight.IsDefaultData()) {
+    weight_mem = GetWeights(weight, fwd.fwd_pd.weights_primitive_desc(), param.num_group);
+    // We also need to modify the layout on the original weight array. The
+    // data conversion happens after the weight array is used.
+    weight.MKLDNNDataReorderAsync(fwd.fwd_pd.weights_primitive_desc());
+  } else {
+    weight_mem = weight.GetMKLDNNData();
+    CHECK(weight_mem->get_primitive_desc() == fwd.fwd_pd.weights_primitive_desc());
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
+                                 req[conv::kOut]);
+  const mkldnn::memory *bias_mem = nullptr;
+  if (!param.no_bias)
+    bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc());
+  fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+
+  CommitOutput(out_data[conv::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  const size_t num_inputs = param.no_bias ? 2 : 3;
+  mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+           out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+           in_data[num_inputs].data().dptr<float>(),
+           in_data[num_inputs+1].data().dptr<float>(),
+           in_data[num_inputs+2].data().dptr<float>(),
+           in_data[num_inputs+3].data().dptr<float>());
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_conv)
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizedConvForward);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
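
The kernel launched at the end propagates the real-valued range of the int32 accumulator from the data and weight ranges. A hedged re-derivation of that arithmetic (the names below are illustrative; the MXNet helper is QuantizationRangeForMultiplicationStruct and may differ in detail):

#include <cstdint>
#include <cstdio>
#include <limits>

// Real value spanned by one quantized level of a tensor with the given ranges.
float StepOf(float min_f, float max_f, float q_min, float q_max) {
  return (max_f - min_f) / (q_max - q_min);
}

// One step of the uint8 data times one step of the int8 weight is one step
// of the int32 accumulator; scaling by the int32 extremes bounds the output.
void RangeForProduct(float min_d, float max_d, float min_w, float max_w,
                     float* min_out, float* max_out) {
  float step_out = StepOf(min_d, max_d, 0.0f, 255.0f) *
                   StepOf(min_w, max_w, -128.0f, 127.0f);
  *max_out = step_out * std::numeric_limits<int32_t>::max();
  *min_out = step_out * std::numeric_limits<int32_t>::min();
}

int main() {
  float mn, mx;
  RangeForProduct(0.0f, 1.0f, -0.5f, 0.5f, &mn, &mx);
  std::printf("[%g, %g]\n", mn, mx);
}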
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc
new file mode 100644
index 00000000000..83177ad9b34
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_pooling.cc
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantized_pooling.cc
+ * \brief
+ * \author Tao Lv, Xinyu Chen
+*/
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "../../nn/mkldnn/mkldnn_pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static void MKLDNNQuantizedPoolingForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                          const std::vector<NDArray> &in_data,
+                                          const std::vector<OpReqType> &req,
+                                          const std::vector<NDArray> &out_data) {
+  CHECK(in_data[0].dtype() == mshadow::kUint8
+    || in_data[0].dtype() == mshadow::kInt8)
+    << "mkldnn_quantized_pooling op only supports uint8 and int8 as input type";
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  auto fwd = GetPoolingFwd(param, ctx.is_train, in_data[0], out_data[0]);
+  fwd.SetDataHandle(in_data[0], out_data[0]);
+  fwd.Execute();
+  out_data[1].data().dptr<float>()[0] = in_data[1].data().dptr<float>()[0];
+  out_data[2].data().dptr<float>()[0] = in_data[2].data().dptr<float>()[0];
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_pooling)
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizedPoolingForward);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
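
Since pooling only selects (max) or averages existing values, the real-valued range of the output cannot exceed that of the input, which is why the forward function above copies the min/max scalars from in_data[1] and in_data[2] straight into out_data[1] and out_data[2] instead of recomputing them.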
diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
new file mode 100644
index 00000000000..409c53dd3b9
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*! \file mkldnn_requantize-inl.h
+ * \brief
+ * \author Jin Huang, Xinyu Chen
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../requantize-inl.h"
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
+                                       const OpContext& ctx,
+                                       const std::vector<NDArray>& inputs,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<NDArray>& outputs,
+                                       const float real_range) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  typedef int32_t SrcDType;
+  typedef int8_t  DstDType;
+  // check shapes
+  size_t i_dim = inputs[0].shape().ndim();
+  size_t o_dim = outputs[0].shape().ndim();
+  CHECK_EQ(i_dim, o_dim);
+  float first_quantized_range = MinAbs(MinValue<SrcDType>(),
+                                       MaxValue<SrcDType>());
+  float first_real_range = MaxAbs(*inputs[1].data().dptr<float>(),
+                                  *inputs[2].data().dptr<float>());
+  float first_scale = first_real_range / first_quantized_range;
+  float second_real_range = real_range;
+  float second_quantized_range = MinAbs(MaxValue<DstDType>(),
+                                        MinValue<DstDType>());
+  float second_scale = second_quantized_range / second_real_range;
+  float scale = first_scale * second_scale;
+  *outputs[1].data().dptr<float>() = -second_real_range;
+  *outputs[2].data().dptr<float>() = second_real_range;
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
+    in_buffer = inputs[0].Reorder2Default();
+
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_dim);
+  for (size_t i = 0; i < i_dim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  auto o_desc = mkldnn::memory::desc(i_dims,
+                                    (mkldnn::memory::data_type)data_type_enum<DstDType>::type,
+                                    i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  typedef int32_t SrcDType;
+  typedef int8_t  DstDType;
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  const RequantizeParam& param = nnvm::get<RequantizeParam>(attrs.parsed);
+  float real_range;
+  // Model is calibrated
+  if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+    real_range =
+          MaxAbs(param.min_calib_range.value(), param.max_calib_range.value());
+    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+  // Model is not calibrated
+  } else {
+    TShape src_shape, dst_shape;
+    const size_t actual_float_size = sizeof(float);
+    const size_t actual_quantized_size = sizeof(SrcDType);
+    const size_t temp_reduce_size = ConfigReduce<cpu, SrcDType>(s,
+                         inputs[0].shape(), TShape({1}), &src_shape, &dst_shape);
+    Tensor<cpu, 1, char> temp_space =
+      ctx.requested[0].get_space_typed<cpu, 1, char>(
+      Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s);
+    Tensor<cpu, 1, float> actual_min_float(
+                 reinterpret_cast<float*>(temp_space.dptr_), Shape1(1), s);
+    Tensor<cpu, 1, float> actual_max_float(
+                 reinterpret_cast<float*>(temp_space.dptr_) + 1, Shape1(1), s);
+    const int dev_id = ctx.run_ctx.ctx.dev_id;
+    TBlob actual_min_quantized(reinterpret_cast<SrcDType*>(
+                       temp_space.dptr_ + 8), Shape1(1), cpu::kDevMask, dev_id);
+    TBlob actual_max_quantized(reinterpret_cast<SrcDType*>(
+                   temp_space.dptr_ + 8) + 1, Shape1(1), cpu::kDevMask, dev_id);
+    Tensor<cpu, 1, char> workspace(
+            temp_space.dptr_+2*actual_float_size+2*actual_quantized_size,
+            Shape1(temp_reduce_size), s);
+    broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
+        s, actual_min_quantized.reshape(dst_shape), kWriteTo,
+        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
+    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
+        actual_min_float.dptr_, actual_min_quantized.dptr<SrcDType>(),
+        inputs[1].Reorder2Default().data().dptr<float>(),
+        inputs[2].Reorder2Default().data().dptr<float>());
+    broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
+        s, actual_max_quantized.reshape(dst_shape), kWriteTo,
+        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
+    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
+        actual_max_float.dptr_, actual_max_quantized.dptr<SrcDType>(),
+        inputs[1].Reorder2Default().data().dptr<float>(),
+        inputs[2].Reorder2Default().data().dptr<float>());
+
+    real_range = MaxAbs(*actual_min_float.dptr_, *actual_max_float.dptr_);
+    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_REQUANTIZE_INL_H_
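
A worked example of the two-stage scale computed in MKLDNNRequantizeForwardKer, using assumed ranges (the numeric values are illustrative only):

#include <cstdio>

int main() {
  // Stage 1: recover floats from the int32 accumulator.
  float first_real_range  = 400.0f;  // MaxAbs(min, max) of the int32 data
  float first_scale       = first_real_range / 2147483647.0f;
  // Stage 2: map the calibrated (or measured) output range onto int8.
  float second_real_range = 3.0f;
  float second_scale      = 127.0f / second_real_range;
  // MKL-DNN applies the product in a single reorder primitive.
  std::printf("combined scale = %g\n", first_scale * second_scale);
}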
diff --git a/src/operator/quantization/quantize.cc b/src/operator/quantization/quantize.cc
index 32eb952fa5d..25fb19dddd1 100644
--- a/src/operator/quantization/quantize.cc
+++ b/src/operator/quantization/quantize.cc
@@ -23,11 +23,31 @@
  * \brief
  */
 #include "./quantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_quantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(QuantizeParam);
 
+bool QuantizeStorageType(const nnvm::NodeAttrs& attrs,
+                         const int dev_mask,
+                         DispatchMode* dispatch_mode,
+                         std::vector<int> *in_attrs,
+                         std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantize)
 .describe(R"code(Quantize a input tensor from float to `out_type`,
 with user-specified `min_range` and `max_range`.
@@ -61,6 +81,10 @@ where
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizeCompute)
+#endif
 .set_attr<FCompute>("FCompute<cpu>", QuantizeCompute<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`")
 .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 5ec745ccdf3..5376a0ee9f1 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -99,6 +99,7 @@ Graph QuantizeGraph(Graph &&src) {
   static auto& need_requantize_map = Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
   auto offline_params = src.GetAttr<std::unordered_set<std::string>>("offline_params");
   auto excluded_nodes = src.GetAttr<std::unordered_set<NodePtr>>("excluded_nodes");
+  auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
 
   // mirror_map stores the mapping from the currently visited graph to the newly created quantized
   // graph. Key is the currently visited graph's node pointer, and value is a copied node of the key
@@ -129,7 +130,7 @@ Graph QuantizeGraph(Graph &&src) {
              mirror_node->op()->name != "_contrib_quantize")) {
           NodePtr quantize_node = InsertNode("_contrib_quantize",
             e.node->attrs.name + "_quantize", new_node, mirror_entry);
-          quantize_node->attrs.dict["out_type"] = "int8";
+          quantize_node->attrs.dict["out_type"] = quantized_dtype;
           quantize_node->op()->attr_parser(&(quantize_node->attrs));
 
           NodePtr min_node = InsertNode("min",
@@ -159,7 +160,11 @@ Graph QuantizeGraph(Graph &&src) {
         uint32_t min_index = 1;
         uint32_t max_index = 2;
         if (quantized_op_map.count(e.node->op())) {
-          size_t  num_outputs = e.node->num_outputs();
+          // Compute the number of data outputs (excluding min/max) so the
+          // min/max entry indices on the mirror node can be derived. This
+          // assumes the mirror node appends exactly one min and one max
+          // output, which is currently true.
+          size_t num_outputs = mirror_node->num_outputs() - 2;
           min_index = num_outputs + 2 * e.index;
           max_index = num_outputs + 2 * e.index + 1;
         } else {
@@ -198,12 +203,15 @@ Graph QuantizeGraph(Graph &&src) {
         NodePtr mirror_node = mirror_map.at(e.node.get());
         NodeEntry mirror_entry = NodeEntry{
           mirror_node, e.index, e.version};
-        size_t num_outputs = e.node->num_outputs();
-        uint32_t min_index = num_outputs + 2 * e.index;
-        uint32_t max_index = num_outputs + 2 * e.index + 1;
-
         // if input node is quantized operator, add dequantize node
         if (NeedQuantize(e.node, excluded_nodes)) {
+          // Compute the number of data outputs (excluding min/max) so the
+          // min/max entry indices on the mirror node can be derived. This
+          // assumes the mirror node appends exactly one min and one max
+          // output, which is currently true.
+          size_t num_outputs = mirror_node->num_outputs() - 2;
+          uint32_t min_index = num_outputs + 2 * e.index;
+          uint32_t max_index = num_outputs + 2 * e.index + 1;
           NodePtr dequantize_node = CreateNode("_contrib_dequantize",
             e.node->attrs.name + "_dequantize");
           dequantize_node->inputs.emplace_back(mirror_entry);
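
As a concrete check of the new index arithmetic: for a quantized operator with one data output, mirror_node->num_outputs() is 3 (data plus the appended min and max), so num_outputs becomes 1 and, with e.index == 0, the min and max are read from mirror entries 1 and 2. Deriving the count from the mirror node rather than from e.node keeps the indices correct when the quantized operator's output count differs from that of the original FP32 node.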
diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc
index d7dc9fe4dbd..ed62228b924 100644
--- a/src/operator/quantization/quantized_conv.cc
+++ b/src/operator/quantization/quantized_conv.cc
@@ -24,6 +24,9 @@
  * \author Ziheng Jiang, Jun Wu
 */
 #include "../nn/convolution-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -86,12 +89,13 @@ bool QuantizedConvType(const nnvm::NodeAttrs& attrs,
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   CHECK_EQ(in_type->size(), param.no_bias? 6U : 9U);
   CHECK_EQ(out_type->size(), 3U);
+#ifndef MXNET_USE_MKLDNN
   TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
+#endif
   TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kInt8);
   if (!param.no_bias) {
     TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kInt8);
   }
-
   const size_t start = param.no_bias? 2 : 3;
   const size_t end = param.no_bias? 6 : 9;
   for (size_t i = start; i < end; ++i) {
@@ -104,6 +108,24 @@ bool QuantizedConvType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+bool QuantizedConvStorageType(const nnvm::NodeAttrs& attrs,
+                              const int dev_mask,
+                              DispatchMode* dispatch_mode,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_conv)
 .describe(R"code(Convolution operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -119,7 +141,7 @@ and max thresholds representing the threholds for quantizing the float32 output
     return param.no_bias? 6 : 9;
   })
 .set_num_outputs(3)
-.set_attr_parser(ParamParser<ConvolutionParam>)
+.set_attr_parser(ConvolutionParamParser)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
     const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
@@ -137,6 +159,7 @@ and max thresholds representing the threholds for quantizing the float32 output
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedConvShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedConvType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedConvStorageType)
 .set_attr<FResourceRequest>("FResourceRequest",
   [](const NodeAttrs& attrs) {
     return std::vector<ResourceRequest>(1, ResourceRequest::kTempSpace);
diff --git a/src/operator/quantization/quantized_flatten-inl.h b/src/operator/quantization/quantized_flatten-inl.h
index 95f36615402..b7209fd28f5 100644
--- a/src/operator/quantization/quantized_flatten-inl.h
+++ b/src/operator/quantization/quantized_flatten-inl.h
@@ -62,11 +62,21 @@ void QuantizedFlattenCompute(const nnvm::NodeAttrs& attrs,
   using namespace mxnet_op;
   Stream<xpu> *s = ctx.get_stream<xpu>();
 
-  typedef int8_t DstDType;
-  typedef int8_t  SrcDType;
-  Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
-    outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
-    inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  if (inputs[0].type_flag_ == mshadow::kUint8) {
+    typedef uint8_t SrcDType;
+    typedef uint8_t DstDType;
+    Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
+      outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
+      inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  } else if (inputs[0].type_flag_ == mshadow::kInt8) {
+    typedef int8_t SrcDType;
+    typedef int8_t DstDType;
+    Kernel<quantized_flatten, xpu>::Launch(s, outputs[0].Size(),
+      outputs[0].dptr<DstDType>(), outputs[1].dptr<float>(), outputs[2].dptr<float>(),
+      inputs[0].dptr<SrcDType>(), inputs[1].dptr<float>(), inputs[2].dptr<float>());
+  } else {
+    LOG(FATAL) << "quantized_flatten op only supports int8 and uint8 as input and output type";
+  }
 }
 
 inline bool QuantizedFlattenShape(const nnvm::NodeAttrs& attrs,
@@ -96,10 +106,9 @@ inline bool QuantizedFlattenType(const nnvm::NodeAttrs& attrs,
                                  std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 3U);
   CHECK_EQ(out_attrs->size(), 3U);
-  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
   TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
   TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
-  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]);
   TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32);
   TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32);
   return (*in_attrs)[0] != -1;
diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc
index a3105eb654d..779e244c862 100644
--- a/src/operator/quantization/quantized_pooling.cc
+++ b/src/operator/quantization/quantized_pooling.cc
@@ -23,6 +23,9 @@
 */
 #include <mxnet/op_attr_types.h>
 #include "../nn/pooling-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_pooling-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -79,8 +82,12 @@ bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_type->size(), 3U);
   CHECK_EQ(out_type->size(), 3U);
   if (param.pool_type == pool_enum::kMaxPooling || param.pool_type == pool_enum::kAvgPooling) {
+#if MXNET_USE_MKLDNN == 1
+    TYPE_ASSIGN_CHECK(*out_type, 0, (*in_type)[0]);
+#else
     TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
     TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt8);
+#endif
   } else {
     LOG(FATAL) << "QuantizedPoolingOp only supports pool_type=max/avg for now";
   }
@@ -91,6 +98,27 @@ bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+inline static bool QuantizedPoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                               const int dev_mask,
+                                               DispatchMode *dispatch_mode,
+                                               std::vector<int> *in_attrs,
+                                               std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3);
+
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#else
+  CHECK_EQ(out_attrs->size(), 3);
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_pooling)
 .describe(R"code(Pooling operator for input and output data type of int8.
 The input and output data comes with min and max thresholds for quantizing
@@ -101,7 +129,7 @@ the float32 data into int8.
     This operator only supports `pool_type` of `avg` or `max`.)code" ADD_FILELINE)
 .set_num_inputs(3)
 .set_num_outputs(3)
-.set_attr_parser(ParamParser<PoolingParam>)
+.set_attr_parser(PoolingParamParser)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
     return std::vector<std::string>{"data", "min_data", "max_data"};
@@ -112,6 +140,7 @@ the float32 data into int8.
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedPoolingShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedPoolingType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedPoolingStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize",
   [](const NodeAttrs& attrs) {
     const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc
index 83ea37b835c..5ce0ff0b020 100644
--- a/src/operator/quantization/requantize.cc
+++ b/src/operator/quantization/requantize.cc
@@ -24,11 +24,31 @@
  */
 #include "./requantize-inl.h"
 #include "./quantize-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_requantize-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(RequantizeParam);
 
+bool RequantizeStorageType(const nnvm::NodeAttrs& attrs,
+                           const int dev_mask,
+                           DispatchMode* dispatch_mode,
+                           std::vector<int> *in_attrs,
+                           std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  (*out_attrs)[0] = kDefaultStorage;
+  (*out_attrs)[1] = kDefaultStorage;
+  (*out_attrs)[2] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_requantize)
 .describe(R"code(Given data that is quantized in int32 and the corresponding thresholds,
 requantize the data into int8 using min and max thresholds either calculated at runtime
@@ -43,7 +63,12 @@ inference accuracy.
 .set_num_outputs(3)
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", RequantizeType)
+.set_attr<FInferStorageType>("FInferStorageType", RequantizeStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNRequantizeForward)
+#else
 .set_attr<FCompute>("FCompute<cpu>", RequantizeForward<cpu>)
+#endif
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
     const RequantizeParam& param =
       nnvm::get<RequantizeParam>(attrs.parsed);
diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc
index 983f879888c..83c9034e364 100644
--- a/src/operator/random/shuffle_op.cc
+++ b/src/operator/random/shuffle_op.cc
@@ -22,8 +22,9 @@
  * \file shuffle_op.cc
  * \brief Operator to shuffle elements of an NDArray
  */
-#if (__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)
-  #define USE_GNU_PARALLEL_SHUFFLE
+#if !defined(__ANDROID__) && ((__GNUC__ > 4 && \
+    !defined(__clang_major__)) || (__clang_major__ > 4 && __linux__))
+  #define USE_GNU_PARALLEL_SHUFFLE
 #endif
 
 #include <mxnet/operator_util.h>
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 87381960eb1..5f9e59dfa53 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -598,7 +598,11 @@ void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs,
         uint64_t shape_out_prod =
           static_cast<uint64_t>(grad_out.shape_[0])*
           static_cast<uint64_t>(grad_out.shape_[1]);
-        if (shape_out_prod < (uint64_t)16384 && shape_in_prod < (uint64_t)16384) {
+
+        static bool default_addtakegrad =
+            dmlc::GetEnv("MXNET_FORCE_ADDTAKEGRAD", false);
+        if (!default_addtakegrad || (shape_out_prod < (uint64_t)16384 &&
+                                     shape_in_prod < (uint64_t)16384)) {
           AddTakeGrad(grad_in, data, grad_out);
         } else {
           AddTakeGradLargeBatchCaller(ctx, grad_in, data, grad_out);
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 3bf437376fa..bed97301fd1 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -28,9 +28,11 @@
 #if MXNET_USE_CUDA
   #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
+
 #include <mxnet/base.h>
 #include <mxnet/storage.h>
 #include <unordered_map>
+#include <algorithm>
 #include <vector>
 #include <mutex>
 #include <new>
@@ -43,7 +45,8 @@ namespace storage {
 
 #if MXNET_USE_CUDA
 /*!
- * \brief Storage manager with a memory pool on gpu.
+ * \brief Storage manager with a memory pool on gpu. Memory chunks are reused based on exact size
+ * match.
  */
 class GPUPooledStorageManager final : public StorageManager {
  public:
@@ -52,6 +55,11 @@ class GPUPooledStorageManager final : public StorageManager {
    */
   GPUPooledStorageManager() {
     reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    if (page_size_ < NDEV) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than " << NDEV \
+                 << ". Got " << page_size_ << ".";
+    }
   }
   /*!
    * \brief Default destructor.
@@ -71,7 +79,7 @@ class GPUPooledStorageManager final : public StorageManager {
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
     cudaError_t err = cudaFree(handle.dptr);
-    size_t size = handle.size + NDEV;
+    size_t size = std::max(handle.size, page_size_);
     // ignore unloading error, as memory has already been recycled
     if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
@@ -83,10 +91,12 @@ class GPUPooledStorageManager final : public StorageManager {
   void ReleaseAll();
   // used memory
   size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
   // percentage of reserved memory
   int reserve_;
   // number of devices
-  const int NDEV = 32;
+  const size_t NDEV = 32;
   // memory pool
   std::unordered_map<size_t, std::vector<void*>> memory_pool_;
   DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager);
@@ -94,7 +104,7 @@ class GPUPooledStorageManager final : public StorageManager {
 
 void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = handle->size + NDEV;
+  size_t size = std::max(handle->size, page_size_);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
     size_t free, total;
@@ -119,7 +129,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
 
 void GPUPooledStorageManager::Free(Storage::Handle handle) {
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
-  size_t size = handle.size + NDEV;
+  size_t size = std::max(handle.size, page_size_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(handle.dptr);
 }
@@ -129,13 +139,172 @@ void GPUPooledStorageManager::ReleaseAll() {
     for (auto&& j : i.second) {
       Storage::Handle handle;
       handle.dptr = j;
-      handle.size = i.first - NDEV;
+      handle.size = i.first;
       DirectFreeNoLock(handle);
     }
   }
   memory_pool_.clear();
 }
 
+/*!
+ * \brief Storage manager with a memory pool, with rounded size, on gpu.
+ *
+ * This GPU mem pool uses a mixture of nearest-pow2 (exponential) rounding and
+ * nearest-multiple (linear) rounding to help alleviate the memory allocation pressure
+ * that the default naive exact-size-match pool handles poorly, such as in variable-length
+ * input/output cases like RNN workloads.
+ *
+ * \param cutoff the cutoff at which rounding switches from exponential to linear. It is set
+ * through the MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF environment variable and must be
+ * between 20 (1 MB) and 34 (16 GB).
+ * Suppose the cutoff is X, the memory size buckets look like this:
+ * exp2(0), exp2(1), ..., exp2(X), 2*exp2(X), 3*exp2(X), ...
+ */
+class GPUPooledRoundedStorageManager final : public StorageManager {
+ public:
+  /*!
+   * \brief Default constructor.
+   */
+  GPUPooledRoundedStorageManager() {
+    reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5);
+    page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096);
+    cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24);
+    if (page_size_ < 32) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \
+                 << "Got: " << page_size_ << ".";
+    }
+    if (page_size_ != 1ul << log2_round_up(page_size_)) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << ".";
+    }
+    page_size_ = log2_round_up(page_size_);
+    if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \
+                 << cut_off_ << ".";
+    }
+    if (cut_off_ < page_size_) {
+      LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \
+                 << "smaller than log2 of MXNET_GPU_MEM_POOL_PAGE_SIZE. Got: " \
+                 << cut_off_ << " vs " << page_size_ << ".";
+    }
+    memory_pool_ = std::vector<std::vector<void*>>((1ul << (LOG2_MAX_MEM - cut_off_)) + cut_off_);
+  }
+  /*!
+   * \brief Default destructor.
+   */
+  ~GPUPooledRoundedStorageManager() {
+    ReleaseAll();
+  }
+
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override;
+
+  void DirectFree(Storage::Handle handle) override {
+    std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+    DirectFreeNoLock(handle);
+  }
+
+ private:
+  inline int log2_round_up(size_t s) {
+    return static_cast<int>(std::ceil(std::log2(s)));
+  }
+  inline int div_pow2_round_up(size_t s, int divisor_log2) {
+    // (1025, 10) -> 2
+    // (2048, 10) -> 2
+    // (2049, 10) -> 3
+    size_t result = s >> divisor_log2;
+    return static_cast<int>(result + (s > (result << divisor_log2) ? 1 : 0));
+  }
+  inline int get_bucket(size_t s) {
+    int log_size = log2_round_up(s);
+    if (log_size > static_cast<int>(cut_off_))
+      return div_pow2_round_up(s, cut_off_) - 1 + cut_off_;
+    else
+      return std::max(log_size, static_cast<int>(page_size_));
+  }
+  inline size_t get_size(int bucket) {
+    if (bucket <= static_cast<int>(cut_off_))
+      return 1ul << bucket;
+    else
+      return (bucket - cut_off_ + 1) * (1ul << cut_off_);
+  }
+
+  void DirectFreeNoLock(Storage::Handle handle) {
+    cudaError_t err = cudaFree(handle.dptr);
+    size_t size = get_size(get_bucket(handle.size));
+    // ignore unloading error, as memory has already been recycled
+    if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
+      LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
+    }
+    used_memory_ -= size;
+  }
+
+ private:
+  void ReleaseAll();
+  // number of devices
+  const int NDEV = 32;
+  // log2 of the maximum bucket size (2^34 = 16 GB)
+  const size_t LOG2_MAX_MEM = 34;
+  // address width in bits
+  static const int addr_width = sizeof(size_t) * 8;
+  // used memory
+  size_t used_memory_ = 0;
+  // page size
+  size_t page_size_;
+  // log2 of the memory size at which rounding switches from exponential to linear mode
+  size_t cut_off_;
+  // percentage of reserved memory
+  int reserve_;
+  // memory pool
+  std::vector<std::vector<void*>> memory_pool_;
+  DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager);
+};  // class GPUPooledRoundedStorageManager
+
+void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
+  std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle->size);
+  size_t size = get_size(bucket);
+  auto&& reuse_pool = memory_pool_[bucket];
+  if (reuse_pool.size() == 0) {
+    size_t free, total;
+    cudaMemGetInfo(&free, &total);
+    if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
+      ReleaseAll();
+
+    void* ret = nullptr;
+    cudaError_t e = cudaMalloc(&ret, size);
+    if (e != cudaSuccess && e != cudaErrorCudartUnloading) {
+      LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e);
+    }
+    used_memory_ += size;
+    handle->dptr = ret;
+  } else {
+    auto ret = reuse_pool.back();
+    reuse_pool.pop_back();
+    handle->dptr = ret;
+  }
+}
+
+void GPUPooledRoundedStorageManager::Free(Storage::Handle handle) {
+  std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
+  int bucket = get_bucket(handle.size);
+  auto&& reuse_pool = memory_pool_[bucket];
+  reuse_pool.push_back(handle.dptr);
+}
+
+void GPUPooledRoundedStorageManager::ReleaseAll() {
+  for (size_t i = 0; i < memory_pool_.size(); i++) {
+    size_t size = get_size(static_cast<int>(i));
+    for (auto& j : memory_pool_[i]) {
+      Storage::Handle handle;
+      handle.size = size;
+      handle.dptr = j;
+      DirectFreeNoLock(handle);
+    }
+    memory_pool_[i].clear();
+  }
+}
+
 #endif  // MXNET_USE_CUDA
 
 }  // namespace storage
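
A standalone sketch of the bucketing above, re-implemented for illustration with the default page size 4096 (stored as log2 = 12 after the constructor's conversion) and the default cutoff 24 (this is not the class itself):

#include <cmath>
#include <cstddef>
#include <cstdio>

const int kPageLog2 = 12;   // log2(4096), the default page size
const int kCutOff   = 24;   // default MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF

int GetBucket(size_t s) {
  int log_size = static_cast<int>(std::ceil(std::log2(static_cast<double>(s))));
  if (log_size > kCutOff)  // linear region: round up to a multiple of 2^kCutOff
    return static_cast<int>((s + (1ul << kCutOff) - 1) >> kCutOff) - 1 + kCutOff;
  return log_size < kPageLog2 ? kPageLog2 : log_size;  // pow2 region
}

size_t GetSize(int bucket) {
  return bucket <= kCutOff ? (1ul << bucket)
                           : (bucket - kCutOff + 1) * (1ul << kCutOff);
}

int main() {
  std::printf("%zu\n", GetSize(GetBucket(100)));         // 4096 (page size floor)
  std::printf("%zu\n", GetSize(GetBucket(5000)));        // 8192 (next power of two)
  std::printf("%zu\n", GetSize(GetBucket(20ul << 20)));  // 33554432 (2 * 16 MB)
}

Below the cutoff, sizes round up to the next power of two (never below one page); above it, to the next multiple of 2^24 = 16 MB, which bounds per-allocation waste at 16 MB instead of letting it double with the allocation size.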
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 674c123d8e8..a0a3ed757ea 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -118,7 +118,21 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
 #if MXNET_USE_CUDA
             CUDA_CALL(cudaGetDeviceCount(&num_gpu_device));
             CHECK_GT(num_gpu_device, 0) << "GPU usage requires at least 1 GPU";
-            ptr = new storage::GPUPooledStorageManager();
+
+            const char *type = getenv("MXNET_GPU_MEM_POOL_TYPE");
+            const bool default_pool = (type == nullptr);
+            if (default_pool) type = "Naive";
+            std::string strategy = type;
+
+            if (strategy == "Round") {
+              ptr = new storage::GPUPooledRoundedStorageManager();
+              LOG(INFO) << "Using GPUPooledRoundedStorageManager.";
+            } else {
+              if (strategy != "Naive") {
+                LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << ".";
+              }
+              ptr = new storage::GPUPooledStorageManager();
+            }
 #else
             LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage";
 #endif  // MXNET_USE_CUDA
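
With this change the pool is selected once per process from the environment: leaving MXNET_GPU_MEM_POOL_TYPE unset (or setting it to "Naive") keeps the existing exact-size-match GPUPooledStorageManager, "Round" selects the new GPUPooledRoundedStorageManager, and any other value aborts with LOG(FATAL).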
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index 76872d5e6cf..82fee67b114 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -30,6 +30,7 @@
 #include "gtest/gtest.h"
 #include "mxnet/imperative.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../../src/operator/nn/mkldnn/mkldnn_ops-inl.h"
 
 using namespace mxnet;
 
@@ -425,30 +426,45 @@ OpAttrs GetSumOp() {
  *    reordered to 5 dimensions.
  *
  */
-std::vector<NDArrayAttrs> GetTestInputArrays(InitFunc init_fn) {
+std::vector<NDArrayAttrs> GetTestInputArrays(InitFunc init_fn, bool rand = false) {
   TestArrayShapes tas = GetTestArrayShapes();
   std::vector<nnvm::TShape> shapes = tas.shapes;
   std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
 
   std::vector<NDArrayAttrs> in_arrs;
+  std::string desc;
   for (auto shape : shapes) {
     // Type 1.
     NDArray arr(shape, Context());
     in_arrs.emplace_back(arr, "Normal NDArray");
-    init_fn(&in_arrs.back().arr, false);
+    init_fn(&in_arrs.back().arr, rand);
     for (auto pd : pds) {
       if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
         continue;
 
       // Type 2, 3.
       arr = NDArray(shape, Context());
-      in_arrs.emplace_back(arr, "MKLDNN NDArray");
+      desc = "MKLDNN NDArray";
+      if (shape.ndim() != pd.desc().data.ndims) {
+        std::stringstream ss;
+        ss << "MKLDNN NDArray with different memory layout " <<
+           shape.ndim() << "/" << pd.desc().data.ndims;
+        desc = ss.str();
+      }
+      in_arrs.emplace_back(arr, desc);
       InitMKLDNNArray(&in_arrs.back().arr, pd, init_fn);
 
       // Type 4, 5, 6.
       arr = NDArray(shape, Context());
+      desc = "Reshaped MKLDNN NDArray";
+      if (shape.ndim() != pd.desc().data.ndims) {
+        std::stringstream ss;
+        ss << "Reshaped MKLDNN NDArray with different memory layout "
+           << shape.ndim() << "/" << pd.desc().data.ndims;
+        desc = ss.str();
+      }
       InitMKLDNNArray(&arr, pd, init_fn);
-      in_arrs.emplace_back(arr.Slice(1, arr.shape()[0] - 1), "Reshaped MKLDNN NDArray");
+      in_arrs.emplace_back(arr.Slice(1, arr.shape()[0] - 1), desc);
     }
   }
   return in_arrs;
@@ -495,6 +511,7 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
                                          const std::vector<mkldnn::memory::primitive_desc> &pds,
                                          const InitFunc init_fn) {
   std::vector<NDArrayAttrs> in_arrs;
+  std::string desc;
   // Type 1.
   NDArray arr(shape, Context());
   in_arrs.emplace_back(arr, "Normal NDArray");
@@ -538,7 +555,14 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
 
     // Type 2, 3.
     arr = NDArray(shape, Context());
-    in_arrs.emplace_back(arr, "MKLDNN NDArray");
+    desc = "MKLDNN NDArray";
+    if (shape.ndim() != pd.desc().data.ndims) {
+      std::stringstream ss;
+      ss << "MKLDNN NDArray with different memory layout "
+         << shape.ndim() << "/" << pd.desc().data.ndims;
+      desc = ss.str();
+    }
+    in_arrs.emplace_back(arr, desc);
     InitMKLDNNArray(&in_arrs.back().arr, pd, init_fn, true);
 
     // Type 8, 9.
@@ -548,7 +572,14 @@ std::vector<NDArrayAttrs> GetTestOutputArrays(const TShape &shape,
     NDArray arr = NDArray(s, Context());
     arr = arr.AsArray(shape, arr.dtype());
     InitMKLDNNArray(&arr, pd, init_fn, true);
-    in_arrs.emplace_back(arr, "Reused MKLDNN NDArray");
+    desc = "Reused MKLDNN NDArray";
+    if (shape.ndim() != pd.desc().data.ndims) {
+      std::stringstream ss;
+      ss << "Reused MKLDNN NDArray with different memory layout "
+         << shape.ndim() << "/" << pd.desc().data.ndims;
+      desc = ss.str();
+    }
+    in_arrs.emplace_back(arr, desc);
   }
   return in_arrs;
 }
@@ -587,7 +618,7 @@ void VerifySumResult(const std::vector<NDArray *> &in_arrs, const NDArray &arr)
   mshadow::default_real_t *d2 = in2.data().dptr<mshadow::default_real_t>();
   mshadow::default_real_t *o = out.data().dptr<mshadow::default_real_t>();
   for (size_t i = 0; i < in1.shape().Size(); i++)
-    EXPECT_EQ(d1[i] + d2[i], o[i]);
+    ASSERT_EQ(d1[i] + d2[i], o[i]);
 }
 
 void PrintVerifyMsg(const NDArrayAttrs &arr1, const NDArrayAttrs &arr2) {
@@ -736,4 +767,55 @@ TEST(IMPERATIVE, BinaryOp) {
   TestBinaryOp(attrs, VerifySumResult);
 }
 
+void VerifySumMemory(mkldnn::memory in_mem1, mkldnn::memory in_mem2, mkldnn::memory out_mem) {
+  float *in1 = static_cast<float*>(in_mem1.get_data_handle());
+  float *in2 = static_cast<float*>(in_mem2.get_data_handle());
+  float *out = static_cast<float*>(out_mem.get_data_handle());
+  for (size_t i = 0; i < in_mem1.get_primitive_desc().get_size() / sizeof(float); i++) {
+    ASSERT_EQ(in1[i] + in2[i], out[i]);
+  }
+}
+
+TEST(MKLDNN_BASE, MKLDNNSum) {
+  std::vector<NDArrayAttrs> in_arrs = GetTestInputArrays(InitDefaultArray);
+  std::vector<NDArrayAttrs> in_arrs2 = GetTestInputArrays(InitDefaultArray, true);
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  for (size_t i = 0; i < in_arrs.size(); i++) {
+    auto in_arr = in_arrs[i];
+    auto in_arr2 = in_arrs2[i];
+    std::vector<NDArrayAttrs> out_arrs = GetTestOutputArrays(in_arr.arr.shape(), pds,
+                                                             InitDefaultArray);
+    if (!SupportMKLDNN(in_arr.arr) || !in_arr.arr.IsMKLDNNData() || in_arr.arr.IsView())
+      continue;
+
+    for (auto out_arr : out_arrs) {
+      auto in_mem1 = in_arr.arr.GetMKLDNNData();
+      auto in_mem2 = in_arr.arr.GetMKLDNNData();
+      auto out_mem = out_arr.arr.GetMKLDNNData(in_mem1->get_primitive_desc());
+
+      // TODO(alexzai): remove this no-op by adding reordering support in MKLDNNSum
+      if (out_mem == nullptr)
+        continue;
+      PrintVerifyMsg(in_arr, in_arr);
+      op::MKLDNNSum(*in_mem1, *in_mem2, *out_mem);
+      MKLDNNStream::Get()->Submit();
+      VerifySumMemory(*in_mem1, *in_mem2, *out_mem);
+    }
+
+    // in place
+    auto input_mem = in_arr.arr.GetMKLDNNData();
+    auto input_mem2 = in_arr2.arr.GetMKLDNNData();
+    NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy");
+    PrintVerifyMsg(orig_arr, in_arr);
+    InitMKLDNNArray(&orig_arr.arr, input_mem->get_primitive_desc(), InitDefaultArray);
+    orig_arr.arr.CopyFrom(*input_mem);
+    auto old_mem = orig_arr.arr.GetMKLDNNData();
+    op::MKLDNNSum(*input_mem, *input_mem2, *input_mem);
+    MKLDNNStream::Get()->Submit();
+    VerifySumMemory(*old_mem, *input_mem2, *input_mem);
+  }
+}
+
 #endif
diff --git a/tests/cpp/storage/storage_test.cc b/tests/cpp/storage/storage_test.cc
index 269480b83c3..026c3660f32 100644
--- a/tests/cpp/storage/storage_test.cc
+++ b/tests/cpp/storage/storage_test.cc
@@ -1,5 +1,4 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
+/* * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
@@ -22,6 +21,7 @@
  * \file storage_test.cc
  * \brief cpu/gpu storage tests
 */
+#include <stdlib.h>
 #include <gtest/gtest.h>
 #include <dmlc/logging.h>
 #include <mxnet/storage.h>
@@ -43,7 +43,37 @@ TEST(Storage, Basic_CPU) {
 }
 
 #if MXNET_USE_CUDA
-TEST(Storage, Basic_GPU) {
+TEST(Storage_GPU, Basic_GPU) {
+  if (mxnet::test::unitTestsWithCuda) {
+    putenv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF=20");
+    putenv("MXNET_GPU_MEM_POOL_TYPE=Round");
+    auto &&storage = mxnet::Storage::Get();
+    mxnet::Context context_gpu = mxnet::Context::GPU(0);
+    auto &&handle = storage->Alloc(32, context_gpu);
+    auto &&handle2 = storage->Alloc(2097153, context_gpu);
+    EXPECT_EQ(handle.ctx, context_gpu);
+    EXPECT_EQ(handle.size, 32);
+    EXPECT_EQ(handle2.ctx, context_gpu);
+    EXPECT_EQ(handle2.size, 2097153);
+    auto ptr = handle.dptr;
+    auto ptr2 = handle2.dptr;
+    storage->Free(handle);
+    storage->Free(handle2);
+
+    handle = storage->Alloc(4095, context_gpu);
+    EXPECT_EQ(handle.ctx, context_gpu);
+    EXPECT_EQ(handle.size, 4095);
+    EXPECT_EQ(handle.dptr, ptr);
+    storage->Free(handle);
+
+    handle2 = storage->Alloc(3145728, context_gpu);
+    EXPECT_EQ(handle2.ctx, context_gpu);
+    EXPECT_EQ(handle2.size, 3145728);
+    EXPECT_EQ(handle2.dptr, ptr2);
+    storage->Free(handle2);
+    unsetenv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF");
+    unsetenv("MXNET_GPU_MEM_POOL_TYPE");
+  }
   if (mxnet::test::unitTestsWithCuda) {
     constexpr size_t kSize = 1024;
     mxnet::Context context_gpu = mxnet::Context::GPU(0);
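
The pointer-equality assertions in the new test follow from the rounding scheme: with a cutoff of 20, the 32-byte and 4095-byte requests both round up to the 4096-byte page bucket, and 2097153 bytes (2 MB + 1) rounds up to the 3 * 2^20 = 3145728-byte linear bucket, so the second pair of allocations is served from the chunks freed by the first.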
diff --git a/tests/python-pytest/onnx/import/onnx_import_test.py b/tests/python-pytest/onnx/import/onnx_import_test.py
index 741ae1febb1..573dd74a471 100644
--- a/tests/python-pytest/onnx/import/onnx_import_test.py
+++ b/tests/python-pytest/onnx/import/onnx_import_test.py
@@ -77,6 +77,78 @@ def test_broadcast():
     output = bkd_rep.run([input1, input2])
     npt.assert_almost_equal(output[0], numpy_op)
 
+@with_seed()
+def test_greater():
+    """Test for logical greater in onnx operators."""
+    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
+    input2 = np.random.rand(1, 5).astype("float32")
+    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
+              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
+
+    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
+
+    nodes = [helper.make_node("Greater", ["input1", "input2"], ["output"])]
+
+    graph = helper.make_graph(nodes,
+                              "greater_test",
+                              inputs,
+                              outputs)
+
+    greater_model = helper.make_model(graph)
+
+    bkd_rep = mxnet_backend.prepare(greater_model)
+    numpy_op = np.greater(input1, input2).astype(np.float32)
+    output = bkd_rep.run([input1, input2])
+    npt.assert_almost_equal(output[0], numpy_op)
+
+@with_seed()
+def test_lesser():
+    """Test for logical greater in onnx operators."""
+    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
+    input2 = np.random.rand(1, 5).astype("float32")
+    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
+              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
+
+    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
+
+    nodes = [helper.make_node("Less", ["input1", "input2"], ["output"])]
+
+    graph = helper.make_graph(nodes,
+                              "lesser_test",
+                              inputs,
+                              outputs)
+
+    lesser_model = helper.make_model(graph)
+
+    bkd_rep = mxnet_backend.prepare(lesser_model)
+    numpy_op = np.less(input1, input2).astype(np.float32)
+    output = bkd_rep.run([input1, input2])
+    npt.assert_almost_equal(output[0], numpy_op)
+
+@with_seed()
+def test_equal():
+    """Test for logical greater in onnx operators."""
+    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
+    input2 = np.random.rand(1, 5).astype("float32")
+    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
+              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
+
+    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
+
+    nodes = [helper.make_node("Equal", ["input1", "input2"], ["output"])]
+
+    graph = helper.make_graph(nodes,
+                              "equal_test",
+                              inputs,
+                              outputs)
+
+    equal_model = helper.make_model(graph)
+
+    bkd_rep = mxnet_backend.prepare(equal_model)
+    numpy_op = np.equal(input1, input2).astype(np.float32)
+    output = bkd_rep.run([input1, input2])
+    npt.assert_almost_equal(output[0], numpy_op)
+
 def test_super_resolution_example():
     """Test the super resolution example in the example/onnx folder"""
     sys.path.insert(0, os.path.join(CURR_PATH, '../../../../example/onnx/'))
diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py
index 8e6dc443bba..1a4d8c4fe37 100644
--- a/tests/python-pytest/onnx/import/test_cases.py
+++ b/tests/python-pytest/onnx/import/test_cases.py
@@ -58,6 +58,16 @@
     'test_argmax',
     'test_argmin',
     'test_min',
+    'test_logical_and',
+    'test_logical_xor',
+    'test_logical_not',
+    'test_logical_or',
+    'test_clip',
+    'test_softsign',
+    'test_reduce_l2',
+    'test_reduce_log_sum',
+    'test_reduce_log_sum_exp',
+    'test_reduce_sum_square',
     #pytorch operator tests
     'test_operator_exp',
     'test_operator_maxpool',
diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
index 453161fcfe7..126ccabaa7b 100644
--- a/tests/python/gpu/test_forward.py
+++ b/tests/python/gpu/test_forward.py
@@ -22,7 +22,7 @@
 from mxnet.test_utils import *
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import utils
 
 def _get_model():
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 273ad3d69ca..d4f6f31a30e 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -27,7 +27,7 @@
 import unittest
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py
index a6e8ebf658a..76231fbe90e 100644
--- a/tests/python/gpu/test_kvstore_gpu.py
+++ b/tests/python/gpu/test_kvstore_gpu.py
@@ -24,7 +24,7 @@
 from mxnet.test_utils import assert_almost_equal, default_context
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 shape = (4, 4)
 keys = [5, 7, 11]
@@ -83,7 +83,7 @@ def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
         check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], is_same_rowid=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)])
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], is_same_rowid=True)
-        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True) 
+        check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True)
         check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], use_slice=True)
 
     # test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/9384
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 7c3d670ba22..ed4aaa43782 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -32,7 +32,7 @@
 
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from test_operator import *
 from test_optimizer import *
 from test_random import *
@@ -740,8 +740,8 @@ def test_pooling_with_type():
 
 @with_seed()
 def test_pooling_versions():
-    def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride,
-                                     pooling_convention='valid', global_pool=False, p_value=2):
+    def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, pooling_convention='valid',
+                                     global_pool=False, p_value=2, count_include_pad=True, tol=None):
         ctx_list = []
         sym_list = []
         # PoolingV1 cpu
@@ -765,61 +765,69 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str
             ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, name='pool', p_value=p_value))
+                                               pooling_convention=pooling_convention, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
             else:
-                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool', p_value=p_value))
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
         # Pooling gpu
         if 'pool_gpu' in pool_op_list:
             ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, cudnn_off=True, name='pool', p_value=p_value))
+                                               pooling_convention=pooling_convention, cudnn_off=True, name='pool',
+                                               p_value=p_value, count_include_pad=count_include_pad))
             else:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True,
-                                               name='pool', p_value=p_value))
+                                               name='pool', p_value=p_value, count_include_pad=count_include_pad))
         # CuDNNPooling
         if 'pool_cudnn' in pool_op_list:
             ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
             if not global_pool:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                               pooling_convention=pooling_convention, p_value=p_value, cudnn_off=False, name='pool'))
+                                               pooling_convention=pooling_convention, p_value=p_value, cudnn_off=False,
+                                               name='pool', count_include_pad=count_include_pad))
             else:
                 sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, p_value=p_value,
-                                               cudnn_off=False, name='pool'))
-        check_consistency(sym_list, ctx_list)
+                                               cudnn_off=False, name='pool', count_include_pad=count_include_pad))
+        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol)
 
-    def test_1d_pooling(pool_type, p_value=2):
+    def test_1d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20)
         kernel = (4,)
         pad = (0,)
         stride = (1,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2,)
         stride = (2,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0,)
         stride = (1,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2,)
         stride = (2,)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     global_pool=True, p_value=p_value)
+                                     global_pool=True, p_value=p_value, count_include_pad=count_include_pad)
 
-    def test_2d_pooling(pool_type, p_value=2):
+    def test_2d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20, 20)
         kernel = (4, 5)
         pad = (0, 0)
@@ -831,14 +839,15 @@ def test_2d_pooling(pool_type, p_value=2):
         else:
             test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         pooling_convention='valid', global_pool=False)
+                                         pooling_convention='valid', global_pool=False, count_include_pad=count_include_pad)
 
         # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
         pad = (2, 3)
         stride = (2, 3)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0, 0)
         stride = (1, 1)
@@ -847,16 +856,24 @@ def test_2d_pooling(pool_type, p_value=2):
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                          pooling_convention='full', global_pool=False, p_value=p_value)
         else:
-            test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
-                                         data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         pooling_convention='full', global_pool=False)
+            if count_include_pad:
+                test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                             data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                             pooling_convention='full', global_pool=False,
+                                             count_include_pad=count_include_pad)
+            else:
+                test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                             data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                             pooling_convention='full', global_pool=False,
+                                             count_include_pad=count_include_pad)
 
         # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
         pad = (2, 3)
         stride = (2, 3)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         if pool_type == 'lp':
             test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
@@ -865,55 +882,62 @@ def test_2d_pooling(pool_type, p_value=2):
         else:
             test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                          data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                         global_pool=True)
+                                         global_pool=True, count_include_pad=count_include_pad)
 
-    def test_3d_pooling(pool_type, p_value=2):
+    def test_3d_pooling(pool_type, p_value=2, count_include_pad=True):
         data = (2, 3, 20, 20, 20)
         kernel = (4, 5, 3)
         pad = (0, 0, 0)
         stride = (1, 1, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2, 3, 3)
         stride = (2, 3, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='valid', global_pool=False, p_value=p_value)
+                                     pooling_convention='valid', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (0, 0, 0)
         stride = (1, 1, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         pad = (2, 3, 3)
         stride = (2, 3, 1)
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     pooling_convention='full', global_pool=False, p_value=p_value)
+                                     pooling_convention='full', global_pool=False, p_value=p_value,
+                                     count_include_pad=count_include_pad)
 
         test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
                                      data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                     global_pool=True, p_value=p_value)
+                                     global_pool=True, p_value=p_value, count_include_pad=count_include_pad)
 
     test_1d_pooling('max')
-    test_1d_pooling('avg')
+    test_1d_pooling('avg', count_include_pad=True)
+    test_1d_pooling('avg', count_include_pad=False)
     test_1d_pooling('sum')
     test_1d_pooling('lp', p_value=1)
     test_1d_pooling('lp', p_value=2)
     test_1d_pooling('lp', p_value=3)
 
     test_2d_pooling('max')
-    test_2d_pooling('avg')
+    test_2d_pooling('avg', count_include_pad=True)
+    test_2d_pooling('avg', count_include_pad=False)
     test_2d_pooling('sum')
     test_2d_pooling('lp', p_value=1)
     test_2d_pooling('lp', p_value=2)
     test_2d_pooling('lp', p_value=3)
 
     test_3d_pooling('max')
-    test_3d_pooling('avg')
+    test_3d_pooling('avg', count_include_pad=True)
+    test_3d_pooling('avg', count_include_pad=False)
     test_3d_pooling('sum')
     test_3d_pooling('lp', p_value=1)
     test_3d_pooling('lp', p_value=2)
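
Note: the count_include_pad coverage added above distinguishes two averaging conventions for padded windows, namely whether padded zeros count toward the divisor. A minimal numeric sketch of the semantics under test, assuming the imperative mx.nd.Pooling mirrors the mx.sym.Pooling signature exercised in this diff:

    import mxnet as mx

    # 2x2 input of ones, padded by 1 on each side; kernel 2x2, stride 2,
    # so every output window covers exactly one real value and three pad zeros.
    x = mx.nd.ones((1, 1, 2, 2))
    inc = mx.nd.Pooling(data=x, kernel=(2, 2), pool_type='avg', pad=(1, 1),
                        stride=(2, 2), count_include_pad=True)   # each cell: 1/4 = 0.25
    exc = mx.nd.Pooling(data=x, kernel=(2, 2), pool_type='avg', pad=(1, 1),
                        stride=(2, 2), count_include_pad=False)  # each cell: 1/1 = 1.0
    print(inc.asnumpy(), exc.asnumpy())

This also suggests why the helper passes equal_nan=(not count_include_pad) to check_consistency: with count_include_pad=False, a window consisting entirely of padding would divide by a zero count and yield NaN.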
diff --git a/tests/python/mkl/test_quantization_mkldnn.py b/tests/python/mkl/test_quantization_mkldnn.py
new file mode 100644
index 00000000000..290f1a195c2
--- /dev/null
+++ b/tests/python/mkl/test_quantization_mkldnn.py
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import sys
+import mxnet as mx
+
+os.environ['ENABLE_MKLDNN_QUANTIZATION_TEST'] = '1'
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../quantization'))
+from test_quantization import *
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
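
Note: the new module above re-runs the entire quantization suite under MKL-DNN by setting an environment flag before the star import; the is_test_for_* helpers added to test_quantization.py below consult that flag. A condensed sketch of the resulting three-way dispatch (backend_under_test is a hypothetical name, not part of this diff):

    import os
    import mxnet as mx

    def backend_under_test():
        # gpu context -> GPU kernels; cpu context + flag -> MKL-DNN; cpu alone -> native
        if mx.current_context().device_type == 'gpu':
            return 'gpu'
        if os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == '1':
            return 'mkldnn'
        return 'native_cpu'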
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 7b08f46e836..15e8582b9ee 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -18,6 +18,7 @@
 """Some of the tests using CUDNN require a special GPU instruction called dp4a.
 Ref: http://images.nvidia.com/content/pdf/tesla/184457-Tesla-P4-Datasheet-NV-Final-Letter-Web.pdf
 """
+import os
 import mxnet as mx
 import numpy as np
 from mxnet.test_utils import assert_almost_equal, rand_ndarray, rand_shape_nd, same, DummyIter
@@ -25,6 +26,16 @@
 from mxnet.module import Module
 from mxnet.io import NDArrayIter
 
+def is_test_for_gpu():
+    return mx.current_context().device_type == 'gpu'
+
+def is_test_for_mkldnn():
+    return (mx.current_context().device_type == 'cpu'
+            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == '1')
+
+def is_test_for_native_cpu():
+    return (mx.current_context().device_type == 'cpu'
+            and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') is None)
 
 @with_seed()
 def test_quantize_float32_to_int8():
@@ -120,187 +131,220 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None):
 
 @with_seed()
 def test_quantized_conv():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_conv on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias):
-        with mx.Context('gpu', 0):
-            # run fp32 conv
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride,
-                                        no_bias=no_bias, cudnn_off=False, name='conv2d')
-            arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
-            arg_names = conv2d.list_arguments()
-            conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
-            conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                           shape=data_shape).astype('int32')
-            conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                           shape=arg_shapes[1]).astype('int32')
-            if not no_bias:
-                conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                               shape=arg_shapes[2]).astype('int32')
-            output = conv_exe_fp32.forward()[0]
-
-            # run quantized conv
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            qweight = mx.sym.Variable(name='qweight', dtype='int8')
-            min_data = mx.sym.Variable(name='min_data')
-            max_data = mx.sym.Variable(name='max_data')
-            min_weight = mx.sym.Variable(name='min_weight')
-            max_weight = mx.sym.Variable(name='max_weight')
-            quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data,
-                                                             max_data=max_data, min_weight=min_weight,
-                                                             max_weight=max_weight, kernel=kernel,
-                                                             num_filter=num_filter, pad=pad, stride=stride,
-                                                             no_bias=no_bias)
-            qarg_names = quantized_conv2d.list_arguments()
-            type_dict = None
-            if not no_bias:
-                type_dict = {qarg_names[2]: 'int8'}
-            conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
-            conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype('int8')
-            conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
-            quantized_range = 127.0
-            if no_bias:
-                conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
-            else:
-                conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
-                conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
-                conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
-                conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
-            qoutput, min_range, max_range = conv_exe_int8.forward()
-
-            if no_bias:
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            else:
-                # with adding bias, accuracy loss should not be greater than one
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True)
-    check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False)
+    def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, no_bias, qdtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_conv for native cpu since it is not supported yet')
+            return
+        elif qdtype == 'int8' and is_test_for_mkldnn():
+            print('skipped testing quantized_conv for mkldnn cpu int8 since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_conv for gpu uint8 since it is not supported yet')
+            return
+
+        # run fp32 conv
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride,
+                                    no_bias=no_bias, cudnn_off=False, name='conv2d')
+        arg_shapes, _, _ = conv2d.infer_shape(data=data_shape)
+        arg_names = conv2d.list_arguments()
+        conv_exe_fp32 = conv2d.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        conv_exe_fp32.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                       shape=data_shape).astype('int32')
+        conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                       shape=arg_shapes[1]).astype('int32')
+        if not no_bias:
+            conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                           shape=arg_shapes[2]).astype('int32')
+        output = conv_exe_fp32.forward()[0]
+
+        # run quantized conv
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        qweight = mx.sym.Variable(name='qweight', dtype='int8')
+        min_data = mx.sym.Variable(name='min_data')
+        max_data = mx.sym.Variable(name='max_data')
+        min_weight = mx.sym.Variable(name='min_weight')
+        max_weight = mx.sym.Variable(name='max_weight')
+        quantized_conv2d = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data,
+                                                         max_data=max_data, min_weight=min_weight,
+                                                         max_weight=max_weight, kernel=kernel,
+                                                         num_filter=num_filter, pad=pad, stride=stride,
+                                                         no_bias=no_bias)
+        qarg_names = quantized_conv2d.list_arguments()
+        type_dict = None
+        if not no_bias:
+            type_dict = {qarg_names[2]: 'int8'}
+        conv_exe_int8 = quantized_conv2d.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+        conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype(qdtype)
+        conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8')
+        quantized_range = 127.0
+        if no_bias:
+            conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range
+        else:
+            conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8')
+            conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range
+            conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range
+            conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range
+        qoutput, min_range, max_range = conv_exe_int8.forward()
+
+        if no_bias:
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        else:
+            # with adding bias, accuracy loss should not be greater than one
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
 
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), True, qdtype)
+        check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), False, qdtype)
 
 @with_seed()
 def test_quantized_pooling():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_pooling on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool):
-        with mx.Context('gpu', 0):
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
-                                          pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
-            arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
-            arg_names = pooling_fp32.list_arguments()
-            pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
-            pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                              shape=data_shape).astype('int32')
-            output = pooling_fp32_exe.forward()[0]
-
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            min_data = mx.sym.Variable(name='min_data')
-            max_data = mx.sym.Variable(name='max_data')
-            quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
-                                                                 max_data=max_data, kernel=kernel,
-                                                                 pad=pad, stride=stride, pool_type=pool_type,
-                                                                 global_pool=global_pool)
-            pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
-            qarg_names = quantized_pooling.list_arguments()
-            pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype('int8')
-            quantized_range = 127.0
-            pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
-            pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
-            qoutput, min_range, max_range = pooling_int8_exe.forward()
-
-            if pool_type == 'max':
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False)
-    check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True)
-    check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False)
-    check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True)
-
+    def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype):
+        if is_test_for_native_cpu():
+            print('skipped testing quantized_pooling for native cpu since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_pooling for gpu uint8 since it is not supported yet')
+            return
+
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride,
+                                      pool_type=pool_type, global_pool=global_pool, cudnn_off=False)
+        arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape)
+        arg_names = pooling_fp32.list_arguments()
+        pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                          shape=data_shape).astype('int32')
+        output = pooling_fp32_exe.forward()[0]
+
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        min_data = mx.sym.Variable(name='min_data')
+        max_data = mx.sym.Variable(name='max_data')
+        quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data,
+                                                             max_data=max_data, kernel=kernel,
+                                                             pad=pad, stride=stride, pool_type=pool_type,
+                                                             global_pool=global_pool)
+        pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null')
+        qarg_names = quantized_pooling.list_arguments()
+        pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
+        quantized_range = 127.0
+        pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range
+        pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range
+        qoutput, min_range, max_range = pooling_int8_exe.forward()
+
+        if pool_type == 'max':
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        elif pool_type == 'avg':  # for avg pooling, fp32 and int8 may be different due to rounding errors
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
+
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False, qdtype)
+        check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True, qdtype)
+        check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False, qdtype)
+        check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True, qdtype)
 
 @with_seed()
 def test_quantized_fc():
-    if mx.current_context().device_type != 'gpu':
-        print('skipped testing quantized_fc on cpu since it is not implemented yet')
-        return
-
-    def check_quantized_fc(data_shape, num_hidden, no_bias, flatten=True):
-        with mx.Context('gpu', 0):
-            data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
-            fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
-            arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
-            arg_names = fc_fp32.list_arguments()
-            fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
-            fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                         shape=data_shape).astype('int32')
-            fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                         shape=arg_shapes[1]).astype('int32')
-            if not no_bias:
-                fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
-                                                                             shape=arg_shapes[2]).astype('int32')
-            output = fc_fp32_exe.forward()[0]
-
-            qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype='int8')
-            fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
-                                                               no_bias=no_bias, flatten=flatten)
-            qarg_names = fc_int8.list_arguments()
-            type_dict = {qarg_names[1]: 'int8'}
-            if not no_bias:
-                type_dict.update({qarg_names[2]: 'int8'})
-            fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
-            fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype('int8')
-            fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
-            quantized_range = 127.0
-            if no_bias:
-                fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
-            else:
-                fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
-                fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
-                fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
-                fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
-            qoutput, min_range, max_range = fc_int8_exe.forward()
-
-            if no_bias:
-                assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-            else:
-                # with adding bias, accuracy loss should not be greater than one
-                diff = mx.nd.abs(output - qoutput.astype(output.dtype))
-                cond = mx.nd.lesser(2, diff).sum().asscalar()
-                assert cond == 0
-
-    check_quantized_fc((32, 512, 2, 2), 100, True)
-    check_quantized_fc((32, 111, 2, 2), 100, True)
-    check_quantized_fc((32, 512, 2, 2), 100, False)
-    check_quantized_fc((32, 111, 2, 2), 100, False)
+    def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
+        if mx.current_context().device_type != 'gpu':
+            print('skipped testing quantized_fc on cpu since it is not supported yet')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
+            print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
+            return
+
+        data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
+        fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten)
+        arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape)
+        arg_names = fc_fp32.list_arguments()
+        fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
+                                                                     shape=data_shape).astype('int32')
+        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                     shape=arg_shapes[1]).astype('int32')
+        if not no_bias:
+            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+                                                                         shape=arg_shapes[2]).astype('int32')
+        output = fc_fp32_exe.forward()[0]
+
+        qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype)
+        fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden,
+                                                           no_bias=no_bias, flatten=flatten)
+        qarg_names = fc_int8.list_arguments()
+        type_dict = {qarg_names[1]: 'int8'}
+        if not no_bias:
+            type_dict.update({qarg_names[2]: 'int8'})
+        fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+        fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype(qdtype)
+        fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8')
+        quantized_range = 127.0
+        if no_bias:
+            fc_int8_exe.arg_dict[qarg_names[2]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[3]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[4]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[5]][:] = quantized_range
+        else:
+            fc_int8_exe.arg_dict[qarg_names[2]][:] = fc_fp32_exe.arg_dict[arg_names[2]].astype('int8')
+            fc_int8_exe.arg_dict[qarg_names[3]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[4]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[5]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[6]][:] = quantized_range
+            fc_int8_exe.arg_dict[qarg_names[7]][:] = -quantized_range
+            fc_int8_exe.arg_dict[qarg_names[8]][:] = quantized_range
+        qoutput, min_range, max_range = fc_int8_exe.forward()
+
+        if no_bias:
+            assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+        else:
+            # with adding bias, accuracy loss should not be greater than one
+            diff = mx.nd.abs(output - qoutput.astype(output.dtype))
+            cond = mx.nd.lesser(2, diff).sum().asscalar()
+            assert cond == 0
 
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_fc((32, 512, 2, 2), 100, True, qdtype)
+        check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
+        check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
+        check_quantized_fc((32, 111, 2, 2), 100, False, qdtype)
 
 @with_seed()
 def test_quantized_flatten():
-    def check_quantized_flatten(shape):
-        qdata = mx.nd.random.uniform(low=-127, high=127, shape=shape).astype('int8')
+    def check_quantized_flatten(shape, qdtype):
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        qdata = mx.nd.random.uniform(low=data_low, high=data_high, shape=shape).astype(qdtype)
         min_data = mx.nd.array([-1023.343], dtype='float32')
         max_data = mx.nd.array([2343.324275], dtype='float32')
         qoutput, min_output, max_output = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data)
@@ -311,10 +355,11 @@ def check_quantized_flatten(shape):
         assert same(min_data.asnumpy(), min_output.asnumpy())
         assert same(max_data.asnumpy(), max_output.asnumpy())
 
-    check_quantized_flatten((10,))
-    check_quantized_flatten((10, 15))
-    check_quantized_flatten((10, 15, 18))
-    check_quantized_flatten((3, 4, 23, 23))
+    for qdtype in ['int8', 'uint8']:
+        check_quantized_flatten((10,), qdtype)
+        check_quantized_flatten((10, 15), qdtype)
+        check_quantized_flatten((10, 15, 18), qdtype)
+        check_quantized_flatten((3, 4, 23, 23), qdtype)
 
 
 @with_seed()
@@ -353,56 +398,69 @@ def get_fp32_sym():
 
 @with_seed()
 def test_quantize_model():
-    def check_params(params, qparams, qsym=None):
-        if qsym is None:
-            assert len(params) == len(qparams)
-            for k, v in params.items():
-                assert k in qparams
-                assert same(v.asnumpy(), qparams[k].asnumpy())
-        else:
-            qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
-            assert len(qparams) == len(qparams_ground_truth)
-            for k, v in qparams_ground_truth.items():
-                assert k in qparams
-                assert same(v.asnumpy(), qparams[k].asnumpy())
-
-    def check_qsym_calibrated(qsym):
-        attrs = qsym.attr_dict()
-        for k, v in attrs.items():
-            if k.find('requantize_') != -1:
-                assert 'min_calib_range' in v
-                assert 'max_calib_range' in v
-
-    sym = get_fp32_sym()
-    mod = Module(symbol=sym)
-    batch_size = 4
-    data_shape = (batch_size, 4, 10, 10)
-    label_shape = (batch_size, 10)
-    mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
-    mod.init_params()
-    arg_params, aux_params = mod.get_params()
-    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
-                                                                     arg_params=arg_params,
-                                                                     aux_params=aux_params,
-                                                                     ctx=mx.current_context(),
-                                                                     calib_mode='none')
-    check_params(arg_params, qarg_params, qsym)
-    check_params(aux_params, qaux_params)
-
-    calib_data = mx.nd.random.uniform(shape=data_shape)
-    calib_data = NDArrayIter(data=calib_data)
-    calib_data = DummyIter(calib_data)
-    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
-                                                                     arg_params=arg_params,
-                                                                     aux_params=aux_params,
-                                                                     ctx=mx.current_context(),
-                                                                     calib_mode='naive',
-                                                                     calib_data=calib_data,
-                                                                     num_calib_examples=20)
-    check_params(arg_params, qarg_params, qsym)
-    check_params(aux_params, qaux_params)
-    check_qsym_calibrated(qsym)
-
+    def check_quantize_model(qdtype):
+        def check_params(params, qparams, qsym=None):
+            if qsym is None:
+                assert len(params) == len(qparams)
+                for k, v in params.items():
+                    assert k in qparams
+                    assert same(v.asnumpy(), qparams[k].asnumpy())
+            else:
+                qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params)
+                assert len(qparams) == len(qparams_ground_truth)
+                for k, v in qparams_ground_truth.items():
+                    assert k in qparams
+                    assert same(v.asnumpy(), qparams[k].asnumpy())
+
+        def check_qsym_calibrated(qsym):
+            attrs = qsym.attr_dict()
+            for k, v in attrs.items():
+                if k.find('requantize_') != -1:
+                    assert 'min_calib_range' in v
+                    assert 'max_calib_range' in v
+
+        def check_qsym_qdtype(qsym, qdtype):
+            attrs = qsym.attr_dict()
+            for k, v in attrs.items():
+                if k.find('_quantize') != -1:
+                    assert 'out_type' in v
+                    assert v['out_type'] == qdtype
+
+        sym = get_fp32_sym()
+        mod = Module(symbol=sym)
+        batch_size = 4
+        data_shape = (batch_size, 4, 10, 10)
+        label_shape = (batch_size, 10)
+        mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)])
+        mod.init_params()
+        arg_params, aux_params = mod.get_params()
+        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                         arg_params=arg_params,
+                                                                         aux_params=aux_params,
+                                                                         ctx=mx.current_context(),
+                                                                         quantized_dtype=qdtype,
+                                                                         calib_mode='none')
+        check_params(arg_params, qarg_params, qsym)
+        check_params(aux_params, qaux_params)
+
+        calib_data = mx.nd.random.uniform(shape=data_shape)
+        calib_data = NDArrayIter(data=calib_data)
+        calib_data = DummyIter(calib_data)
+        qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym,
+                                                                         arg_params=arg_params,
+                                                                         aux_params=aux_params,
+                                                                         ctx=mx.current_context(),
+                                                                         quantized_dtype=qdtype,
+                                                                         calib_mode='naive',
+                                                                         calib_data=calib_data,
+                                                                         num_calib_examples=20)
+        check_params(arg_params, qarg_params, qsym)
+        check_params(aux_params, qaux_params)
+        check_qsym_calibrated(qsym)
+        check_qsym_qdtype(qsym, qdtype)
+
+    for qdtype in ['int8', 'uint8']:
+        check_quantize_model(qdtype)
 
 @with_seed()
 def test_quantize_sym_with_calib():
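
Note: the refactored test above now parameterizes quantization over the output dtype. A minimal sketch of the call it exercises, using only arguments that appear in this diff (quantize_to is a hypothetical wrapper for illustration):

    import mxnet as mx

    def quantize_to(sym, arg_params, aux_params, qdtype):
        # qdtype is 'int8' or 'uint8'; calib_mode='none' skips calibration entirely
        return mx.contrib.quant.quantize_model(
            sym=sym, arg_params=arg_params, aux_params=aux_params,
            ctx=mx.current_context(), quantized_dtype=qdtype, calib_mode='none')

The companion check_qsym_qdtype assertion then confirms that every _quantize node in the returned symbol carries out_type == qdtype.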
diff --git a/tests/python/train/test_resnet_aug.py b/tests/python/train/test_resnet_aug.py
new file mode 100644
index 00000000000..62c531bb637
--- /dev/null
+++ b/tests/python/train/test_resnet_aug.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+import sys
+sys.path.insert(0, '../../python')
+import mxnet as mx
+import numpy as np
+import os, pickle, gzip
+import logging
+from mxnet.test_utils import get_cifar10
+
+batch_size = 128
+
+# small mlp network
+def get_net():
+    data = mx.symbol.Variable('data')
+    float_data = mx.symbol.Cast(data=data, dtype="float32")
+    fc1 = mx.symbol.FullyConnected(float_data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu")
+    fc2 = mx.symbol.FullyConnected(act1, name='fc2', num_hidden=64)
+    act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu")
+    fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10)
+    softmax = mx.symbol.SoftmaxOutput(fc3, name="softmax")
+    return softmax
+
+# check data
+get_cifar10()
+
+def get_iterator(kv):
+    data_shape = (3, 28, 28)
+
+    train = mx.io.ImageRecordIter(
+        path_imgrec = "data/cifar/train.rec",
+        mean_img    = "data/cifar/mean.bin",
+        data_shape  = data_shape,
+        batch_size  = batch_size,
+        random_resized_crop = True,
+        min_aspect_ratio = 0.75,
+        max_aspect_ratio = 1.33,
+        min_random_area = 0.08,
+        max_random_area = 1,
+        brightness  = 0.4,
+        contrast    = 0.4,
+        saturation  = 0.4,
+        pca_noise   = 0.1,
+        rand_mirror = True,
+        num_parts   = kv.num_workers,
+        part_index  = kv.rank)
+    train = mx.io.PrefetchingIter(train)
+
+    val = mx.io.ImageRecordIter(
+        path_imgrec = "data/cifar/test.rec",
+        mean_img    = "data/cifar/mean.bin",
+        rand_crop   = False,
+        rand_mirror = False,
+        data_shape  = data_shape,
+        batch_size  = batch_size,
+        num_parts   = kv.num_workers,
+        part_index  = kv.rank)
+
+    return (train, val)
+
+num_epoch = 1
+
+def run_cifar10(train, val, use_module):
+    train.reset()
+    val.reset()
+    devs = [mx.cpu(0)]
+    net = get_net()
+    mod = mx.mod.Module(net, context=devs)
+    optim_args = {'learning_rate': 0.001, 'wd': 0.00001, 'momentum': 0.9}
+    eval_metrics = ['accuracy']
+    if use_module:
+        executor = mx.mod.Module(net, context=devs)
+        executor.fit(
+            train,
+            eval_data=val,
+            optimizer_params=optim_args,
+            eval_metric=eval_metrics,
+            num_epoch=num_epoch,
+            arg_params=None,
+            aux_params=None,
+            begin_epoch=0,
+            batch_end_callback=mx.callback.Speedometer(batch_size, 50),
+            epoch_end_callback=None)
+    else:
+        executor = mx.model.FeedForward.create(
+            net,
+            train,
+            ctx=devs,
+            eval_data=val,
+            eval_metric=eval_metrics,
+            num_epoch=num_epoch,
+            arg_params=None,
+            aux_params=None,
+            begin_epoch=0,
+            batch_end_callback=mx.callback.Speedometer(batch_size, 50),
+            epoch_end_callback=None,
+            **optim_args)
+
+    ret = executor.score(val, eval_metrics)
+    if use_module:
+        ret = list(ret)
+        logging.info('final accuracy = %f', ret[0][1])
+        assert (ret[0][1] > 0.08)
+    else:
+        logging.info('final accuracy = %f', ret[0])
+        assert (ret[0] > 0.08)
+
+class CustomDataIter(mx.io.DataIter):
+    def __init__(self, data):
+        super(CustomDataIter, self).__init__()
+        self.data = data
+        self.batch_size = data.provide_data[0][1][0]
+
+        # use legacy tuple
+        self.provide_data = [(n, s) for n, s in data.provide_data]
+        self.provide_label = [(n, s) for n, s in data.provide_label]
+
+    def reset(self):
+        self.data.reset()
+
+    def next(self):
+        return self.data.next()
+
+    def iter_next(self):
+        return self.data.iter_next()
+
+    def getdata(self):
+        return self.data.getdata()
+
+    def getlabel(self):
+        return self.data.getlabel()
+
+    def getindex(self):
+        return self.data.getindex()
+
+    def getpad(self):
+        return self.data.getpad()
+
+def test_cifar10():
+    # print logging by default
+    logging.basicConfig(level=logging.DEBUG)
+    console = logging.StreamHandler()
+    console.setLevel(logging.DEBUG)
+    logging.getLogger('').addHandler(console)
+    kv = mx.kvstore.create("local")
+    # test float32 input
+    (train, val) = get_iterator(kv)
+    run_cifar10(train, val, use_module=False)
+    run_cifar10(train, val, use_module=True)
+
+    # test legacy tuple in provide_data and provide_label
+    run_cifar10(CustomDataIter(train), CustomDataIter(val), use_module=False)
+    run_cifar10(CustomDataIter(train), CustomDataIter(val), use_module=True)
+
+if __name__ == "__main__":
+    test_cifar10()
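
Note: CustomDataIter above exists solely to exercise the legacy (name, shape) tuple form of provide_data/provide_label. The `for n, s in data.provide_data` unpacking works because, assuming the usual mxnet.io definition, DataDesc subclasses a two-field namedtuple:

    import mxnet as mx

    desc = mx.io.DataDesc(name='data', shape=(128, 3, 28, 28))
    name, shape = desc           # unpacks via the (name, shape) namedtuple base
    legacy_form = (name, shape)  # the downgraded form the wrapper feeds to fit()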
diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py
index 635bdcc6092..b38c851984d 100644
--- a/tests/python/unittest/common.py
+++ b/tests/python/unittest/common.py
@@ -241,3 +241,11 @@ def __enter__(self):
 
         def __exit__(self, exc_type, exc_value, traceback):
             shutil.rmtree(self._dirname)
+
+def teardown():
+    """
+    A function with a 'magic name' executed automatically after each nosetests test module.
+
+    It waits for all operations in one test module to finish before moving on to the next.
+    """
+    mx.nd.waitall()
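
Note: because MXNet operations execute asynchronously on the engine, a test module can return before its pending work has finished; the module-level teardown above therefore acts as a barrier between nose test modules. A minimal sketch of the primitive involved:

    import mxnet as mx

    a = mx.nd.ones((1000, 1000))
    b = mx.nd.dot(a, a)   # enqueued on the engine; the call returns immediately
    mx.nd.waitall()       # blocks until every pending operation has completed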
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index c2d0d26c9df..2f889845af3 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -20,7 +20,7 @@
 from mxnet.ndarray import zeros_like
 from mxnet.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def grad_and_loss(func, argnum=None):
diff --git a/tests/python/unittest/test_contrib_autograd.py b/tests/python/unittest/test_contrib_autograd.py
index 9e80bba306d..1c878e322e7 100644
--- a/tests/python/unittest/test_contrib_autograd.py
+++ b/tests/python/unittest/test_contrib_autograd.py
@@ -18,7 +18,7 @@
 import mxnet.ndarray as nd
 from mxnet.contrib.autograd import *
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 def autograd_assert(*args, **kwargs):
     func   = kwargs["func"]
diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py
index bbfed947852..e9e161d7f3b 100644
--- a/tests/python/unittest/test_exc_handling.py
+++ b/tests/python/unittest/test_exc_handling.py
@@ -18,7 +18,7 @@
 import mxnet as mx
 import numpy as np
 from mxnet import gluon
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon import nn
 from mxnet.base import MXNetError
 from mxnet.test_utils import assert_exception, default_context, set_default_context
diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py
index 05e71b426eb..630cad87496 100644
--- a/tests/python/unittest/test_executor.py
+++ b/tests/python/unittest/test_executor.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 import mxnet as mx
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def reldiff(a, b):
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index ced3063448b..e540657ed8f 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -20,9 +20,8 @@
 from mxnet.gluon import nn
 from mxnet.test_utils import assert_almost_equal
 from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 import numpy as np
-from numpy.testing import assert_array_equal
 from nose.tools import raises, assert_raises
 from copy import deepcopy
 import warnings
@@ -91,15 +90,16 @@ def test_parameter_invalid_access():
 
 @with_seed()
 def test_paramdict():
+    ctx = mx.cpu(1)
     params0 = gluon.ParameterDict('net_')
     params0.get('w0', shape=(10, 10))
     params0.get('w1', shape=(10, 10), stype='row_sparse')
-    all_row_ids = mx.nd.arange(0, 10, ctx=mx.cpu())
+    all_row_ids = mx.nd.arange(0, 10, ctx=ctx)
     # check param names
     assert list(params0.keys()) == ['net_w0', 'net_w1']
-    params0.initialize(ctx=mx.cpu())
+    params0.initialize(ctx=ctx)
     trainer0 = mx.gluon.Trainer(params0, 'sgd')
-    prev_w0 = params0.get('w0').data(mx.cpu())
+    prev_w0 = params0.get('w0').data(ctx)
     prev_w1 = params0.get('w1').row_sparse_data(all_row_ids)
     # save params
     params0.save('test_paramdict.params')
@@ -108,11 +108,11 @@ def test_paramdict():
     params1 = gluon.ParameterDict('net_')
     params1.get('w0', shape=(10, 10))
     params1.get('w1', shape=(10, 10), stype='row_sparse')
-    params1.load('test_paramdict.params', mx.cpu())
+    params1.load('test_paramdict.params', ctx)
     trainer1 = mx.gluon.Trainer(params1, 'sgd')
 
     # compare the values before and after save/load
-    cur_w0 = params1.get('w0').data(mx.cpu())
+    cur_w0 = params1.get('w0').data(ctx)
     cur_w1 = params1.get('w1').row_sparse_data(all_row_ids)
     mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy())
     mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy())
@@ -122,11 +122,11 @@ def test_paramdict():
     params2 = gluon.ParameterDict('net_')
     params2.get('w0', shape=(10, 10))
     params2.get('w1', shape=(10, 10))
-    params2.load('test_paramdict.params', mx.cpu())
+    params2.load('test_paramdict.params', ctx)
 
     # compare the values before and after save/load
-    cur_w0 = params2.get('w0').data(mx.cpu())
-    cur_w1 = params2.get('w1').data(mx.cpu())
+    cur_w0 = params2.get('w0').data(ctx)
+    cur_w1 = params2.get('w1').data(ctx)
     mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy())
     mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy())
 
@@ -202,20 +202,20 @@ def forward(self, x):
     net1.collect_params().initialize()
     net2(mx.nd.zeros((3, 5)))
 
-    net1.save_params('net1.params')
+    net1.save_parameters('net1.params')
 
     net3 = Net(prefix='net3_')
-    net3.load_params('net1.params', mx.cpu())
+    net3.load_parameters('net1.params', mx.cpu())
 
     net4 = Net(prefix='net4_')
     net5 = Net(prefix='net5_', in_units=5, params=net4.collect_params())
     net4.collect_params().initialize()
     net5(mx.nd.zeros((3, 5)))
 
-    net4.save_params('net4.params')
+    net4.save_parameters('net4.params')
 
     net6 = Net(prefix='net6_')
-    net6.load_params('net4.params', mx.cpu())
+    net6.load_parameters('net4.params', mx.cpu())
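
The save_params/load_params to save_parameters/load_parameters renames here
and below track the newer Gluon checkpoint API: save_parameters writes only
parameter values, keyed by the block's own structure rather than by prefixed
global names, which is why nets with different prefixes (net1_/net3_,
net4_/net6_) can exchange files. A minimal round trip, where `net` and
`net_copy` stand in for any two identically structured, initialized blocks:

    net.save_parameters('net.params')                     # values only, no symbol
    net_copy.load_parameters('net.params', ctx=mx.cpu())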
 
 
 @with_seed()
@@ -359,6 +359,7 @@ def test_sparse_hybrid_block():
 
 @with_seed()
 def check_layer_forward(layer, dshape):
+    print("checking layer {}\nshape: {}.".format(layer, dshape))
     layer.collect_params().initialize()
     x = mx.nd.ones(shape=dshape)
     x.attach_grad()
@@ -438,7 +439,7 @@ def test_deconv():
         nn.Conv2DTranspose(16, (3, 4), groups=2, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), strides=4, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), dilation=4, in_channels=4),
-        nn.Conv2DTranspose(16, (3, 4), padding=4, in_channels=4),
+    #   nn.Conv2DTranspose(16, (3, 4), padding=4, in_channels=4),
         nn.Conv2DTranspose(16, (3, 4), strides=4, output_padding=3, in_channels=4),
         ]
     for layer in layers2d:
@@ -470,6 +471,7 @@ def test_pool():
         nn.MaxPool1D(3),
         nn.MaxPool1D(3, 2),
         nn.AvgPool1D(),
+        nn.AvgPool1D(count_include_pad=False),
         nn.GlobalAvgPool1D(),
         ]
     for layer in layers1d:
@@ -481,6 +483,7 @@ def test_pool():
         nn.MaxPool2D((3, 3)),
         nn.MaxPool2D(3, 2),
         nn.AvgPool2D(),
+        nn.AvgPool2D(count_include_pad=False),
         nn.GlobalAvgPool2D(),
         ]
     for layer in layers2d:
@@ -491,6 +494,7 @@ def test_pool():
         nn.MaxPool3D((3, 3, 3)),
         nn.MaxPool3D(3, 2),
         nn.AvgPool3D(),
+        nn.AvgPool3D(count_include_pad=False),
         nn.GlobalAvgPool3D(),
         ]
     for layer in layers3d:
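
count_include_pad=False changes only the averaging denominator: windows that
overlap the implicit zero padding divide by the number of in-bounds cells
rather than by the full window size, so border outputs are no longer pulled
toward zero. The added layers exercise that code path in 1D, 2D and 3D.
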
@@ -728,19 +732,23 @@ def test_sequential_warning():
 
 @with_seed()
 def test_global_norm_clip():
-    x1 = mx.nd.ones((3,3))
-    x2 = mx.nd.ones((4,4))
-    norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
-    assert norm == 5.0
-    assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
-    assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
-
-    x3 = mx.nd.array([1.0, 2.0, float('nan')])
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter("always")
-        gluon.utils.clip_global_norm([x1, x3], 2.0)
-        assert len(w) == 1
-
+    stypes = ['default', 'row_sparse']
+    def check_global_norm_clip(stype):
+        x1 = mx.nd.ones((3,3)).tostype(stype)
+        x2 = mx.nd.ones((4,4)).tostype(stype)
+        norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
+        assert norm == 5.0
+        assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
+        assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
+
+        x3 = mx.nd.array([1.0, 2.0, float('nan')]).tostype(stype)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            gluon.utils.clip_global_norm([x1, x3], 2.0)
+            assert len(w) == 1
+
+    for stype in stypes:
+        check_global_norm_clip(stype)
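
The expected norm of 5.0 falls out of the definition: clip_global_norm treats
the whole group as one flattened vector, so the global norm is the square root
of the summed squared L2 norms, sqrt(3*3 + 4*4) = sqrt(25) = 5, and with
max_norm=1.0 every array is scaled by 1/5, matching the ones/5 assertions. A
quick sanity check in plain numpy:

    import numpy as np
    grads = [np.ones((3, 3)), np.ones((4, 4))]
    global_norm = np.sqrt(sum((a ** 2).sum() for a in grads))  # sqrt(9 + 16) = 5.0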
 
 @with_seed()
 def test_embedding():
@@ -776,7 +784,7 @@ def test_export():
     model = gluon.model_zoo.vision.resnet18_v1(
         prefix='resnet', ctx=ctx, pretrained=True)
     model.hybridize()
-    data = mx.nd.random.normal(shape=(1, 3, 224, 224))
+    data = mx.nd.random.normal(shape=(1, 3, 32, 32))
     out = model(data)
 
     model.export('gluon')
@@ -794,6 +802,22 @@ def test_export():
 
     assert_almost_equal(out.asnumpy(), out2.asnumpy())
 
+@with_seed()
+def test_import():
+    ctx = mx.context.current_context()
+    net1 = gluon.model_zoo.vision.resnet18_v1(
+        prefix='resnet', ctx=ctx, pretrained=True)
+    net1.hybridize()
+    data = mx.nd.random.normal(shape=(1, 3, 32, 32))
+    out1 = net1(data)
+
+    net1.export('net1', epoch=1)
+
+    net2 = gluon.SymbolBlock.imports(
+        'net1-symbol.json', ['data'], 'net1-0001.params', ctx)
+    out2 = net2(data)
+
+    assert_almost_equal(out1.asnumpy(), out2.asnumpy())
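
The file names in test_import are fixed by the export convention:
HybridBlock.export(prefix, epoch) writes '<prefix>-symbol.json' and
'<prefix>-%04d.params' % epoch, so export('net1', epoch=1) produces exactly
the 'net1-symbol.json'/'net1-0001.params' pair that SymbolBlock.imports
loads back.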
 
 @with_seed()
 def test_hybrid_stale_cache():
@@ -910,7 +934,7 @@ def test_fill_shape_load():
     net1.hybridize()
     net1.initialize(ctx=ctx)
     net1(mx.nd.ones((2,3,5,7), ctx))
-    net1.save_params('net_fill.params')
+    net1.save_parameters('net_fill.params')
 
     net2 = nn.HybridSequential()
     with net2.name_scope():
@@ -919,7 +943,7 @@ def test_fill_shape_load():
                  nn.Dense(10))
     net2.hybridize()
     net2.initialize()
-    net2.load_params('net_fill.params', ctx)
+    net2.load_parameters('net_fill.params', ctx)
     assert net2[0].weight.shape[1] == 3, net2[0].weight.shape[1]
     assert net2[1].gamma.shape[0] == 64, net2[1].gamma.shape[0]
     assert net2[2].weight.shape[1] == 3072, net2[2].weight.shape[1]
@@ -1065,12 +1089,12 @@ def test_req():
 @with_seed()
 def test_save_load():
     net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True)
-    net.save_params('test_save_load.params')
+    net.save_parameters('test_save_load.params')
 
     net = mx.gluon.model_zoo.vision.get_resnet(1, 18)
     net.output = mx.gluon.nn.Dense(1000)
 
-    net.load_params('test_save_load.params')
+    net.load_parameters('test_save_load.params')
 
 @with_seed()
 def test_symbol_block_save_load():
@@ -1095,10 +1119,10 @@ def hybrid_forward(self, F, x):
     net1.initialize(mx.init.Normal())
     net1.hybridize()
     net1(mx.nd.random.normal(shape=(1, 3, 32, 32)))
-    net1.save_params('./test_symbol_block_save_load.params')
+    net1.save_parameters('./test_symbol_block_save_load.params')
 
     net2 = Net()
-    net2.load_params('./test_symbol_block_save_load.params', ctx=mx.cpu())
+    net2.load_parameters('./test_symbol_block_save_load.params', ctx=mx.cpu())
 
 
 @with_seed()
@@ -1108,6 +1132,7 @@ def test_hybrid_multi_context():
     net.hybridize()
     net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy()
 
+
 @with_seed()
 def test_zero_grad():
     data = mx.nd.random.uniform(shape=(3,3))
@@ -1120,60 +1145,6 @@ def test_zero_grad():
     grad = net.collect_params()['test_zero_grad_weight'].grad()
     assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0)
 
-def check_hybrid_static_memory(**kwargs):
-    x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
-    x.attach_grad()
-
-    net1 = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
-    net2 = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
-    net2.hybridize(**kwargs)
-    net1(x)
-    net2(x)
-
-    def test(net, x):
-        with mx.autograd.record():
-            y = net(x) + net(x)
-            y.backward()
-
-        grads = {k: v.grad() for k, v in net.collect_params().items() if v.grad_req != 'null'}
-
-        return y, grads
-
-    y1, grads1 = test(net1, x)
-    y2, grads2 = test(net2, x)
-
-    assert_almost_equal(y1.asnumpy(), y2.asnumpy(), rtol=1e-3, atol=1e-5)
-    for key in grads1:
-        assert_almost_equal(grads1[key].asnumpy(), grads2[key].asnumpy(), rtol=1e-3, atol=1e-5)
-
-def test_hybrid_static_memory():
-    check_hybrid_static_memory()
-    check_hybrid_static_memory(static_alloc=True)
-    check_hybrid_static_memory(static_alloc=True, static_shape=True)
-
-def check_hybrid_static_memory_switching(**kwargs):
-    net = gluon.model_zoo.vision.get_resnet(
-        1, 18, pretrained=True, ctx=mx.context.current_context())
-    net.hybridize(**kwargs)
-
-    x = mx.nd.random.uniform(shape=(4, 3, 32, 32))
-    net(x)
-    with mx.autograd.record():
-        y = net(x)
-        y.backward()
-    x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
-    net(x)
-    with mx.autograd.record():
-        y = net(x)
-        y.backward()
-    mx.nd.waitall()
-
-def test_hybrid_static_memory_switching():
-    check_hybrid_static_memory_switching()
-    check_hybrid_static_memory_switching(static_alloc=True)
-    check_hybrid_static_memory_switching(static_alloc=True, static_shape=True)
 
 @with_seed()
 def test_hook():
@@ -1252,6 +1223,22 @@ def test_summary():
     assert_raises(AssertionError, net.summary, mx.nd.ones((32, 3, 224, 224)))
 
 
+@with_seed()
+def test_legacy_save_params():
+    net = gluon.nn.HybridSequential(prefix='')
+    with net.name_scope():
+        net.add(gluon.nn.Conv2D(10, (3, 3)))
+        net.add(gluon.nn.Dense(50))
+    net.initialize()
+    net(mx.nd.ones((1,1,50,50)))
+    a = net(mx.sym.var('data'))
+    a.save('test.json')
+    net.save_params('test.params')
+    model = gluon.nn.SymbolBlock(outputs=mx.sym.load_json(open('test.json', 'r').read()),
+                                     inputs=mx.sym.var('data'))
+    model.load_params('test.params', ctx=mx.cpu())
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_gluon_contrib.py b/tests/python/unittest/test_gluon_contrib.py
index 264ff1f5e53..a1cd8ea537d 100644
--- a/tests/python/unittest/test_gluon_contrib.py
+++ b/tests/python/unittest/test_gluon_contrib.py
@@ -21,7 +21,7 @@
 from mxnet.gluon import nn
 from mxnet.gluon.contrib.nn import Concurrent, HybridConcurrent, Identity, SparseEmbedding
 from mxnet.test_utils import almost_equal
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import numpy as np
 from numpy.testing import assert_allclose
 
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index 751886b8e7f..ef2ba2ab9b2 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -23,7 +23,7 @@
 import random
 from mxnet import gluon
 import platform
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 from mxnet.gluon.data import DataLoader
 import mxnet.ndarray as nd
 from mxnet import context
diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py
index fe360ac9708..2ff9c5cb2a1 100644
--- a/tests/python/unittest/test_gluon_data_vision.py
+++ b/tests/python/unittest/test_gluon_data_vision.py
@@ -22,7 +22,7 @@
 from mxnet.gluon.data.vision import transforms
 from mxnet.test_utils import assert_almost_equal
 from mxnet.test_utils import almost_equal
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 @with_seed()
@@ -66,18 +66,19 @@ def test_transformer():
     from mxnet.gluon.data.vision import transforms
 
     transform = transforms.Compose([
-		transforms.Resize(300),
-		transforms.CenterCrop(256),
-		transforms.RandomResizedCrop(224),
-		transforms.RandomFlipLeftRight(),
-		transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1),
-		transforms.RandomBrightness(0.1),
-		transforms.RandomContrast(0.1),
-		transforms.RandomSaturation(0.1),
-		transforms.RandomHue(0.1),
-		transforms.RandomLighting(0.1),
-		transforms.ToTensor(),
-		transforms.Normalize([0, 0, 0], [1, 1, 1])])
+        transforms.Resize(300),
+        transforms.Resize(300, keep_ratio=True),
+        transforms.CenterCrop(256),
+        transforms.RandomResizedCrop(224),
+        transforms.RandomFlipLeftRight(),
+        transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1),
+        transforms.RandomBrightness(0.1),
+        transforms.RandomContrast(0.1),
+        transforms.RandomSaturation(0.1),
+        transforms.RandomHue(0.1),
+        transforms.RandomLighting(0.1),
+        transforms.ToTensor(),
+        transforms.Normalize([0, 0, 0], [1, 1, 1])])
 
     transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read()
 
diff --git a/tests/python/unittest/test_gluon_model_zoo.py b/tests/python/unittest/test_gluon_model_zoo.py
index f89a8f70182..a64668451a2 100644
--- a/tests/python/unittest/test_gluon_model_zoo.py
+++ b/tests/python/unittest/test_gluon_model_zoo.py
@@ -19,7 +19,7 @@
 import mxnet as mx
 from mxnet.gluon.model_zoo.vision import get_model
 import sys
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 
 def eprint(*args, **kwargs):
diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py
index c2e11ebb18e..1c59ceaa093 100644
--- a/tests/python/unittest/test_gluon_trainer.py
+++ b/tests/python/unittest/test_gluon_trainer.py
@@ -177,24 +177,30 @@ def test_trainer_save_load():
 
 @with_seed()
 def test_trainer_reset_kv():
-    params = gluon.ParameterDict()
-    x = params.get('x', shape=(10,), lr_mult=1.0)
-    params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
-    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1})
-    params.save('test_trainer_reset_kv.params')
-    with mx.autograd.record():
-        for w in x.list_data():
-            y = w + 1
-            y.backward()
-    trainer.step(1)
-    # load would reset kvstore
-    params.load('test_trainer_reset_kv.params')
-    assert trainer._kvstore is None
-    assert trainer._kv_initialized is False
-    with mx.autograd.record():
-        for w in x.list_data():
-            y = w + 1
-            y.backward()
-    trainer.step(1)
-    # the updated parameter should be based on the loaded checkpoint
-    assert (x.data(mx.cpu()) == -0.2).asnumpy().all()
+    def check_trainer_reset_kv(kv):
+        params = gluon.ParameterDict()
+        x = params.get('x', shape=(10,), lr_mult=1.0)
+        params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+        trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
+        params.save('test_trainer_reset_kv.params')
+        with mx.autograd.record():
+            for w in x.list_data():
+                y = w + 1
+                y.backward()
+        trainer.step(1)
+        assert trainer._kvstore.type == kv
+        # load would reset kvstore
+        params.load('test_trainer_reset_kv.params')
+        assert trainer._kvstore is None
+        assert trainer._kv_initialized is False
+        with mx.autograd.record():
+            for w in x.list_data():
+                y = w + 1
+                y.backward()
+        trainer.step(1)
+        # the updated parameter should be based on the loaded checkpoint
+        assert (x.data(mx.cpu()) == -0.2).asnumpy().all()
+
+    kvs = ['local', 'device']
+    for kv in kvs:
+        check_trainer_reset_kv(kv)
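
The -0.2 assertion encodes the expected arithmetic: x starts at zeros on two
CPU contexts, each context backpropagates a gradient of ones through
y = w + 1, and trainer.step(1) applies w -= lr * grad with the per-device
gradients summed by the kvstore, i.e. 0 - 0.1 * 2 = -0.2. Reloading the
checkpoint restores the saved zeros and resets the kvstore, so the second
step must land on -0.2 again for both kvstore types.
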
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 7e6ef1af5ab..c758208acde 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -20,7 +20,8 @@
 from mxnet.test_utils import *
 from mxnet.base import MXNetError
 import numpy as np
-import os, gzip
+import os
+import gzip
 import pickle as pickle
 import time
 try:
@@ -31,16 +32,17 @@
 from common import assertRaises
 import unittest
 
+
 def test_MNISTIter():
     # prepare data
     get_mnist_ubyte()
 
     batch_size = 100
     train_dataiter = mx.io.MNISTIter(
-            image="data/train-images-idx3-ubyte",
-            label="data/train-labels-idx1-ubyte",
-            data_shape=(784,),
-            batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+        image="data/train-images-idx3-ubyte",
+        label="data/train-labels-idx1-ubyte",
+        data_shape=(784,),
+        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
     # test_loop
     nbatch = 60000 / batch_size
     batch_count = 0
@@ -60,18 +62,19 @@ def test_MNISTIter():
     label_1 = train_dataiter.getlabel().asnumpy().flatten()
     assert(sum(label_0 - label_1) == 0)
 
+
 def test_Cifar10Rec():
     get_cifar10()
     dataiter = mx.io.ImageRecordIter(
-            path_imgrec="data/cifar/train.rec",
-            mean_img="data/cifar/cifar10_mean.bin",
-            rand_crop=False,
-            and_mirror=False,
-            shuffle=False,
-            data_shape=(3,28,28),
-            batch_size=100,
-            preprocess_threads=4,
-            prefetch_buffer=1)
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        rand_crop=False,
+        rand_mirror=False,
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=100,
+        preprocess_threads=4,
+        prefetch_buffer=1)
     labelcount = [0 for i in range(10)]
     batchcount = 0
     for batch in dataiter:
@@ -84,23 +87,26 @@ def test_Cifar10Rec():
     for i in range(10):
         assert(labelcount[i] == 5000)
 
+
 def test_NDArrayIter():
     data = np.ones([1000, 2, 2])
     label = np.ones([1000, 1])
     for i in range(1000):
         data[i] = i / 100
         label[i] = i / 100
-    dataiter = mx.io.NDArrayIter(data, label, 128, True, last_batch_handle='pad')
+    dataiter = mx.io.NDArrayIter(
+        data, label, 128, True, last_batch_handle='pad')
     batchidx = 0
     for batch in dataiter:
         batchidx += 1
     assert(batchidx == 8)
-    dataiter = mx.io.NDArrayIter(data, label, 128, False, last_batch_handle='pad')
+    dataiter = mx.io.NDArrayIter(
+        data, label, 128, False, last_batch_handle='pad')
     batchidx = 0
     labelcount = [0 for i in range(10)]
     for batch in dataiter:
         label = batch.label[0].asnumpy().flatten()
-        assert((batch.data[0].asnumpy()[:,0,0] == label).all())
+        assert((batch.data[0].asnumpy()[:, 0, 0] == label).all())
         for i in range(label.shape[0]):
             labelcount[int(label[i])] += 1
 
@@ -110,6 +116,7 @@ def test_NDArrayIter():
         else:
             assert(labelcount[i] == 100)
 
+
 def test_NDArrayIter_h5py():
     if not h5py:
         return
@@ -128,17 +135,19 @@ def test_NDArrayIter_h5py():
         f.create_dataset("data", data=data)
         f.create_dataset("label", data=label)
 
-        dataiter = mx.io.NDArrayIter(f["data"], f["label"], 128, True, last_batch_handle='pad')
+        dataiter = mx.io.NDArrayIter(
+            f["data"], f["label"], 128, True, last_batch_handle='pad')
         batchidx = 0
         for batch in dataiter:
             batchidx += 1
         assert(batchidx == 8)
 
-        dataiter = mx.io.NDArrayIter(f["data"], f["label"], 128, False, last_batch_handle='pad')
+        dataiter = mx.io.NDArrayIter(
+            f["data"], f["label"], 128, False, last_batch_handle='pad')
         labelcount = [0 for i in range(10)]
         for batch in dataiter:
             label = batch.label[0].asnumpy().flatten()
-            assert((batch.data[0].asnumpy()[:,0,0] == label).all())
+            assert((batch.data[0].asnumpy()[:, 0, 0] == label).all())
             for i in range(label.shape[0]):
                 labelcount[int(label[i])] += 1
 
@@ -153,6 +162,7 @@ def test_NDArrayIter_h5py():
         else:
             assert(labelcount[i] == 100)
 
+
 def test_NDArrayIter_csr():
     # creating toy data
     num_rows = rnd.randint(5, 15)
@@ -163,17 +173,19 @@ def test_NDArrayIter_csr():
     dns = csr.asnumpy()
 
     # CSRNDArray or scipy.sparse.csr_matrix with last_batch_handle not equal to 'discard' will throw NotImplementedError
-    assertRaises(NotImplementedError, mx.io.NDArrayIter, {'data': csr}, dns, batch_size)
+    assertRaises(NotImplementedError, mx.io.NDArrayIter,
+                 {'data': csr}, dns, batch_size)
     try:
         import scipy.sparse as spsp
         train_data = spsp.csr_matrix(dns)
-        assertRaises(NotImplementedError, mx.io.NDArrayIter, {'data': train_data}, dns, batch_size)
+        assertRaises(NotImplementedError, mx.io.NDArrayIter,
+                     {'data': train_data}, dns, batch_size)
     except ImportError:
         pass
 
     # CSRNDArray with shuffle
     csr_iter = iter(mx.io.NDArrayIter({'csr_data': csr, 'dns_data': dns}, dns, batch_size,
-                    shuffle=True, last_batch_handle='discard'))
+                                      shuffle=True, last_batch_handle='discard'))
     num_batch = 0
     for batch in csr_iter:
         num_batch += 1
@@ -181,7 +193,8 @@ def test_NDArrayIter_csr():
     assert(num_batch == num_rows // batch_size)
 
     # make iterators
-    csr_iter = iter(mx.io.NDArrayIter(csr, csr, batch_size, last_batch_handle='discard'))
+    csr_iter = iter(mx.io.NDArrayIter(
+        csr, csr, batch_size, last_batch_handle='discard'))
     begin = 0
     for batch in csr_iter:
         expected = np.zeros((batch_size, num_cols))
@@ -192,6 +205,7 @@ def test_NDArrayIter_csr():
         assert_almost_equal(batch.data[0].asnumpy(), expected)
         begin += batch_size
 
+
 def test_LibSVMIter():
 
     def check_libSVMIter_synthetic():
@@ -214,8 +228,8 @@ def check_libSVMIter_synthetic():
         data_train = mx.io.LibSVMIter(data_libsvm=data_path, label_libsvm=label_path,
                                       data_shape=(3, ), label_shape=(3, ), batch_size=3)
 
-        first = mx.nd.array([[ 0.5, 0., 1.2], [ 0., 0., 0.], [ 0.6, 2.4, 1.2]])
-        second = mx.nd.array([[ 0., 0., -1.2], [ 0.5, 0., 1.2], [ 0., 0., 0.]])
+        first = mx.nd.array([[0.5, 0., 1.2], [0., 0., 0.], [0.6, 2.4, 1.2]])
+        second = mx.nd.array([[0., 0., -1.2], [0.5, 0., 1.2], [0., 0., 0.]])
         i = 0
         for batch in iter(data_train):
             expected = first.asnumpy() if i == 0 else second.asnumpy()
@@ -286,10 +300,13 @@ def test_DataBatch():
     from nose.tools import ok_
     from mxnet.io import DataBatch
     import re
-    batch = DataBatch(data=[mx.nd.ones((2,3))])
-    ok_(re.match('DataBatch: data shapes: \[\(2L?, 3L?\)\] label shapes: None', str(batch)))
-    batch = DataBatch(data=[mx.nd.ones((2,3)), mx.nd.ones((7,8))], label=[mx.nd.ones((4,5))])
-    ok_(re.match('DataBatch: data shapes: \[\(2L?, 3L?\), \(7L?, 8L?\)\] label shapes: \[\(4L?, 5L?\)\]', str(batch)))
+    batch = DataBatch(data=[mx.nd.ones((2, 3))])
+    ok_(re.match(
+        'DataBatch: data shapes: \[\(2L?, 3L?\)\] label shapes: None', str(batch)))
+    batch = DataBatch(data=[mx.nd.ones((2, 3)), mx.nd.ones(
+        (7, 8))], label=[mx.nd.ones((4, 5))])
+    ok_(re.match(
+        'DataBatch: data shapes: \[\(2L?, 3L?\), \(7L?, 8L?\)\] label shapes: \[\(4L?, 5L?\)\]', str(batch)))
 
 
 def test_CSVIter():
@@ -307,7 +324,7 @@ def check_CSVIter_synthetic(dtype='float32'):
             for i in range(1000):
                 fout.write('0\n')
 
-        data_train = mx.io.CSVIter(data_csv=data_path, data_shape=(8,8),
+        data_train = mx.io.CSVIter(data_csv=data_path, data_shape=(8, 8),
                                    label_csv=label_path, batch_size=100, dtype=dtype)
         expected = mx.nd.ones((100, 8, 8), dtype=dtype) * int(entry_str)
         for batch in iter(data_train):
@@ -318,6 +335,93 @@ def check_CSVIter_synthetic(dtype='float32'):
     for dtype in ['int32', 'float32']:
         check_CSVIter_synthetic(dtype=dtype)
 
+
+def test_ImageRecordIter_seed_augmentation():
+    get_cifar10()
+    seed_aug = 3
+
+    # check whether to get constant images after fixing seed_aug
+    dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=3,
+        rand_crop=True,
+        rand_mirror=True,
+        max_random_scale=1.3,
+        max_random_illumination=3,
+        max_rotate_angle=10,
+        random_l=50,
+        random_s=40,
+        random_h=10,
+        max_shear_ratio=2,
+        seed_aug=seed_aug)
+    batch = dataiter.next()
+    data = batch.data[0].asnumpy().astype(np.uint8)
+
+    dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=3,
+        rand_crop=True,
+        rand_mirror=True,
+        max_random_scale=1.3,
+        max_random_illumination=3,
+        max_rotate_angle=10,
+        random_l=50,
+        random_s=40,
+        random_h=10,
+        max_shear_ratio=2,
+        seed_aug=seed_aug)
+    batch = dataiter.next()
+    data2 = batch.data[0].asnumpy().astype(np.uint8)
+    assert(np.array_equal(data, data2))
+
+    # check whether to get different images after change seed_aug
+    dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=3,
+        rand_crop=True,
+        rand_mirror=True,
+        max_random_scale=1.3,
+        max_random_illumination=3,
+        max_rotate_angle=10,
+        random_l=50,
+        random_s=40,
+        random_h=10,
+        max_shear_ratio=2,
+        seed_aug=seed_aug+1)
+    batch = dataiter.next()
+    data2 = batch.data[0].asnumpy().astype(np.uint8)
+    assert(not np.array_equal(data, data2))
+
+    # check whether seed_aug changes the iterator behavior
+    dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=3)
+    batch = dataiter.next()
+    data = batch.data[0].asnumpy().astype(np.uint8)
+
+    dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar/cifar10_mean.bin",
+        shuffle=False,
+        data_shape=(3, 28, 28),
+        batch_size=3,
+        seed_aug=seed_aug)
+    batch = dataiter.next()
+    data2 = batch.data[0].asnumpy().astype(np.uint8)
+    assert(np.array_equal(data, data2))
+
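
Taken together, the three blocks above pin down the intended seed_aug
contract: two iterators constructed with the same seed_aug must produce
bit-identical batches even with heavy random augmentation enabled; changing
the seed must change the output; and when no random augmentations are
requested, seed_aug is effectively a no-op, so the batch matches one from an
iterator created without it.
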
 if __name__ == "__main__":
     test_NDArrayIter()
     if h5py:
@@ -327,3 +431,4 @@ def check_CSVIter_synthetic(dtype='float32'):
     test_LibSVMIter()
     test_NDArrayIter_csr()
     test_CSVIter()
+    test_ImageRecordIter_seed_augmentation()
diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py
index 44d522ab9a5..0ab61bb2748 100644
--- a/tests/python/unittest/test_kvstore.py
+++ b/tests/python/unittest/test_kvstore.py
@@ -20,7 +20,7 @@
 import numpy as np
 import unittest
 from mxnet.test_utils import rand_ndarray, assert_almost_equal
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 from mxnet.base import py_str, MXNetError
 
 shape = (4, 4)
diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py
index 5a3237d2d64..14c4f6b8a16 100644
--- a/tests/python/unittest/test_loss.py
+++ b/tests/python/unittest/test_loss.py
@@ -19,7 +19,7 @@
 import numpy as np
 from mxnet import gluon
 from mxnet.test_utils import assert_almost_equal, default_context
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import unittest
 
 
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index ae950457444..802988b4329 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -21,7 +21,7 @@
 import numpy as np
 from functools import reduce
 from mxnet.module.executor_group import DataParallelExecutorGroup
-from common import setup_module, with_seed, assertRaises
+from common import setup_module, with_seed, assertRaises, teardown
 from collections import namedtuple
 
 
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 92cdb2ced9d..aeaa0b72679 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -21,7 +21,7 @@
 import pickle as pkl
 import unittest
 from nose.tools import raises
-from common import setup_module, with_seed, assertRaises, TemporaryDirectory
+from common import setup_module, with_seed, assertRaises, TemporaryDirectory, teardown
 from mxnet.test_utils import almost_equal
 from mxnet.test_utils import assert_almost_equal, assert_exception
 from mxnet.test_utils import default_context
@@ -711,9 +711,8 @@ def get_values(ensure_unique):
                  k=dat_size*dat_size*dat_size*dat_size, is_ascend=False)
     assert_almost_equal(nd_ret_argsort, gt)
 
-    # test topk with a big shape
-    a = mx.nd.arange(0, 54686454, step=1, repeat=1)
-    assert_almost_equal(a.topk(k=54686454).asnumpy(), a.asnumpy()[::-1])
+    a = mx.nd.arange(0, 1024, step=1, repeat=1)
+    assert_almost_equal(a.topk(k=1024).asnumpy(), a.asnumpy()[::-1])
 
     # Repeat those tests that don't involve indices.  These should pass even with
     # duplicated input data values (over many repeated runs with different random seeds,
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index ab03973e8e8..67426693436 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -25,7 +25,7 @@
 from numpy.testing import assert_allclose, assert_array_equal
 from mxnet.test_utils import *
 from mxnet.base import py_str, MXNetError
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import unittest
 
 def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req):
@@ -115,6 +115,8 @@ def test_gru_sym():
     check_rnn_consistency(fused, stack, T, N, I, H, 'add')
     check_rnn_consistency(fused, stack, T, N, I, H, 'null')
 
+
+@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11219")
 @with_seed()
 def test_gru_bidirectional():
     T, N, I, H = 5, 20, 800, 800
@@ -675,7 +677,9 @@ def fprelu_grad(x, y, gamma):
         copy_x = x.copy()
         copy_x[pos_indices] = 0.0
         grad_x[pos_indices] = 1.0
-        if gamma.shape[0] == 1:
+        if len(gamma.shape) > 1:
+            grad_gam = copy_x
+        elif gamma.shape[0] == 1:
             grad_gam = np.sum(np.sum(copy_x))
         elif gamma.shape[0] > 1:
             grad_gam = np.sum(copy_x, axis=0)
@@ -685,6 +689,7 @@ def fprelu_grad(x, y, gamma):
     gamma = mx.symbol.Variable("gamma")
     for dtype in [np.float16, np.float32, np.float64]:
         for gam in [np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype)]:
+            gam_full = np.array([gam, gam, gam])
             xa = np.random.uniform(low=-1.0,high=1.0,size=shape).astype(dtype)
             rtol = 1e-2
             atol = 1e-3
@@ -692,12 +697,18 @@ def fprelu_grad(x, y, gamma):
             xa[abs(xa) < eps] = 1.0
             y = mx.symbol.LeakyReLU(data=x, gamma=gamma, act_type='prelu')
             ya = fprelu(xa, gam)
+            ya_full = fprelu(xa, gam_full)
             g_xa, g_gam = fprelu_grad(xa, ya, gamma=gam)
+            g_xa_full, g_gam_full = fprelu_grad(xa, ya_full, gamma=gam_full)
             # Skip numeric check for float16 type to get rid of flaky behavior
             if dtype is not np.float16:
                 check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
+                check_numeric_gradient(y, [xa, gam_full], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
             check_symbolic_forward(y, [xa, gam], [ya], rtol=rtol, atol=atol, dtype=dtype)
             check_symbolic_backward(y, [xa, gam], [np.ones(shape), np.ones(gam.shape)], [g_xa, g_gam], rtol=rtol, atol=atol, dtype=dtype)
+            check_symbolic_forward(y, [xa, gam_full], [ya_full], rtol=rtol, atol=atol, dtype=dtype)
+            check_symbolic_backward(y, [xa, gam_full], [np.ones(shape), np.ones(gam_full.shape)],
+                                    [g_xa_full, g_gam_full], rtol=rtol, atol=atol, dtype=dtype)
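
The gam_full cases added above cover a gamma broadcast to the full input
shape: for y = x if x > 0 else gamma * x, the derivative dy/dgamma is x on
the negative entries and 0 elsewhere, so an elementwise gamma keeps an
elementwise gradient (grad_gam = copy_x), a scalar gamma collapses it to a
sum over everything, and a per-channel gamma to a sum over the batch axis,
which is exactly the three branches of fprelu_grad.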
 
 @with_seed()
 def test_sigmoid():
@@ -4087,7 +4098,7 @@ def test_new_softmax():
     for ndim in range(1, 5):
         for _ in range(5):
             shape = np.random.randint(1, 5, size=ndim)
-            axis = np.random.randint(0, ndim)
+            axis = np.random.randint(-ndim, ndim)
             data = np.random.uniform(-2, 2, size=shape)
             sym = mx.sym.softmax(axis=axis)
             check_symbolic_forward(sym, [data], [np_softmax(data, axis=axis)])
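
Sampling the axis from randint(-ndim, ndim) additionally exercises
negative-axis indexing, where axis=-1 addresses the last dimension. For
instance, using np_softmax from mxnet.test_utils (already star-imported in
this file), both spellings must agree on a 2-D input:

    import numpy as np
    from mxnet.test_utils import np_softmax

    x = np.random.uniform(-2, 2, size=(2, 3))
    assert np.allclose(np_softmax(x, axis=-1), np_softmax(x, axis=1))
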
@@ -5821,7 +5832,7 @@ def py_bilinear_resize(x, outputHeight, outputWidth):
         batch, channel, inputHeight, inputWidth = x.shape
         if outputHeight == inputHeight and outputWidth == inputWidth:
             return x
-        y = np.empty([batch, channel, outputHeight, outputWidth]) 
+        y = np.empty([batch, channel, outputHeight, outputWidth])
         rheight = 1.0 * (inputHeight - 1) / (outputHeight - 1) if outputHeight > 1 else 0.0
         rwidth = 1.0 * (inputWidth - 1) / (outputWidth - 1) if outputWidth > 1 else 0.0
         for h2 in range(outputHeight):
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 90762f7620f..fba10fb522a 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -23,7 +23,7 @@
 from nose.tools import raises
 import math
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 @with_seed()
 def test_learning_rate():
@@ -230,7 +230,10 @@ def test_sgd():
                                     ('multi_precision' not in kwarg or
                                         not kwarg['multi_precision'])):
                                 continue
-                            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+                            if dtype == np.float16:
+                                compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3)
+                            else:
+                                compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
                             # test operator fallback on cpu
                             if dtype != np.float16:
                                 compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2],
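
Relaxing rtol to 1e-3 for float16 in compare_optimizer reflects the format's
precision: fp16 carries an 11-bit significand, roughly 3 decimal digits, so
elementwise agreement tighter than about 1e-3 cannot be expected between two
implementations of the same update.
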
@@ -420,7 +423,7 @@ def update(self, index, weight, grad, state):
               grad += wd * weight
               mom[:] += grad
               grad[:] += self.momentum * mom
-              weight[:] += -lr * grad 
+              weight[:] += -lr * grad
         else:
             grad32 = array(grad, ctx=grad.context, dtype=np.float32)
             grad32 = grad32 * self.rescale_grad
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 7abbc9918c5..3251ba0fac8 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -22,7 +22,7 @@
 from mxnet.test_utils import verify_generator, gen_buckets_probs_with_ppf
 import numpy as np
 import random as rnd
-from common import setup_module, with_seed, random_seed
+from common import setup_module, with_seed, random_seed, teardown
 import scipy.stats as ss
 
 def same(a, b):
diff --git a/tests/python/unittest/test_recordio.py b/tests/python/unittest/test_recordio.py
index 51d80c33504..9edf9b459f7 100644
--- a/tests/python/unittest/test_recordio.py
+++ b/tests/python/unittest/test_recordio.py
@@ -22,7 +22,7 @@
 import tempfile
 import random
 import string
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 
 @with_seed()
 def test_recordio():
diff --git a/tests/python/unittest/test_rnn.py b/tests/python/unittest/test_rnn.py
index 52a3dcf9934..a5588250e51 100644
--- a/tests/python/unittest/test_rnn.py
+++ b/tests/python/unittest/test_rnn.py
@@ -300,7 +300,15 @@ def test_convgru():
     args, outs, auxs = outputs.infer_shape(rnn_t0_data=(1, 3, 16, 10), rnn_t1_data=(1, 3, 16, 10), rnn_t2_data=(1, 3, 16, 10))
     assert outs == [(1, 10, 16, 10), (1, 10, 16, 10), (1, 10, 16, 10)]
 
+def test_encode_sentences():
+    sentences = [['a', 'b', 'c'], ['b', 'c', 'd']]
+    vocab = {'a': 1, 'b': 2, 'c': 3}
+    result, vocab = mx.rnn.io.encode_sentences(sentences, vocab=vocab, invalid_label=-1,
+                                               invalid_key='\n', start_label=0,
+                                               unknown_token='UNK')
+    assert vocab == {'a': 1, 'b': 2, 'c': 3, 'UNK': 0}
+    assert result == [[1, 2, 3], [2, 3, 0]]
+
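
The expected [[1, 2, 3], [2, 3, 0]] follows from the vocabulary handling:
'a', 'b' and 'c' map to their given ids, while 'd' is out-of-vocabulary and
is replaced by the unknown_token 'UNK', which gets the first free id counting
up from start_label=0, hence the extra 'UNK': 0 entry in the returned vocab.
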
 if __name__ == '__main__':
     import nose
     nose.runmodule()
-
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index c90fb1317fe..b0c3a0cdcd2 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -19,7 +19,7 @@
 
 from mxnet.ndarray import NDArray
 from mxnet.test_utils import *
-from common import setup_module, with_seed, random_seed
+from common import setup_module, with_seed, random_seed, teardown
 from mxnet.base import mx_real_t
 from numpy.testing import assert_allclose
 import numpy.random as rnd
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index b2ff0fecb5a..62f5f3ecb07 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -16,7 +16,7 @@
 # under the License.
 
 from mxnet.test_utils import *
-from common import setup_module, with_seed
+from common import setup_module, with_seed, teardown
 import random
 import warnings
 
diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py
index 0f9103dcaa9..4741df59c8d 100644
--- a/tests/tutorials/test_tutorials.py
+++ b/tests/tutorials/test_tutorials.py
@@ -213,3 +213,6 @@ def test_vision_large_scale_classification():
 
 def test_vision_cnn_visualization():
     assert _test_tutorial_nb('vision/cnn_visualization')
+
+def test_python_profiler():
+    assert _test_tutorial_nb('python/profiler')

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services