Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/05/29 10:45:07 UTC

[GitHub] jinhuang415 closed pull request #10783: Fix 'make clean USE_MKLDNN=1' will build mkldnn issue

URL: https://github.com/apache/incubator-mxnet/pull/10783

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is supplied below for the sake of provenance:
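
For context, a minimal sketch of the issue this PR fixes (assuming a checkout
with submodules initialized; the pre-fix behavior came from prepare_mkldnn.sh):

    # Before: the Makefile ran prepare_mkldnn.sh via $(shell ...) whenever
    # USE_MKLDNN=1 was set, so even the clean target built MKLDNN first.
    make clean USE_MKLDNN=1    # unexpectedly compiled 3rdparty/mkldnn

    # After: MKLDNN is built lazily through mkldnn.mk, and clean now runs
    # the dedicated mkldnn_clean target instead.
    make clean USE_MKLDNN=1    # cleans without building anything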

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index e9446f5a53c..dadcd97fdce 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit e9446f5a53cf5e61273deff7ce814093d2791766
+Subproject commit dadcd97fdceb5f395e963b2a637f6ed377f59fc4
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index b4137dfc88e..0e7ca738866 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit b4137dfc88e3bf5c6b62e833121802eb8c6696da
+Subproject commit 0e7ca738866d22cc700aa33b8de120b938f910d0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d8021c367..246ae995e90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,8 +19,8 @@ mxnet_option(USE_SSE              "Build with x86 SSE instruction support" ON)
 mxnet_option(USE_F16C             "Build with x86 F16C instruction support" ON) # autodetects support if ON
 mxnet_option(USE_LAPACK           "Build with lapack support" ON IF NOT MSVC)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
-mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
-mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
+mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
+mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON IF NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
@@ -87,7 +87,6 @@ if(MSVC)
   add_definitions(-DNNVM_EXPORTS)
   add_definitions(-DDMLC_STRICT_CXX11)
   add_definitions(-DNOMINMAX)
-  set(SUPPORT_F16C FALSE)
   if(USE_F16C)
     message("F16C instruction set is not yet supported for MSVC")
   endif()
@@ -185,34 +184,21 @@ if(USE_VTUNE)
   list(APPEND mxnet_LINKER_LIBS dl)
 endif()
 
-if(USE_MKL_IF_AVAILABLE)
-  if(USE_MKLDNN)
-    add_subdirectory(3rdparty/mkldnn)
-    include_directories(3rdparty/mkldnn/include)
-    list(APPEND mxnet_LINKER_LIBS mkldnn)
-  endif()
-  find_package(MKL)
-
-  if(MKL_FOUND)
-    include_directories(${MKL_INCLUDE_DIR})
-    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
-
-    if(USE_MKLDNN)
-      add_definitions(-DMXNET_USE_MKLDNN=1)
-    endif()
-
-    add_definitions(-DUSE_MKL=1)
-    add_definitions(-DCUB_MKL=1)
-    list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
-
-    if(NOT MSVC)
-      list(APPEND mxnet_LINKER_LIBS dl)
-    endif()
-    # If using MKL, use the Intel OMP libraries
-    list(APPEND mxnet_LINKER_LIBS iomp5)
-  else()
-    message(STATUS " MKL not found")
+if(USE_MKLDNN)
+  include(cmake/MklDnn.cmake)
+  # Binaries tuned for one CPU architecture (e.g., C5) may not run on another (e.g., G3), so tune for generic.
+  if(NOT MSVC)
+    set(ARCH_OPT_FLAGS "-mtune=generic")
   endif()
+  set(WITH_TEST OFF)
+  set(WITH_EXAMPLE OFF)
+  add_subdirectory(3rdparty/mkldnn)
+  
+  include_directories(3rdparty/mkldnn/include)
+  add_definitions(-DUSE_MKL=1)
+  add_definitions(-DCUB_MKL=1)
+  add_definitions(-DMXNET_USE_MKLDNN=1)
+  list(APPEND mxnet_LINKER_LIBS mkldnn)
 endif()
 
 # Allow Cuda compiles outside of src tree to find things in 'src' and 'include'
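
With the change above, a hedged sketch of a CMake configure (flag names come
from the diff; MKLDNN is now added via add_subdirectory whenever USE_MKLDNN
is on, rather than only when find_package(MKL) succeeds):

    mkdir build && cd build
    # Builds 3rdparty/mkldnn with WITH_TEST/WITH_EXAMPLE off and -mtune=generic
    cmake -DUSE_MKLDNN=1 ..
    make -j$(nproc)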
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 5f5302a45a4..4bfafb60cba 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -170,4 +170,4 @@ List of Contributors
 * [Sina Afrooze](https://github.com/safrooze)
 * [Sergey Sokolov](https://github.com/Ishitori)
 * [Thomas Delteil](https://github.com/ThomasDelteil)
-
+* [Hang Zhang](http://hangzh.com)
diff --git a/Jenkinsfile b/Jenkinsfile
index 5601c52df1c..e45bea7f456 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,7 +26,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
@@ -92,24 +92,34 @@ echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
 """
 }
 
+def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
+  def command = "ci/build.py --download-docker-cache --docker-cache-bucket ${env.DOCKER_CACHE_BUCKET} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
+  command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
+  command = command.replaceAll('%PLATFORM%', platform)
+  command = command.replaceAll('%FUNCTION_NAME%', function_name)
+  command = command.replaceAll('%SHARED_MEM%', shared_mem)
+
+  sh command
+}
+
 // Python unittest for CPU
 // Python 2
 def python2_ut(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "ci/build.py --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python2_cpu"
+    docker_run(docker_container_name, 'unittest_ubuntu_python2_cpu', false)
   }
 }
 
 // Python 3
 def python3_ut(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "ci/build.py --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python3_cpu"
+    docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu', false)
   }
 }
 
 def python3_ut_mkldnn(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "ci/build.py --build --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python3_cpu_mkldnn"
+    docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false)
   }
 }
 
@@ -118,14 +128,14 @@ def python3_ut_mkldnn(docker_container_name) {
 // Python 2
 def python2_gpu_ut(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "ci/build.py --nvidiadocker --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python2_gpu"
+    docker_run(docker_container_name, 'unittest_ubuntu_python2_gpu', true)
   }
 }
 
 // Python 3
 def python3_gpu_ut(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "ci/build.py --nvidiadocker --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python3_gpu"
+    docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu', true)
   }
 }
 
@@ -134,7 +144,7 @@ try {
     node('mxnetlinux-cpu') {
       ws('workspace/sanity') {
         init_git()
-        sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh sanity_check"
+        docker_run('ubuntu_cpu', 'sanity_check', false)
       }
     }
   }
@@ -145,7 +155,7 @@ try {
         ws('workspace/build-centos7-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform centos7_cpu /work/runtime_functions.sh build_centos7_cpu"
+            docker_run('centos7_cpu', 'build_centos7_cpu', false)
             pack_lib('centos7_cpu')
           }
         }
@@ -156,7 +166,7 @@ try {
         ws('workspace/build-centos7-mkldnn') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform centos7_cpu /work/runtime_functions.sh build_centos7_mkldnn"
+            docker_run('centos7_cpu', 'build_centos7_mkldnn', false)
             pack_lib('centos7_mkldnn')
           }
         }
@@ -167,7 +177,7 @@ try {
         ws('workspace/build-centos7-gpu') {
           timeout(time: max_time, unit: 'MINUTES') { 
             init_git()
-            sh "ci/build.py --platform centos7_gpu /work/runtime_functions.sh build_centos7_gpu"
+            docker_run('centos7_gpu', 'build_centos7_gpu', false)
             pack_lib('centos7_gpu')
           }
         }
@@ -178,7 +188,7 @@ try {
         ws('workspace/build-cpu-openblas') {
           timeout(time: max_time, unit: 'MINUTES') { 
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_openblas"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false)
             pack_lib('cpu', mx_dist_lib)
           }
         }
@@ -189,7 +199,7 @@ try {
         ws('workspace/build-cpu-clang39') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_clang39"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false)
           }
         }
       }
@@ -199,7 +209,7 @@ try {
         ws('workspace/build-cpu-clang50') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_clang50"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50', false)
           }
         }
       }
@@ -209,7 +219,7 @@ try {
         ws('workspace/build-cpu-mkldnn-clang39') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_clang39_mkldnn"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false)
             pack_lib('mkldnn_cpu_clang3', mx_mkldnn_lib)
           }
         }
@@ -220,7 +230,7 @@ try {
         ws('workspace/build-cpu-mkldnn-clang50') {
           timeout(time: max_time, unit: 'MINUTES') { 
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_clang50_mkldnn"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50_mkldnn', false)
             pack_lib('mkldnn_cpu_clang5', mx_mkldnn_lib)
           }
         }
@@ -231,7 +241,7 @@ try {
         ws('workspace/build-mkldnn-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_mkldnn"
+            docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn', false)
             pack_lib('mkldnn_cpu', mx_mkldnn_lib)
           }
         }
@@ -242,7 +252,7 @@ try {
         ws('workspace/build-mkldnn-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_build_cuda /work/runtime_functions.sh build_ubuntu_gpu_mkldnn"
+            docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false)
             pack_lib('mkldnn_gpu', mx_mkldnn_lib)
           }  
         }
@@ -253,8 +263,16 @@ try {
         ws('workspace/build-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_build_cuda /work/runtime_functions.sh build_ubuntu_gpu_cuda91_cudnn7"
+            docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false)
             pack_lib('gpu', mx_dist_lib)
+            stash includes: 'build/cpp-package/example/lenet', name: 'cpp_lenet'
+            stash includes: 'build/cpp-package/example/alexnet', name: 'cpp_alexnet'
+            stash includes: 'build/cpp-package/example/googlenet', name: 'cpp_googlenet'
+            stash includes: 'build/cpp-package/example/lenet_with_mxdataiter', name: 'cpp_lenet_with_mxdataiter'
+            stash includes: 'build/cpp-package/example/resnet', name: 'cpp_resnet'
+            stash includes: 'build/cpp-package/example/mlp', name: 'cpp_mlp'
+            stash includes: 'build/cpp-package/example/mlp_cpu', name: 'cpp_mlp_cpu'
+            stash includes: 'build/cpp-package/example/mlp_gpu', name: 'cpp_mlp_gpu'
             stash includes: 'build/cpp-package/example/test_score', name: 'cpp_test_score'
             stash includes: 'build/cpp-package/example/test_optimizer', name: 'cpp_test_optimizer'
           }
@@ -266,7 +284,7 @@ try {
         ws('workspace/amalgamationmin') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_amalgamation_min"
+            docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation_min', false)
           }
         }
       }
@@ -276,7 +294,7 @@ try {
         ws('workspace/amalgamation') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_amalgamation"
+            docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation', false)
           }
         }
       }
@@ -287,7 +305,7 @@ try {
         ws('workspace/build-cmake-mkldnn-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake_mkldnn" //build_cuda
+            docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake_mkldnn', false)
             pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib)
           }
         }
@@ -298,7 +316,7 @@ try {
         ws('workspace/build-cmake-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake" //build_cuda
+            docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false)
             pack_lib('cmake_gpu', mx_cmake_lib)
           }
         }
@@ -313,7 +331,7 @@ try {
               bat """mkdir build_vc14_cpu
                 call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
                 cd build_vc14_cpu
-                cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
+                cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}"""
               bat 'C:\\mxnet\\build_vc14_cpu.bat'
 
               bat '''rmdir /s/q pkg_vc14_cpu
@@ -347,7 +365,7 @@ try {
             bat """mkdir build_vc14_gpu
               call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
               cd build_vc14_gpu
-              cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
+              cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}"""
             bat 'C:\\mxnet\\build_vc14_gpu.bat'
             bat '''rmdir /s/q pkg_vc14_gpu
               mkdir pkg_vc14_gpu\\lib
@@ -370,12 +388,53 @@ try {
         }
       }
     },
+    'Build GPU MKLDNN windows':{
+      node('mxnetwindows-cpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          ws('workspace/build-gpu') {
+            withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) {
+            init_git_win()
+            bat """mkdir build_%BUILD_NAME%
+              call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
+              cd build_%BUILD_NAME%
+              copy ${env.WORKSPACE}\\3rdparty\\mkldnn\\config_template.vcxproj.user ${env.WORKSPACE}\\config_template.vcxproj.user /y
+              cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
+            bat '''
+                call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
+                cd build_%BUILD_NAME%
+                set /a cores=%NUMBER_OF_PROCESSORS% * 2
+                jom -j %cores%
+                '''
+            bat '''rmdir /s/q pkg_%BUILD_NAME%
+              mkdir pkg_%BUILD_NAME%\\lib
+              mkdir pkg_%BUILD_NAME%\\python
+              mkdir pkg_%BUILD_NAME%\\include
+              mkdir pkg_%BUILD_NAME%\\build
+              copy build_%BUILD_NAME%\\libmxnet.lib pkg_%BUILD_NAME%\\lib
+              copy build_%BUILD_NAME%\\libmxnet.dll pkg_%BUILD_NAME%\\build
+              copy build_%BUILD_NAME%\\3rdparty\\mkldnn\\src\\mkldnn.dll pkg_%BUILD_NAME%\\build
+              copy build_%BUILD_NAME%\\libiomp5md.dll pkg_%BUILD_NAME%\\build
+              copy build_%BUILD_NAME%\\mklml.dll pkg_%BUILD_NAME%\\build
+              xcopy python pkg_%BUILD_NAME%\\python /E /I /Y
+              xcopy include pkg_%BUILD_NAME%\\include /E /I /Y
+              xcopy 3rdparty\\dmlc-core\\include pkg_%BUILD_NAME%\\include /E /I /Y
+              xcopy 3rdparty\\mshadow\\mshadow pkg_%BUILD_NAME%\\include\\mshadow /E /I /Y
+              xcopy 3rdparty\\nnvm\\include pkg_%BUILD_NAME%\\nnvm\\include /E /I /Y
+              del /Q *.7z
+              7z.exe a %BUILD_NAME%.7z pkg_%BUILD_NAME%\\
+              '''
+            stash includes: 'vc14_gpu_mkldnn.7z', name: 'vc14_gpu_mkldnn'
+            }
+          }
+        }
+      }
+    },
     'NVidia Jetson / ARMv8':{
       node('mxnetlinux-cpu') {
         ws('workspace/build-jetson-armv8') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform jetson /work/runtime_functions.sh build_jetson"
+            docker_run('jetson', 'build_jetson', false)
           }
         }
       }
@@ -385,17 +444,17 @@ try {
         ws('workspace/build-raspberry-armv7') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform armv7 /work/runtime_functions.sh build_armv7"
+            docker_run('armv7', 'build_armv7', false)
           }
         }
       }
     },
-    'Raspberry / ARMv6l':{
+    'Raspberry / ARMv6':{
       node('mxnetlinux-cpu') {
         ws('workspace/build-raspberry-armv6') {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
-            sh "ci/build.py --platform armv6 /work/runtime_functions.sh build_armv6"
+            docker_run('armv6', 'build_armv6', false)
           }
         }
       }
@@ -445,7 +504,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu', mx_lib)
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python2_quantization_gpu"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
           }
         }
       }
@@ -456,7 +515,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu', mx_lib)
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_quantization_gpu"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
           }
         }
       }
@@ -503,7 +562,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('centos7_cpu')
-            sh "ci/build.py --platform centos7_cpu /work/runtime_functions.sh unittest_centos7_cpu"
+            docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
           }
         }
       }
@@ -514,7 +573,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('centos7_gpu')
-            sh "ci/build.py --nvidiadocker --platform centos7_gpu /work/runtime_functions.sh unittest_centos7_gpu"
+            docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
           }
         }
       }
@@ -525,7 +584,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('cpu', mx_dist_lib)
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh unittest_ubuntu_cpu_scala"
+            docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
           }
         }
       }
@@ -536,7 +595,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu', mx_dist_lib)
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_scala"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_scala', true)
           }
         }
       }
@@ -547,7 +606,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('cpu')
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh unittest_ubuntu_cpugpu_perl"
+            docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false)
           }
         }
       }
@@ -558,7 +617,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_cpugpu_perl"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_cpugpu_perl', true)
           }
         }
       }
@@ -569,7 +628,18 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('cmake_gpu', mx_cmake_lib)
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_cpp"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
+          }
+        }
+      }
+    },
+    'Cpp: MKLDNN+GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-cpp-mkldnn-gpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib)
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
           }
         }
       }
@@ -580,7 +650,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('cpu')
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh unittest_ubuntu_cpu_R"
+            docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false)
           }
         }
       }
@@ -591,7 +661,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_R"
+            docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true)
           }
         }
       }
@@ -668,6 +738,24 @@ try {
           }
         }
       }
+    },
+    'Python 3: MKLDNN-GPU Win':{
+      node('mxnetwindows-gpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          ws('workspace/ut-python-gpu') {
+          init_git_win()
+          unstash 'vc14_gpu_mkldnn'
+          bat '''rmdir /s/q pkg_vc14_gpu_mkldnn
+            7z x -y vc14_gpu_mkldnn.7z'''
+          bat """xcopy C:\\mxnet\\data data /E /I /Y
+            xcopy C:\\mxnet\\model model /E /I /Y
+            call activate py3
+            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python
+            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc
+            C:\\mxnet\\test_gpu.bat"""
+          }
+        }
+      }
     }
   }
 
@@ -678,7 +766,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('cpu')
-            sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh integrationtest_ubuntu_cpu_onnx"
+            docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false)
           }
         }
       }
@@ -689,7 +777,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh integrationtest_ubuntu_gpu_python"
+            docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_python', true)
           }
         }
       }
@@ -700,7 +788,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh integrationtest_ubuntu_gpu_caffe"
+            docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_caffe', true)
           }
         }
       }
@@ -711,9 +799,17 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
+            unstash 'cpp_lenet'
+            unstash 'cpp_alexnet'
+            unstash 'cpp_googlenet'
+            unstash 'cpp_lenet_with_mxdataiter'
+            unstash 'cpp_resnet'
+            unstash 'cpp_mlp'
+            unstash 'cpp_mlp_cpu'
+            unstash 'cpp_mlp_gpu'
             unstash 'cpp_test_score'
             unstash 'cpp_test_optimizer'
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh integrationtest_ubuntu_gpu_cpp_package"
+            docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_cpp_package', true)
           }
         }
       }
@@ -724,7 +820,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh integrationtest_ubuntu_gpu_dist_kvstore"
+            docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true)
           }
         }
       }
@@ -735,7 +831,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --shm-size=3g --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh tutorialtest_ubuntu_python2_gpu"
+            docker_run('ubuntu_gpu', 'tutorialtest_ubuntu_python2_gpu', true, '3g')
           }
         }
       }
@@ -746,7 +842,7 @@ try {
           timeout(time: max_time, unit: 'MINUTES') {
             init_git()
             unpack_lib('gpu')
-            sh "ci/build.py --shm-size=3g --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh tutorialtest_ubuntu_python3_gpu"
+            docker_run('ubuntu_gpu', 'tutorialtest_ubuntu_python3_gpu', true, '3g')
           }
         }
       }
@@ -758,7 +854,7 @@ try {
       ws('workspace/docs') {
         timeout(time: max_time, unit: 'MINUTES') {
           init_git()
-          sh "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh deploy_docs"
+          docker_run('ubuntu_cpu', 'deploy_docs', false)
           sh "tests/ci_build/deploy/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}"
         }        
       }
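
For reference, a sketch of the shell command the new docker_run helper
assembles after placeholder substitution (derived directly from its template
string; the bucket comes from the DOCKER_CACHE_BUCKET environment variable):

    # docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_gpu', true) runs:
    ci/build.py --download-docker-cache \
        --docker-cache-bucket "$DOCKER_CACHE_BUCKET" --nvidiadocker \
        --platform ubuntu_gpu --shm-size 500m \
        /work/runtime_functions.sh unittest_ubuntu_python3_gpu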
diff --git a/MKL_README.md b/MKL_README.md
index 5374adb8e42..a5c63b097c5 100644
--- a/MKL_README.md
+++ b/MKL_README.md
@@ -1,19 +1,77 @@
-# Full MKL Installation
-
-## Build/Install MXNet with a full MKL installation:
-Installing and enabling the full MKL installation enables MKL support for all operators under the linalg namespace.
-
-  1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/articles/intel-mkl-111-install-guide)
-
-  2. Set USE_BLAS=mkl in make/config.mk
-
-        1.1 Set ADD_LDFLAGS=-L<path/to/mkl/lib/folder> (ex. ADD_LDFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib)
-
-        1.1 Set ADD_CFLAGS=-I<path/to/mkl/include/folder> (ex. ADD_CFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include)
-
-  3. Run 'make -j ${nproc}'
-
-  4. Navigate into the python directory
-
-  5. Run 'sudo python setup.py install'
-
+## Build/Install MXNet with a full MKL installation:
+
+To make it convenient for customers, Intel introduced a new license called the [Intel® Simplified Software License](https://software.intel.com/en-us/license/intel-simplified-software-license), which allows redistribution of not only dynamic libraries but also headers, examples, and static libraries.
+
+Installing and enabling the full MKL installation enables MKL support for all operators under the linalg namespace.
+
+  1. Download and install the latest full MKL version following the instructions on the [Intel website](https://software.intel.com/en-us/mkl).
+
+  2. Run 'make -j ${nproc} USE_BLAS=mkl'
+
+  3. Navigate into the python directory
+
+  4. Run 'sudo python setup.py install'
+
+
+## Build/Install MXNet with MKLDNN on Windows:
+
+To build and install MXNet yourself, install the following dependencies:
+
+1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it; the free Community edition is sufficient.
+2. Download and Install [CMake](https://cmake.org/) if it is not already installed.
+3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
+4. Unzip the OpenCV package.
+5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ```PATH``` variable.
+6. If you have the Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to the ```MKL``` directory that contains the ```include``` and ```lib``` directories. If you want to use MKL BLAS, pass ```-DUSE_BLAS=mkl``` to cmake. Typically, you can find the directory in
+```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```.
+7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip``` along with OpenBLAS and add its DLLs to the ```PATH```.
+8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program Files (x86)\OpenBLAS\```.
+
+After you have installed all of the required dependencies, build the MXNet source code:
+
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules:
+```
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
+```
+
+2. Copy the file `3rdparty/mkldnn/config_template.vcxproj` to the incubator-mxnet root.
+
+3. Start a Visual Studio command prompt.
+
+4. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the 
+[CMake](https://cmake.org/) command:
+```
+    mkdir build
+    cd build
+    cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release
+```
+
+5. In Visual Studio, open the solution file, ```.sln```, and compile it.
+These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
+```libmkldnn.dll``` will also be in ```./build/3rdparty/mkldnn/src/Release/```.
+
+6. Make sure that all the DLL files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise, you will encounter a `Not Found Dependencies` error when loading MXNet.
+
+## Install MXNet for Python
+
+1. Install ```Python``` using the Windows installer available [here](https://www.python.org/downloads/release/python-2712/).
+2. Install ```Numpy``` using the Windows installer available [here](http://scipy.org/install.html).
+3. Next, install the Python package interface for MXNet. You can find the Python interface package for [MXNet on GitHub](https://github.com/dmlc/mxnet/tree/master/python/mxnet).
+
+```CMD
+    cd python
+    python setup.py install
+```
+Done! We have installed MXNet with the Python interface. Run the commands below to verify that the installation was successful.
+```CMD
+    # Open Python terminal
+    python
+
+    # You should be able to import mxnet library without any issues.
+    >>> import mxnet as mx
+    >>> a = mx.nd.ones((2, 3))
+    >>> print((a*2).asnumpy())
+        [[ 2.  2.  2.]
+        [ 2.  2.  2.]]
+```
+We actually did a small tensor computation using MXNet! You are all set with MKLDNN MXNet on your Windows machine.
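
Step 6 of the Windows build instructions above mentions selecting MKL BLAS at
configure time; a hedged sketch of that variant of the step-4 CMake command
(assuming MKL_ROOT is set as described there):

    cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=mkl -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release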
diff --git a/Makefile b/Makefile
index 951b29b41cf..72d659b4d6f 100644
--- a/Makefile
+++ b/Makefile
@@ -66,12 +66,14 @@ $(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.")
 endif
 
 ifeq ($(USE_MKLDNN), 1)
-	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
-	LAST_WORD_INDEX := $(words $(RETURN_STRING))
-	# fetch the 2nd last word as MKLDNNROOT
-	MKLDNNROOT := $(word $(shell echo $$(($(LAST_WORD_INDEX) - 1))),$(RETURN_STRING))
-	MKLROOT := $(lastword $(RETURN_STRING))
-	export USE_MKLML = 1
+ifneq ($(MKLDNN_ROOT),)
+    MKLDNNROOT = $(MKLDNN_ROOT)
+    MKLROOT = $(MKLDNN_ROOT)
+else
+    MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/install
+    MKLROOT = $(ROOTDIR)/3rdparty/mkldnn/install
+endif
+    export USE_MKLML = 1
 endif
 
 include $(TPARTYDIR)/mshadow/make/mshadow.mk
@@ -118,10 +120,6 @@ ifeq ($(USE_MKLDNN), 1)
 	CFLAGS += -DMXNET_USE_MKLDNN=1
 	CFLAGS += -DUSE_MKL=1
 	CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
-	ifneq ($(MKLDNNROOT), $(MKLROOT))
-		CFLAGS += -I$(MKLROOT)/include
-		LDFLAGS += -L$(MKLROOT)/lib
-	endif
 	CFLAGS += -I$(MKLDNNROOT)/include
 	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
@@ -427,7 +425,7 @@ endif
 # For quick compile test, used smaller subset
 ALLX_DEP= $(ALL_DEP)
 
-build/src/%.o: src/%.cc
+build/src/%.o: src/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@
 
@@ -501,6 +499,7 @@ include cpp-package/cpp-package.mk
 endif
 
 include tests/cpp/unittest.mk
+include mkldnn.mk
 
 extra-packages: $(EXTRA_PACKAGES)
 
@@ -612,10 +611,9 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r  $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS))
 	$(RM) -r  $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS))
 else
-clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
+clean: mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
-		3rdparty/mkldnn/install/*
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
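
A sketch of the two ways the revised Makefile now locates MKLDNN (the default
install prefix is taken from the diff above; the /opt path is illustrative):

    # Default: build MKLDNN from the submodule into 3rdparty/mkldnn/install
    make -j$(nproc) USE_MKLDNN=1

    # Or reuse an existing installation via MKLDNN_ROOT
    make -j$(nproc) USE_MKLDNN=1 MKLDNN_ROOT=/opt/intel/mkldnn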
diff --git a/NEWS.md b/NEWS.md
index a51b514c1a5..461bb6d2d15 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,135 @@
 MXNet Change Log
 ================
+## 1.2.0
+### New Features - Added Scala Inference APIs
+- Implemented new [Scala Inference APIs](https://cwiki.apache.org/confluence/display/MXNET/MXNetScalaInferenceAPI) which offer an easy-to-use, Scala Idiomatic and thread-safe high level APIs for performing predictions with deep learning models trained with MXNet (#9678). Implemented a new ImageClassifier class which provides APIs for classification tasks on a Java BufferedImage using a pre-trained model you provide (#10054). Implemented a new ObjectDetector class which provides APIs for object and boundary detections on a Java BufferedImage using a pre-trained model you provide (#10229).
+
+### New Features - Added a Module to Import ONNX models into MXNet
+- Implemented a new ONNX module in MXNet which offers an easy to use API to import ONNX models into MXNet's symbolic interface (#9963). Checkout the [example](https://github.com/apache/incubator-mxnet/blob/master/example/onnx/super_resolution.py) on how you could use this [API](https://cwiki.apache.org/confluence/display/MXNET/ONNX-MXNet+API+Design) to import ONNX models and perform inference on MXNet. Currently, the ONNX-MXNet Import module is still experimental. Please use it with caution.
+
+### New Features - Added Support for Model Quantization with Calibration
+- Implemented model quantization by adopting the [TensorFlow approach](https://www.tensorflow.org/performance/quantization) with calibration by borrowing the idea from Nvidia's [TensorRT](http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf). The focus of this work is on keeping quantized models (ConvNets for now) inference accuracy loss under control when compared to their corresponding FP32 models. Please see the [example](https://github.com/apache/incubator-mxnet/tree/master/example/quantization) on how to quantize a FP32 model with or without calibration (#9552). Currently, the Quantization support is still experimental. Please use it with caution.
+
+### New Features - MKL-DNN Integration
+- MXNet now integrates with Intel MKL-DNN to accelerate neural network operators: Convolution, Deconvolution, FullyConnected, Pooling, Batch Normalization, Activation, LRN, Softmax, as well as some common operators: sum and concat (#9677). This integration allows NDArray to contain data with MKL-DNN layouts and reduces data layout conversion to get the maximal performance from MKL-DNN. Currently, the MKL-DNN integration is still experimental. Please use it with caution.
+
+### New Features - Added Exception Handling Support for Operators
+- Implemented [Exception Handling Support for Operators](https://cwiki.apache.org/confluence/display/MXNET/Improved+exception+handling+in+MXNet) in MXNet. MXNet now transports backend C++ exceptions to the different language front-ends and prevents crashes when exceptions are thrown during operator execution (#9681).
+
+### New Features - Enhanced FP16 support
+- Added support for distributed mixed precision training with FP16. It supports storing of master copy of weights in float32 with the multi_precision mode of optimizers (#10183). Improved speed of float16 operations on x86 CPU by 8 times through F16C instruction set. Added support for more operators to work with FP16 inputs (#10125, #10078, #10169). Added a tutorial on using mixed precision with FP16 (#10391).
+
+### New Features - Added Profiling Enhancements
+- Enhanced the built-in profiler to support native Intel® VTune™ Amplifier objects such as Task, Frame, Event, Counter and Marker from both C++ and Python, which are also visible in the Chrome tracing view (#8972). Added runtime tracking of symbolic and imperative operators as well as memory and API calls. Added tracking and dumping of aggregate profiling data. The profiler also no longer affects runtime performance when not in use.
+
+### Breaking Changes
+- Changed Namespace for MXNet scala from `ml.dmlc.mxnet` to `org.apache.mxnet` (#10284).
+- Changed API for the Pooling operator from `mxnet.symbol.Pooling(data=None, global_pool=_Null, cudnn_off=_Null, kernel=_Null, pool_type=_Null, pooling_convention=_Null, stride=_Null, pad=_Null, name=None, attr=None, out=None, **kwargs)` to  `mxnet.symbol.Pooling(data=None,  kernel=_Null, pool_type=_Null, global_pool=_Null, cudnn_off=_Null, pooling_convention=_Null, stride=_Null, pad=_Null, name=None, attr=None, out=None, **kwargs)`. This is a breaking change when kwargs are not provided since the new api expects the arguments starting from `global_pool` at the fourth position instead of the second position. (#10000).
+
+### Bug Fixes
+- Fixed tests - Flakiness/Bugs - (#9598, #9951, #10259, #10197, #10136, #10422). Please see: [Tests Improvement Project](https://github.com/apache/incubator-mxnet/projects/9)
+- Fixed `cudnn_conv` and `cudnn_deconv` deadlock (#10392).
+- Fixed a race condition in `io.LibSVMIter` when batch size is large (#10124).
+- Fixed a race condition in converting data layouts in MKL-DNN (#9862).
+- Fixed MKL-DNN sigmoid/softrelu issue (#10336).
+- Fixed incorrect indices generated by device row sparse pull (#9887).
+- Fixed cast storage support for same stypes (#10400).
+- Fixed uncaught exception for bucketing module when symbol name not specified (#10094).
+- Fixed regression output layers (#9848).
+- Fixed crash with `mx.nd.ones` (#10014).
+- Fixed `sample_multinomial` crash when `get_prob=True` (#10413).
+- Fixed buggy type inference in correlation (#10135).
+- Fixed race condition for `CPUSharedStorageManager->Free` and launched workers at iter init stage to avoid frequent relaunch (#10096).
+- Fixed DLTensor Conversion for int64 (#10083).
+- Fixed issues where hex symbols of the profiler were not being recognized by the Chrome tracing tool (#9932).
+- Fixed a crash when the profiler was not enabled (#10306).
+- Fixed ndarray assignment issues (#10022, #9981, #10468).
+- Fixed incorrect indices generated by device row sparse pull (#9887).
+- Fixed `print_summary` bug in visualization module (#9492).
+- Fixed shape mismatch in accuracy metrics (#10446).
+- Fixed random samplers from uniform and random distributions in R bindings (#10450).
+- Fixed a bug that was causing training metrics to be printed as NaN sometimes (#10437).
+- Fixed a crash with non positive reps for tile ops (#10417).
+
+### Performance Improvements 
+- On average, after the MKL-DNN change, the inference speed of MXNet + MKLDNN outperforms MXNet + OpenBLAS by a factor of 32, outperforms MXNet + MKLML by 82% and outperforms MXNet + MKLML with the experimental flag by 8%. The experiments were run for the image classification example, for different networks and different batch sizes.
+- Improved sparse SGD, sparse AdaGrad and sparse Adam optimizer speed on GPU by 30x (#9561, #10312, #10293, #10062).
+- Improved `sparse.retain` performance on CPU by 2.5x (#9722)
+- Replaced `std::swap_ranges` with memcpy (#10351)
+- Implemented DepthwiseConv2dBackwardFilterKernel which is over 5x faster (#10098)
+- Implemented CPU LSTM Inference (#9977)
+- Added Layer Normalization in C++ (#10029)
+- Optimized Performance for rtc (#10018)
+- Improved CPU performance of  ROIpooling operator by using OpenMP (#9958)
+- Accelerated the calculation of F1 (#9833)
+
+### API Changes
+- `Block.save_params` now match parameters according to model structure instead of names to avoid prefix mismatching problems during saving and loading (#10511).
+- Added an optional argument `ctx` to `mx.random.seed`. Seeding with `ctx` option produces random number sequence independent of device id. (#10367).
+- Added copy flag for astype (#10347).
+- Added context parameter to Scala Infer API - ImageClassifier and ObjectDetector (#10252).
+- Added axes support for dropout in gluon (#10032).
+- Added default `ctx` to cpu for `gluon.Block.load_params` (#10160).
+- Added support for variable sequence length in gluon.RecurrentCell (#9934).
+- Added convenience fluent method for squeeze op (#9734).
+- Made `array.reshape` compatible with numpy (#9790).
+- Added axis support and gradient for L2norm (#9740).
+
+### Sparse Support
+- Added support for multi-GPU training with `row_sparse` weights using `device` KVStore (#9987).
+- Added `Module.prepare` API for multi-GPU and multi-machine training with row_sparse weight (#10285).
+- Added `deterministic` option for `contrib.SparseEmbedding` operator (#9846).
+- Added `sparse.broadcast_mul` and `sparse.broadcast_div` with CSRNDArray and 1-D dense NDArray on CPU (#10208).
+- Added sparse support for Custom Operator (#10374).
+- Added Sparse feature for Perl (#9988).
+- Added `force_deterministic` option for sparse embedding (#9882).
+- Added `sparse.where` with condition being csr ndarray (#9481).
+
+### Deprecations
+- Deprecated `profiler_set_state` (#10156).
+
+### Other Features
+- Added constant parameter for gluon (#9893).
+- Added `contrib.rand.zipfian` (#9747).
+- Added Gluon PreLU, ELU, SELU, Swish activation layers for Gluon (#9662)
+- Added Squeeze Op (#9700).
+- Added multi-proposal operator (CPU version) and fixed bug in multi-proposal operator (GPU version) (#9939).
+- Added Large-Batch SGD with a warmup and a LARS strategy (#8918).
+- Added Language Modelling datasets and Sampler (#9514).
+- Added instance norm and reflection padding to Gluon (#7938).
+- Added micro-averaging strategy for F1 metric (#9777).
+- Added Softsign Activation Function (#9851).
+- Added eye operator, for default storage type (#9770).
+- Added TVM bridge support to JIT NDArray Function by TVM (#9880).
+- Added float16 support for correlation operator and L2Normalization operator (#10125, #10078).
+- Added random shuffle implementation for NDArray (#10048).
+- Added load from buffer functions for CPP package (#10261).
+
+### Usability Improvements
+- Added embedding learning example for Gluon (#9165).
+- Added tutorial on how to use data augmenters (#10055).
+- Added tutorial for Data Augmentation with Masks (#10178).
+- Added LSTNet example (#9512).
+- Added MobileNetV2 example (#9614).
+- Added tutorial for Gluon Datasets and DataLoaders (#10251).
+- Added Language model with Google's billion words dataset (#10025).
+- Added example for custom operator using RTC (#9870).
+- Improved image classification examples (#9799, #9633).
+- Added reshape predictor function to c_predict_api (#9984).
+- Added guide for implementing sparse ops (#10081).
+- Added naming tutorial for gluon blocks and parameters (#10511).
+
+### Known Issues
+- MXNet crash when built with `USE_GPERFTOOLS = 1` (#8968).
+- [DevGuide.md](https://github.com/google/googletest/blob/ec44c6c1675c25b9827aacd08c02433cccde7780/googlemock/docs/DevGuide.md) in the 3rdparty submodule googletest licensed under CC-BY-2.5.
+- Incompatibility in the behavior of MXNet Convolution operator for certain unsupported use cases: Raises an exception when MKLDNN is enabled, fails silently when it is not.
+- MXNet convolution generates wrong results for 1-element strides (#10689).
+- [Tutorial on fine-tuning an ONNX model](https://github.com/apache/incubator-mxnet/blob/v1.2.0/docs/tutorials/onnx/fine_tuning_gluon.md) fails when using cpu context.
+- CMake build ignores the `USE_MKLDNN` flag and doesn't build with MKLDNN support even with `-DUSE_MKLDNN=1`. To workaround the issue please see: #10801.
+- Linking the dmlc-core library fails with CMake build when building with `USE_OPENMP=OFF`. To workaround the issue, please use the updated CMakeLists in dmlc-core unit tests directory: https://github.com/dmlc/dmlc-core/pull/396. You can also workaround the issue by using make instead of cmake when building with `USE_OPENMP=OFF`.
+
+For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/%5BWIP%5D+Apache+MXNet+%28incubating%29+1.2.0+Release+Notes)
+
 ## 1.1.0
 ### Usability Improvements
 - Improved the usability of examples and tutorials
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index b22751a8db2..ff70a97d017 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: mxnet
 Type: Package
 Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems
-Version: 1.2.0
+Version: 1.3.0
 Date: 2017-06-27
 Author: Tianqi Chen, Qiang Kou, Tong He
 Maintainer: Qiang Kou <qk...@qkou.info>
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index ff885317064..3c503c2e855 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -401,7 +401,7 @@ mx.opt.get.updater <- function(optimizer, weights) {
   update <- optimizer$update
 
   update.closure <- function(weight, grad) {
-    ulist <- lapply(seq_along(weights), function(i) {
+    ulist <- lapply(seq_along(weight), function(i) {
       if (!is.null(grad[[i]])) {
         update(i, weight[[i]], grad[[i]], state.list[[i]])
       } else {
diff --git a/README.md b/README.md
index ba37cd4bf83..c37959d6d74 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers.
 
 What's New
 ----------
+* [Version 1.2.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.2.0) - MXNet 1.2.0 Release.
 * [Version 1.1.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.1.0) - MXNet 1.1.0 Release.
 * [Version 1.0.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.0.0) - MXNet 1.0.0 Release.
 * [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release.
diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache
new file mode 100644
index 00000000000..8a0428b58c3
--- /dev/null
+++ b/ci/Jenkinsfile_docker_cache
@@ -0,0 +1,81 @@
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline to generate the centralized docker cache
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// timeout in minutes
+total_timeout = 120
+git_timeout = 15
+// assign any caught errors here
+err = null
+
+// initialize source codes
+def init_git() {
+  deleteDir()
+  retry(5) {
+    try {
+      // Make sure we wait long enough for the api.github.com request quota. Important: don't increase the number of
+      // retries, as this will increase the number of requests and worsen the throttling
+      timeout(time: git_timeout, unit: 'MINUTES') {
+        checkout scm
+        sh 'git submodule update --init --recursive'
+        sh 'git clean -x -d -f'
+      }
+    } catch (exc) {
+      deleteDir()
+      error "Failed to fetch source codes with ${exc}"
+      sleep 2
+    }
+  }
+}
+
+
+try {
+  stage("Docker cache build & publish") {
+    node('mxnetlinux-cpu') {
+      ws('workspace/docker_cache') {
+        timeout(time: total_timeout, unit: 'MINUTES') {
+          init_git()
+          sh "ci/docker_cache.py --docker-cache-bucket ${env.DOCKER_CACHE_BUCKET}"
+        }
+      }
+    }
+  }
+
+  // set build status to success at the end
+  currentBuild.result = "SUCCESS"
+} catch (caughtError) {
+  node("mxnetlinux-cpu") {
+    sh "echo caught ${caughtError}"
+    err = caughtError
+    currentBuild.result = "FAILURE"
+  }
+} finally {
+  node("mxnetlinux-cpu") {
+    // Only send email if master failed
+    if (currentBuild.result == "FAILURE" && env.BRANCH_NAME == "master") {
+      emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}'
+    }
+    // Remember to rethrow so the build is marked as failing
+    if (err) {
+      throw err
+    }
+  }
+}
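
The pipeline's core step can also be exercised by hand; a sketch, assuming
docker_cache.py accepts the same flag it is given above (bucket name
illustrative):

    export DOCKER_CACHE_BUCKET=mxnet-ci-docker-cache
    ci/docker_cache.py --docker-cache-bucket "$DOCKER_CACHE_BUCKET"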
diff --git a/ci/__init__.py b/ci/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/ci/build.py b/ci/build.py
index 6d8d0141170..deae1d733a8 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -61,17 +61,44 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
 
 
 def build_docker(platform: str, docker_binary: str) -> None:
-    """Build a container for the given platform"""
+    """
+    Build a container for the given platform
+    :param platform: Platform
+    :param docker_binary: docker binary to use (docker/nvidia-docker)
+    :return: Id of the top level image
+    """
+
     tag = get_docker_tag(platform)
     logging.info("Building container tagged '%s' with %s", tag, docker_binary)
     cmd = [docker_binary, "build",
         "-f", get_dockerfile(platform),
+        "--rm=false",  # Keep intermediary layers to prime the build cache
         "--build-arg", "USER_ID={}".format(os.getuid()),
+        "--cache-from", tag,
         "-t", tag,
         "docker"]
     logging.info("Running command: '%s'", ' '.join(cmd))
     check_call(cmd)
 
+    # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
+    # check_call would have failed
+    image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
+    if not image_id:
+        raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
+    return image_id
+
+
+def _get_local_image_id(docker_binary, docker_tag):
+    """
+    Get the image id of the local docker layer with the passed tag
+    :param docker_tag: docker tag
+    :return: Image id as string or None if tag does not exist
+    """
+    cmd = [docker_binary, "images", "-q", docker_tag]
+    image_id_b = subprocess.check_output(cmd)
+    image_id = image_id_b.decode('utf-8').strip()
+    return image_id
+
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -123,6 +150,7 @@ def container_run(platform: str,
     if not dry_run and ret != 0:
         logging.error("Running of command in container failed (%s): %s", ret, cmd)
         logging.error("You can try to get into the container by using the following command: %s", docker_run_cmd)
+
         raise subprocess.CalledProcessError(ret, cmd)
 
     return docker_run_cmd
@@ -131,7 +159,6 @@ def container_run(platform: str,
 def list_platforms() -> str:
     print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
 
-
 def main() -> int:
     # We need to be in the same directory than the script so the commands in the dockerfiles work as
     # expected. But the script can be invoked from a different path
@@ -180,6 +207,14 @@ def script_name() -> str:
                         help="go in a shell inside the container",
                         action='store_true')
 
+    parser.add_argument("--download-docker-cache",
+                        help="Download the docker cache from our central repository instead of rebuilding locally",
+                        action='store_true')
+
+    parser.add_argument("--docker-cache-bucket",
+                        help="S3 docker cache bucket, e.g. mxnet-ci-docker-cache",
+                        type=str)
+
     parser.add_argument("command",
                         help="command to run in the container",
                         nargs='*', action='append', type=str)
@@ -194,12 +229,16 @@ def script_name() -> str:
         list_platforms()
     elif args.platform:
         platform = args.platform
+        tag = get_docker_tag(platform)
+        if args.download_docker_cache:
+            import docker_cache
+            logging.info('Docker cache download is enabled')
+            docker_cache.load_docker_cache(bucket_name=args.docker_cache_bucket, docker_tag=tag)
         build_docker(platform, docker_binary)
         if args.build_only:
-            logging.warn("Container was just built. Exiting due to build-only.")
+            logging.warning("Container was just built. Exiting due to build-only.")
             return 0
 
-        tag = get_docker_tag(platform)
         if command:
             container_run(platform, docker_binary, shared_memory_size, command)
         elif args.print_docker_run:
@@ -216,6 +255,11 @@ def script_name() -> str:
         logging.info("Building for all architectures: {}".format(platforms))
         logging.info("Artifacts will be produced in the build/ directory.")
         for platform in platforms:
+            if args.download_docker_cache:
+                import docker_cache
+                tag = get_docker_tag(platform)
+                logging.info('Docker cache download is enabled')
+                docker_cache.load_docker_cache(bucket_name=args.docker_cache_bucket, docker_tag=tag)
             build_docker(platform, docker_binary)
             if args.build_only:
                 continue
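
The two new docker flags in build_docker() work together: --rm=false keeps every intermediate layer around, and --cache-from lets the daemon reuse layers loaded from the S3 tarball. A rough sketch of the resulting flow, assuming the load_docker_cache() helper added in ci/docker_cache.py later in this patch (names simplified relative to the actual script):

    import subprocess

    def cached_build(tag, dockerfile, bucket):
        # 1) Pre-seed the local docker layer store from S3.
        import docker_cache
        docker_cache.load_docker_cache(bucket_name=bucket, docker_tag=tag)
        # 2) Build, telling docker it may reuse layers from the pre-seeded image.
        subprocess.check_call(["docker", "build",
                               "-f", dockerfile,
                               "--rm=false",         # keep intermediate layers
                               "--cache-from", tag,  # reuse pre-seeded layers
                               "-t", tag,
                               "docker"])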
diff --git a/ci/docker/Dockerfile.build.amzn_linux_cpu b/ci/docker/Dockerfile.build.amzn_linux_cpu
deleted file mode 100755
index 7d6f2236af3..00000000000
--- a/ci/docker/Dockerfile.build.amzn_linux_cpu
+++ /dev/null
@@ -1,44 +0,0 @@
-# -*- mode: dockerfile -*-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Dockerfile to build and run MXNet for Amazon Linux on CPU
-
-FROM amazonlinux
-
-WORKDIR /work/deps
-COPY install/amzn_linux_core.sh /work/
-RUN /work/amzn_linux_core.sh
-COPY install/amzn_linux_opencv.sh /work/
-RUN /work/amzn_linux_opencv.sh
-COPY install/amzn_linux_openblas.sh /work/
-RUN /work/amzn_linux_openblas.sh
-COPY install/amzn_linux_python2.sh /work/
-RUN /work/amzn_linux_python2.sh
-COPY install/amzn_linux_python3.sh /work/
-RUN /work/amzn_linux_python3.sh
-COPY install/amzn_linux_testdeps.sh /work/
-RUN /work/amzn_linux_testdeps.sh
-COPY install/amzn_linux_julia.sh /work/
-RUN /work/amzn_linux_julia.sh
-COPY install/amzn_linux_maven.sh /work/
-RUN /work/amzn_linux_maven.sh
-COPY install/amzn_linux_library.sh /work/
-RUN /work/amzn_linux_library.sh
-WORKDIR /work/mxnet
-
-COPY runtime_functions.sh /work/
\ No newline at end of file
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index 0074c1f9330..c22e000cad1 100755
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -84,13 +84,6 @@ ENV CC /usr/arm-linux-androideabi/bin/arm-linux-androideabi-clang
 ENV CXX /usr/arm-linux-androideabi/bin/arm-linux-androideabi-clang++
 ENV BUILD_OPTS "USE_BLAS=openblas USE_SSE=0 DMLC_LOG_STACK_TRACE=0 USE_OPENCV=0 USE_LAPACK=0"
 
-# Build MXNet
-ADD mxnet mxnet
-ADD arm.crosscompile.android.mk /work/mxnet/make/config.mk
-RUN cd mxnet && \
-    make -j$(nproc) $BUILD_OPTS
+WORKDIR /work/mxnet
 
-WORKDIR /work/build/
-RUN cp /work/mxnet/lib/* .
-
-# TODO: Bring this into the new format
\ No newline at end of file
+COPY runtime_functions.sh /work/
diff --git a/ci/docker/Dockerfile.build.arm64 b/ci/docker/Dockerfile.build.arm64
index eb68a818ba6..ec949600f73 100755
--- a/ci/docker/Dockerfile.build.arm64
+++ b/ci/docker/Dockerfile.build.arm64
@@ -18,21 +18,22 @@
 #
 # Dockerfile to build MXNet for ARM64/ARMv8
 
-FROM dockcross/linux-arm64
+# Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
+#FROM dockcross/linux-arm64
+FROM mxnetci/dockcross-linux-arm64:05082018
 
 ENV ARCH aarch64
-ENV CC /usr/bin/aarch64-linux-gnu-gcc
-ENV CXX /usr/bin/aarch64-linux-gnu-g++
-ENV FC /usr/bin/aarch64-linux-gnu-gfortran-4.9
+ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
+ENV TARGET ARMV8
 
 WORKDIR /work
 
-COPY install/arm64_openblas.sh /work/
-RUN /work/arm64_openblas.sh
+# Build OpenBLAS
+RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
+    cd OpenBLAS && \
+    make -j$(nproc) && \
+    PREFIX=${CROSS_ROOT} make install
 
-ENV LD_LIBRARY_PATH /opt/OpenBLAS/lib
-ENV CPLUS_INCLUDE_PATH /opt/OpenBLAS/include
+COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
-
-COPY runtime_functions.sh /work/
\ No newline at end of file
diff --git a/ci/docker/Dockerfile.build.centos7_cpu b/ci/docker/Dockerfile.build.centos7_cpu
index a44d6464ee3..92314faf121 100755
--- a/ci/docker/Dockerfile.build.centos7_cpu
+++ b/ci/docker/Dockerfile.build.centos7_cpu
@@ -20,8 +20,6 @@
 
 FROM centos:7
 
-ARG USER_ID=0
-
 WORKDIR /work/deps
 
 COPY install/centos7_core.sh /work/
@@ -30,6 +28,8 @@ COPY install/centos7_python.sh /work/
 RUN /work/centos7_python.sh
 COPY install/ubuntu_mklml.sh /work/
 RUN /work/ubuntu_mklml.sh
+
+ARG USER_ID=0
 COPY install/centos7_adduser.sh /work/
 RUN /work/centos7_adduser.sh 
 
diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu
index 4dcf5bf08ca..2d28170f11b 100755
--- a/ci/docker/Dockerfile.build.centos7_gpu
+++ b/ci/docker/Dockerfile.build.centos7_gpu
@@ -20,14 +20,14 @@
 
 FROM nvidia/cuda:9.1-cudnn7-devel-centos7
 
-ARG USER_ID=0
-
 WORKDIR /work/deps
 
 COPY install/centos7_core.sh /work/
 RUN /work/centos7_core.sh
 COPY install/centos7_python.sh /work/
 RUN /work/centos7_python.sh
+
+ARG USER_ID=0
 COPY install/centos7_adduser.sh /work/
 RUN /work/centos7_adduser.sh
 
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index 9fa50f4097a..c358edb1fb0 100755
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -22,7 +22,9 @@
 
 FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder
 
-FROM dockcross/linux-arm64
+# Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
+# FROM dockcross/linux-arm64
+FROM mxnetci/dockcross-linux-arm64:05082018
 
 ENV ARCH aarch64
 ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
@@ -32,7 +34,6 @@ ENV TARGET ARMV8
 WORKDIR /work
 
 # Build OpenBLAS
-ADD https://api.github.com/repos/xianyi/OpenBLAS/git/refs/tags/v0.2.20 openblas_version.json
 RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
     cd OpenBLAS && \
     make -j$(nproc) && \
diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda
index 9156d6f7b69..4d3c4664363 100755
--- a/ci/docker/Dockerfile.build.ubuntu_build_cuda
+++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda
@@ -23,8 +23,6 @@
 
 FROM nvidia/cuda:9.1-cudnn7-devel
 
-ARG USER_ID=0
-
 WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
@@ -48,6 +46,7 @@ COPY install/ubuntu_nvidia.sh /work/
 RUN /work/ubuntu_nvidia.sh
 
 # Keep this at the end since this command is not cachable
+ARG USER_ID=0
 COPY install/ubuntu_adduser.sh /work/
 RUN /work/ubuntu_adduser.sh
 
diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu
index f706f88461f..2dc7ef13f21 100755
--- a/ci/docker/Dockerfile.build.ubuntu_cpu
+++ b/ci/docker/Dockerfile.build.ubuntu_cpu
@@ -20,8 +20,6 @@
 
 FROM ubuntu:16.04
 
-ARG USER_ID=0
-
 WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
@@ -44,6 +42,8 @@ COPY install/ubuntu_onnx.sh /work/
 RUN /work/ubuntu_onnx.sh
 COPY install/ubuntu_docs.sh /work/
 RUN /work/ubuntu_docs.sh
+
+ARG USER_ID=0
 COPY install/ubuntu_adduser.sh /work/
 RUN /work/ubuntu_adduser.sh
 
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu
index 547f9843d34..10971724aaa 100755
--- a/ci/docker/Dockerfile.build.ubuntu_gpu
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu
@@ -20,8 +20,6 @@
 
 FROM nvidia/cuda:9.1-cudnn7-devel
 
-ARG USER_ID=0
-
 WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
@@ -50,6 +48,8 @@ COPY install/ubuntu_docs.sh /work/
 RUN /work/ubuntu_docs.sh
 COPY install/ubuntu_tutorials.sh /work/
 RUN /work/ubuntu_tutorials.sh
+
+ARG USER_ID=0
 COPY install/ubuntu_adduser.sh /work/
 RUN /work/ubuntu_adduser.sh
 
diff --git a/ci/docker/install/amzn_linux_julia.sh b/ci/docker/install/amzn_linux_julia.sh
deleted file mode 100755
index bfaf3c4924b..00000000000
--- a/ci/docker/install/amzn_linux_julia.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-wget -nv https://julialang.s3.amazonaws.com/bin/linux/x64/0.5/julia-0.5.0-linux-x86_64.tar.gz
-mv julia-0.5.0-linux-x86_64.tar.gz /tmp/
-tar xfvz /tmp/julia-0.5.0-linux-x86_64.tar.gz
-rm -f /tmp/julia-0.5.0-linux-x86_64.tar.gz
-# tar extracted in current directory
-ln -s -f ${PWD}/julia-3c9d75391c/bin/julia /usr/bin/julia
\ No newline at end of file
diff --git a/ci/docker/install/amzn_linux_library.sh b/ci/docker/install/amzn_linux_library.sh
deleted file mode 100755
index 04708957033..00000000000
--- a/ci/docker/install/amzn_linux_library.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-yum -y install graphviz
-pip install graphviz
-pip install opencv-python
\ No newline at end of file
diff --git a/ci/docker/install/amzn_linux_maven.sh b/ci/docker/install/amzn_linux_maven.sh
deleted file mode 100755
index 22875d0ec86..00000000000
--- a/ci/docker/install/amzn_linux_maven.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-wget -nv http://mirrors.ocf.berkeley.edu/apache/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
-mv apache-maven-3.3.9-bin.tar.gz /tmp/
-tar xfvz /tmp/apache-maven-3.3.9-bin.tar.gz
-yum install -y java-1.8.0-openjdk-devel
\ No newline at end of file
diff --git a/ci/docker/install/amzn_linux_python2.sh b/ci/docker/install/amzn_linux_python2.sh
deleted file mode 100755
index e099ad6d6c4..00000000000
--- a/ci/docker/install/amzn_linux_python2.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-yum groupinstall -y "Development Tools"
-yum install -y mlocate python27 python27-setuptools python27-tools python27-numpy python27-scipy python27-nose python27-matplotlib unzip
-ln -s -f /usr/bin/python2.7 /usr/bin/python2
-wget -nv https://bootstrap.pypa.io/get-pip.py
-python2 get-pip.py
-$(which easy_install-2.7) --upgrade pip
-if [ -f /usr/local/bin/pip ] && [ -f /usr/bin/pip ]; then
-    mv /usr/bin/pip /usr/bin/pip.bak
-    ln /usr/local/bin/pip /usr/bin/pip
-fi
-
-ln -s -f /usr/local/bin/pip /usr/bin/pip
-for i in ipython[all] jupyter pandas scikit-image h5py pandas sklearn sympy; do echo "${i}..."; pip install -U $i >/dev/null; done
diff --git a/ci/docker/install/amzn_linux_python3.sh b/ci/docker/install/amzn_linux_python3.sh
deleted file mode 100755
index 3f80d7d98d8..00000000000
--- a/ci/docker/install/amzn_linux_python3.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-pushd .
-wget -nv https://bootstrap.pypa.io/get-pip.py
-mkdir py3
-cd py3
-wget -nv https://www.python.org/ftp/python/3.5.2/Python-3.5.2.tgz
-tar -xvzf Python-3.5.2.tgz
-cd Python-3.5.2
-yum install -y zlib-devel openssl-devel sqlite-devel bzip2-devel gdbm-devel ncurses-devel xz-devel readline-devel
-./configure --prefix=/opt/ --with-zlib-dir=/usr/lib64
-make -j$(nproc)
-mkdir /opt/bin
-mkdir /opt/lib
-make install
-ln -s -f /opt/bin/python3 /usr/bin/python3
-cd ../..
-python3 get-pip.py
-ln -s -f /opt/bin/pip /usr/bin/pip3
-
-mkdir -p ~/.local/lib/python3.5/site-packages/
-pip3 install numpy
-popd
\ No newline at end of file
diff --git a/ci/docker/install/amzn_linux_testdeps.sh b/ci/docker/install/amzn_linux_testdeps.sh
deleted file mode 100755
index f5c49d9e37b..00000000000
--- a/ci/docker/install/amzn_linux_testdeps.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-pip install cpplint 'pylint==1.4.4' 'astroid==1.3.6'
-pip3 install nose
-ln -s -f /opt/bin/nosetests /usr/local/bin/nosetests3
-ln -s -f /opt/bin/nosetests-3.4 /usr/local/bin/nosetests-3.4
\ No newline at end of file
diff --git a/ci/docker/install/centos7_python.sh b/ci/docker/install/centos7_python.sh
index 9e076b61e87..154e3b8e4f5 100755
--- a/ci/docker/install/centos7_python.sh
+++ b/ci/docker/install/centos7_python.sh
@@ -31,5 +31,5 @@ curl "https://bootstrap.pypa.io/get-pip.py" -o "get-pip.py"
 python2.7 get-pip.py
 python3.6 get-pip.py
 
-pip2 install nose pylint numpy nose-timer requests h5py scipy
-pip3 install nose pylint numpy nose-timer requests h5py scipy
\ No newline at end of file
+pip2 install nose pylint numpy nose-timer requests h5py scipy==1.0.1
+pip3 install nose pylint numpy nose-timer requests h5py scipy==1.0.1
diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh
index 253cf95c6ce..3689aad65cf 100755
--- a/ci/docker/install/ubuntu_mklml.sh
+++ b/ci/docker/install/ubuntu_mklml.sh
@@ -21,5 +21,5 @@
 # the whole docker cache for the image
 
 set -ex
-wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz
-tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
\ No newline at end of file
+wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz
+tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh
index 554000d20ad..da7c25697b6 100755
--- a/ci/docker/install/ubuntu_python.sh
+++ b/ci/docker/install/ubuntu_python.sh
@@ -29,5 +29,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py
 python3 get-pip.py
 python2 get-pip.py
 
-pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy
-pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy
+pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1
+pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 027e287f751..7abe767c869 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -323,6 +323,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symlink. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
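
The cp/mv pair above works because cp dereferences the symlink while mv renames the resulting regular file over it. A hypothetical Python helper with the same effect:

    import os
    import shutil

    def materialize_symlink(path):
        # Replace a symlink with a regular copy of its target,
        # mirroring the cp + mv trick in runtime_functions.sh above.
        if os.path.islink(path):
            target = os.path.realpath(path)
            os.unlink(path)
            shutil.copy2(target, path)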
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
new file mode 100755
index 00000000000..7fdfbcfe80c
--- /dev/null
+++ b/ci/docker_cache.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Utility to handle the distributed docker cache. This is done by keeping the entire image chain of a docker
+container in an S3 bucket. This utility supports both cache creation and download. After execution, the cache
+is in the same state as if the container had already been built locally.
+"""
+
+import os
+import logging
+import argparse
+import sys
+import boto3
+import tempfile
+import pprint
+import threading
+import build as build_util
+import botocore
+import subprocess
+from botocore.handlers import disable_signing
+from subprocess import call, check_call, CalledProcessError
+from joblib import Parallel, delayed
+
+S3_METADATA_IMAGE_ID_KEY = 'docker-image-id'
+LOG_PROGRESS_PERCENTAGE_THRESHOLD = 10
+
+cached_aws_session = None
+
+
+class ProgressPercentage(object):
+    def __init__(self, object_name, size):
+        self._object_name = object_name
+        self._size = size
+        self._seen_so_far = 0
+        self._last_percentage = 0
+        self._lock = threading.Lock()
+
+    def __call__(self, bytes_amount) -> None:
+        # To simplify, we'll assume this callback is hooked up
+        # to a single filename.
+        with self._lock:
+            self._seen_so_far += bytes_amount
+            percentage = int((self._seen_so_far / self._size) * 100)
+            if (percentage - self._last_percentage) >= LOG_PROGRESS_PERCENTAGE_THRESHOLD:
+                self._last_percentage = percentage
+                logging.info('{}% of {}'.format(percentage, self._object_name))
+
+
+def build_save_containers(platforms, bucket) -> int:
+    """
+    Entry point to build all docker images in parallel and upload their caches
+    :param platforms: List of platforms
+    :param bucket: S3 bucket name
+    :return: 1 if error occurred, 0 otherwise
+    """
+    if len(platforms) == 0:
+        return 0
+
+    platform_results = Parallel(n_jobs=len(platforms), backend="multiprocessing")(
+        delayed(_build_save_container)(platform, bucket)
+        for platform in platforms)
+
+    is_error = False
+    for platform_result in platform_results:
+        if platform_result is not None:
+            logging.error('Failed to generate {}'.format(platform_result))
+            is_error = True
+
+    return 1 if is_error else 0
+
+
+def _build_save_container(platform, bucket) -> str:
+    """
+    Build image for passed platform and upload the cache to the specified S3 bucket
+    :param platform: Platform
+    :param bucket: Target s3 bucket
+    :return: Platform if failed, None otherwise
+    """
+    docker_tag = build_util.get_docker_tag(platform)
+
+    # Preload cache
+    # TODO: Allow to disable this in order to allow clean rebuilds
+    load_docker_cache(bucket_name=bucket, docker_tag=docker_tag)
+
+    # Start building
+    logging.debug('Building {} as {}'.format(platform, docker_tag))
+    try:
+        image_id = build_util.build_docker(docker_binary='docker', platform=platform)
+        logging.info('Built {} as {}'.format(docker_tag, image_id))
+
+        # Compile and upload tarfile
+        _compile_upload_cache_file(bucket_name=bucket, docker_tag=docker_tag, image_id=image_id)
+        return None
+    except Exception:
+        logging.exception('Unexpected exception during build of {}'.format(docker_tag))
+        return platform
+        # Error handling is done by returning the name of the failing platform. This is necessary
+        # because Parallel is unable to propagate exceptions
+
+
+def _compile_upload_cache_file(bucket_name, docker_tag, image_id) -> None:
+    """
+    Save the image with the passed id, tag it with the docker tag and upload the tarball to the S3 bucket
+    :param bucket_name: S3 bucket name
+    :param docker_tag: Docker tag
+    :param image_id: Image id
+    :return: None
+    """
+    session = _get_aws_session()
+    s3_object = session.resource('s3').Object(bucket_name, docker_tag)
+
+    remote_image_id = _get_remote_image_id(s3_object)
+    if remote_image_id == image_id:
+        logging.info('{} ({}) has not been updated - skipping'.format(docker_tag, image_id))
+        return
+    else:
+        logging.debug('Cached image {} differs from local {} for {}'.format(remote_image_id, image_id, docker_tag))
+
+    # Compile layers into tarfile
+    with tempfile.TemporaryDirectory() as temp_dir:
+        tar_file_path = _format_docker_cache_filepath(output_dir=temp_dir, docker_tag=docker_tag)
+        logging.debug('Writing layers of {} to {}'.format(docker_tag, tar_file_path))
+        history_cmd = ['docker', 'history', '-q', docker_tag]
+
+        image_ids_b = subprocess.check_output(history_cmd)
+        image_ids_str = image_ids_b.decode('utf-8').strip()
+        layer_ids = [id.strip() for id in image_ids_str.split('\n') if id != '<missing>']
+
+        # docker_tag is important to preserve the image name. Otherwise, the --cache-from feature will not be able to
+        # reference the loaded cache later on. The other layer ids are added to ensure all intermediate layers
+        # are preserved, allowing the cache to be resumed at any point
+        cmd = ['docker', 'save', '-o', tar_file_path, docker_tag]
+        cmd.extend(layer_ids)
+        try:
+            check_call(cmd)
+        except CalledProcessError as e:
+            logging.error('Error during save of {} at {}. Command: {}'.
+                          format(docker_tag, tar_file_path, pprint.pformat(cmd)))
+            return
+
+        # Upload file
+        logging.info('Uploading {} to S3'.format(docker_tag))
+        with open(tar_file_path, 'rb') as data:
+            s3_object.upload_fileobj(
+                Fileobj=data,
+                Callback=ProgressPercentage(object_name=docker_tag, size=os.path.getsize(tar_file_path)),
+                ExtraArgs={"Metadata": {S3_METADATA_IMAGE_ID_KEY: image_id}})
+            logging.info('Uploaded {} to S3'.format(docker_tag))
+
+
+def _get_remote_image_id(s3_object) -> str:
+    """
+    Get the image id of the docker cache which is represented by the S3 object
+    :param s3_object: S3 object
+    :return: Image id as string or None if object does not exist
+    """
+    try:
+        if S3_METADATA_IMAGE_ID_KEY in s3_object.metadata:
+            cached_image_id = s3_object.metadata[S3_METADATA_IMAGE_ID_KEY]
+            return cached_image_id
+        else:
+            logging.debug('No cached image available for {}'.format(s3_object.key))
+    except botocore.exceptions.ClientError as e:
+        if e.response['Error']['Code'] == "404":
+            logging.debug('{} does not exist in S3 yet'.format(s3_object.key))
+        else:
+            raise
+
+    return None
+
+
+def load_docker_cache(bucket_name, docker_tag) -> None:
+    """
+    Load the precompiled docker cache from the passed S3 bucket
+    :param bucket_name: S3 bucket name
+    :param docker_tag: Docker tag to load
+    :return: None
+    """
+    # Allow anonymous access
+    s3_resource = boto3.resource('s3')
+    s3_resource.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
+    s3_object = s3_resource.Object(bucket_name, docker_tag)
+
+    # Check if cache is still valid and exists
+    remote_image_id = _get_remote_image_id(s3_object)
+    if remote_image_id:
+        if _docker_layer_exists(remote_image_id):
+            logging.info('Local docker cache already present for {}'.format(docker_tag))
+            return
+        else:
+            logging.info('Local docker cache not present for {}'.format(docker_tag))
+
+        # Download using public S3 endpoint (without requiring credentials)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            tar_file_path = os.path.join(temp_dir, 'layers.tar')
+            s3_object.download_file(
+                Filename=tar_file_path,
+                Callback=ProgressPercentage(object_name=docker_tag, size=s3_object.content_length))
+
+            # Load layers
+            cmd = ['docker', 'load', '-i', tar_file_path]
+            try:
+                check_call(cmd)
+                logging.info('Docker cache for {} loaded successfully'.format(docker_tag))
+            except CalledProcessError as e:
+                logging.error('Error during load of docker cache for {} at {}'.format(docker_tag, tar_file_path))
+                logging.exception(e)
+                return
+    else:
+        logging.info('No cached remote image of {} present'.format(docker_tag))
+
+
+def _docker_layer_exists(layer_id) -> bool:
+    """
+    Check if the docker cache contains the layer with the passed id
+    :param layer_id: layer id
+    :return: True if exists, False otherwise
+    """
+    cmd = ['docker', 'images', '-q']
+    image_ids_b = subprocess.check_output(cmd)
+    image_ids_str = image_ids_b.decode('utf-8').strip()
+    return layer_id in [id.strip() for id in image_ids_str.split('\n')]
+
+
+def _get_aws_session() -> boto3.Session:  # pragma: no cover
+    """
+    Get the boto3 AWS session
+    :return: Session object
+    """
+    global cached_aws_session
+    if cached_aws_session:
+        return cached_aws_session
+
+    session = boto3.Session()  # Uses IAM user credentials
+    cached_aws_session = session
+    return session
+
+
+def _format_docker_cache_filepath(output_dir, docker_tag) -> str:
+    return os.path.join(output_dir, docker_tag.replace('/', '_') + '.tar')
+
+
+def main() -> int:
+    # We need to be in the same directory as the script so the commands in the dockerfiles work as
+    # expected. But the script can be invoked from a different path
+    base = os.path.split(os.path.realpath(__file__))[0]
+    os.chdir(base)
+
+    logging.getLogger().setLevel(logging.DEBUG)
+    logging.getLogger('botocore').setLevel(logging.INFO)
+    logging.getLogger('boto3').setLevel(logging.INFO)
+    logging.getLogger('urllib3').setLevel(logging.INFO)
+    logging.getLogger('s3transfer').setLevel(logging.INFO)
+
+    def script_name() -> str:
+        return os.path.split(sys.argv[0])[1]
+
+    logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))
+
+    parser = argparse.ArgumentParser(description="Utility for preserving and loading Docker cache", epilog="")
+    parser.add_argument("--docker-cache-bucket",
+                        help="S3 docker cache bucket, e.g. mxnet-ci-docker-cache",
+                        type=str,
+                        required=True)
+
+    args = parser.parse_args()
+
+    platforms = build_util.get_platforms()
+    _get_aws_session()  # Init AWS credentials
+    return build_save_containers(platforms=platforms, bucket=args.docker_cache_bucket)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
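
Because load_docker_cache() registers the disable_signing handler, the cache download works without AWS credentials; only the upload path needs an IAM session. A hedged usage sketch from another script (the docker tag value here is only an example, not necessarily the exact format get_docker_tag() produces):

    import logging
    import docker_cache  # the ci/docker_cache.py module added above

    logging.getLogger().setLevel(logging.INFO)
    # Anonymous download: prime the local docker layer store for one platform.
    docker_cache.load_docker_cache(bucket_name='mxnet-ci-docker-cache',
                                   docker_tag='mxnet/build.ubuntu_cpu')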
diff --git a/ci/docker_cache_requirements b/ci/docker_cache_requirements
new file mode 100644
index 00000000000..47c16ff3b4a
--- /dev/null
+++ b/ci/docker_cache_requirements
@@ -0,0 +1,8 @@
+boto3==1.7.13
+botocore==1.10.13
+docutils==0.14
+jmespath==0.9.3
+joblib==0.11
+python-dateutil==2.7.2
+s3transfer==0.1.13
+six==1.11.0
diff --git a/cmake/FirstClassLangCuda.cmake b/cmake/FirstClassLangCuda.cmake
index f4dcbbe448d..a70d63e2a60 100644
--- a/cmake/FirstClassLangCuda.cmake
+++ b/cmake/FirstClassLangCuda.cmake
@@ -126,7 +126,7 @@ endif ()
 function(mshadow_select_nvcc_arch_flags out_variable)
 
   set(CUDA_ARCH_LIST "Auto" CACHE STRING "Select target NVIDIA GPU architecture.")
-  set_property( CACHE CUDA_ARCH_LIST PROPERTY STRINGS "" "All" "Common" ${CUDA_KNOWN_GPU_ARCHITECTURES} )
+  set_property( CACHE CUDA_ARCH_LIST PROPERTY STRINGS "" "Auto" "All" "Common" ${CUDA_KNOWN_GPU_ARCHITECTURES} )
   mark_as_advanced(CUDA_ARCH_NAME)
 
 
diff --git a/cmake/MklDnn.cmake b/cmake/MklDnn.cmake
new file mode 100644
index 00000000000..acaf878b2f4
--- /dev/null
+++ b/cmake/MklDnn.cmake
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file downloads MKLML
+
+message(STATUS "Downloading MKLML")
+if(MSVC)
+  set(MKL_NAME "mklml_win_2018.0.3.20180406")
+  file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/v0.14/${MKL_NAME}.zip" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.zip" EXPECTED_MD5 "8DD73E7D3F19F004551809824C4E8970" SHOW_PROGRESS)
+  file(DOWNLOAD "https://github.com/apache/incubator-mxnet/releases/download/utils/7z.exe" "${CMAKE_CURRENT_BINARY_DIR}/mklml/7z2.exe" EXPECTED_MD5 "E1CF766CF358F368EC97662D06EA5A4C" SHOW_PROGRESS)
+  
+  execute_process(COMMAND "${CMAKE_CURRENT_BINARY_DIR}/mklml/7z2.exe" "-o${CMAKE_CURRENT_BINARY_DIR}/mklml/" "-y")
+  execute_process(COMMAND "${CMAKE_CURRENT_BINARY_DIR}/mklml/7z.exe" "x" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.zip" "-o${CMAKE_CURRENT_BINARY_DIR}/mklml/" "-y")
+  set(MKLROOT "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}")
+  include_directories(${MKLROOT}/include)
+  file(COPY ${MKLROOT}/lib/libiomp5md.dll DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file(COPY ${MKLROOT}/lib/mklml.dll DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file(COPY ${CMAKE_SOURCE_DIR}/3rdparty/mkldnn/config_template.vcxproj.user DESTINATION ${CMAKE_SOURCE_DIR})
+elseif(UNIX)
+  set(MKL_NAME "mklml_lnx_2018.0.3.20180406")
+  file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/v0.14/${MKL_NAME}.tgz" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" EXPECTED_MD5 "DAF7EFC3C1C0036B447213004467A8AE" SHOW_PROGRESS)
+  execute_process(COMMAND "tar" "-xzf" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz" "-C" "${CMAKE_CURRENT_BINARY_DIR}/mklml/")
+  set(MKLROOT "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}")
+  include_directories(${MKLROOT}/include)
+  file(COPY ${MKLROOT}/lib/libiomp5.so DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file(COPY ${MKLROOT}/lib/libmklml_gnu.so DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+  file(COPY ${MKLROOT}/lib/libmklml_intel.so DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+else()
+  message(FATAL_ERROR "Platform not supported yet")
+endif()
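
The EXPECTED_MD5 arguments make the download fail outright if the archive is corrupted or was tampered with. For illustration, a small Python equivalent of that download-and-verify step, with the URL and checksum copied from the Linux branch above:

    import hashlib
    import urllib.request

    URL = ("https://github.com/intel/mkl-dnn/releases/download/"
           "v0.14/mklml_lnx_2018.0.3.20180406.tgz")
    EXPECTED_MD5 = "DAF7EFC3C1C0036B447213004467A8AE"

    def download_verified(url, dest, expected_md5):
        # Fetch the archive, then refuse to proceed on a checksum mismatch.
        urllib.request.urlretrieve(url, dest)
        with open(dest, "rb") as f:
            actual = hashlib.md5(f.read()).hexdigest()
        if actual.lower() != expected_md5.lower():
            raise RuntimeError("MD5 mismatch for {}".format(dest))

    # download_verified(URL, "/tmp/mklml.tgz", EXPECTED_MD5)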
diff --git a/cmake/Modules/FindAccelerate.cmake b/cmake/Modules/FindAccelerate.cmake
index 695538ac924..8bdc665a3aa 100644
--- a/cmake/Modules/FindAccelerate.cmake
+++ b/cmake/Modules/FindAccelerate.cmake
@@ -22,11 +22,13 @@
 #  Accelerate_INCLUDE_DIRS
 #  Accelerate_LIBRARIES
 
+file(TO_CMAKE_PATH "$ENV{Accelerate_HOME}" Accelerate_HOME)
 set(Accelerate_INCLUDE_SEARCH_PATHS
-  /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/
+  /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current
+  ${Accelerate_HOME}
 )
 
-find_path(Accelerate_CBLAS_INCLUDE_DIR NAMES cblas.h   PATHS ${Accelerate_INCLUDE_SEARCH_PATHS})
+find_path(Accelerate_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Accelerate_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES Headers)
 
 set(LOOKED_FOR
     Accelerate_CBLAS_INCLUDE_DIR
diff --git a/cpp-package/README.md b/cpp-package/README.md
index 9dd3b01ca3f..2b6e0e39f0f 100644
--- a/cpp-package/README.md
+++ b/cpp-package/README.md
@@ -4,4 +4,18 @@ To build the C++ package, please refer to [this guide](<https://mxnet.incubator.
 
 A basic tutorial can be found at <https://mxnet.incubator.apache.org/tutorials/c++/basics.html>.
 
-The example directory contains examples for you to get started. 
+The example directory contains examples for you to get started.
+
+## Building C++ examples in the examples folder
+
+From the cpp-package/examples directory:
+- Build all examples in release mode: **make all**
+- Build all examples in debug mode: **make debug**
+
+By default, the examples are built to run on GPU.
+To build the examples to run on CPU:
+- Release: **make all MXNET_USE_CPU=1**  
+- Debug: **make debug MXNET_USE_CPU=1**  
+
+
+The makefile also downloads the necessary data files and stores them in the data folder. (The download takes a couple of minutes, but is done only once on a fresh installation.)
diff --git a/cpp-package/example/Makefile b/cpp-package/example/Makefile
index 7c1216d1dbd..c83260c2671 100644
--- a/cpp-package/example/Makefile
+++ b/cpp-package/example/Makefile
@@ -15,19 +15,31 @@
 # specific language governing permissions and limitations
 # under the License.
 
+prebuild :
+	$(shell ./get_data.sh)
+	$(shell cp -r ../../lib ./)
 CPPEX_SRC = $(wildcard *.cpp)
 CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC))
 
-CFLAGS += -I../../include -I../../3rdparty/nnvm/include -I../../3rdparty/dmlc-core/include
-CPPEX_CFLAGS += -I../include
-CPPEX_EXTRA_LDFLAGS := -L. -lmxnet
+CFLAGS += -I../../include -I../../3rdparty/nnvm/include -I../../3rdparty/dmlc-core/include  -I../include
+
+ifeq ($(MXNET_USE_CPU),1)
+	CFLAGS += -D MXNET_USE_CPU
+endif
+
+# CPPEX_CFLAGS += -I../include
+CPPEX_EXTRA_LDFLAGS := -L../../lib -lmxnet
 
 .PHONY: all clean
 
-all: $(CPPEX_EXE)
+all: prebuild  $(CPPEX_EXE)
+
+debug: CPPEX_CFLAGS += -DDEBUG -g 
+debug: prebuild all
+
 
-$(CPPEX_EXE):% : %.cpp libmxnet.so ../include/mxnet-cpp/*.h
-	$(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS)
+$(CPPEX_EXE):% : %.cpp  
+	$(CXX) -std=c++0x $(CFLAGS)  $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS)
 
 clean:
 	rm -f $(CPPEX_EXE)
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp
index dd5d2b4b06d..3d6e6855b64 100644
--- a/cpp-package/example/alexnet.cpp
+++ b/cpp-package/example/alexnet.cpp
@@ -22,10 +22,11 @@
 #include <iostream>
 #include <map>
 #include <string>
+#include <fstream>
+#include <cstdlib>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-
-using namespace std;
 using namespace mxnet::cpp;
 
 Symbol AlexnetSymbol(int num_classes) {
@@ -198,17 +199,21 @@ Symbol AlexnetSymbol(int num_classes) {
 int main(int argc, char const *argv[]) {
   /*basic config*/
   int batch_size = 256;
-  int max_epo = 100;
+  int max_epo = argc > 1 ? strtol(argv[1], NULL, 10) : 100;
   float learning_rate = 1e-4;
   float weight_decay = 1e-4;
 
   /*context and net symbol*/
   auto ctx = Context::gpu();
+#if MXNET_USE_CPU
+  ctx = Context::cpu();
+#endif
+
   auto Net = AlexnetSymbol(10);
 
   /*args_map and aux_map is used for parameters' saving*/
-  map<string, NDArray> args_map;
-  map<string, NDArray> aux_map;
+  std::map<std::string, NDArray> args_map;
+  std::map<std::string, NDArray> aux_map;
 
   /*we should tell mxnet the shape of data and label*/
   args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx);
@@ -234,26 +239,24 @@ int main(int argc, char const *argv[]) {
     LG << s;
     const auto &k = args_map[s].GetShape();
     for (const auto &i : k) {
-      cout << i << " ";
+      std::cout << i << " ";
     }
-    cout << endl;
+    std::cout << std::endl;
   }
 
   /*these binary files should be generated using im2rec tools, which can be found
    * in mxnet/bin*/
-  auto train_iter = MXDataIter("ImageRecordIter")
-                        .SetParam("path_imglist", "./data/train_rec.lst")
-                        .SetParam("path_imgrec", "./data/train_rec.bin")
-                        .SetParam("data_shape", Shape(3, 256, 256))
-                        .SetParam("batch_size", batch_size)
-                        .SetParam("shuffle", 1)
-                        .CreateDataIter();
-  auto val_iter = MXDataIter("ImageRecordIter")
-                      .SetParam("path_imglist", "./data/val_rec.lst")
-                      .SetParam("path_imgrec", "./data/val_rec.bin")
-                      .SetParam("data_shape", Shape(3, 256, 256))
-                      .SetParam("batch_size", batch_size)
-                      .CreateDataIter();
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                "./data/mnist_data/train-labels-idx1-ubyte",
+                                "./data/mnist_data/t10k-images-idx3-ubyte",
+                                "./data/mnist_data/t10k-labels-idx1-ubyte"
+                              };
+
+  auto train_iter =  MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   Optimizer* opt = OptimizerRegistry::Find("ccsgd");
   opt->SetParam("momentum", 0.9)
@@ -306,11 +309,11 @@ int main(int argc, char const *argv[]) {
     LG << "ITER: " << iter << " Val LogLoss: " << logloss_val.Get();
 
     /*save the parameters*/
-    stringstream ss;
+    std::stringstream ss;
     ss << iter;
-    string iter_str;
+    std::string iter_str;
     ss >> iter_str;
-    string save_path_param = "./model/alex_param_" + iter_str;
+    std::string save_path_param = "alex_param_" + iter_str;
     auto save_args = args_map;
     /*we do not want to save the data and label*/
     save_args.erase(save_args.find("data"));
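
The example now feeds MNIST through MXDataIter("MNISTIter") plus the setDataIter() helper from utils.h instead of ImageRecord files. For reference, a minimal sketch of the equivalent iterator in the Python API, assuming the MNIST files sit in ./data/mnist_data as in the file list above:

    import mxnet as mx

    # Roughly mirrors what setDataIter(&train_iter, "Train", data_files, batch_size) sets up.
    train_iter = mx.io.MNISTIter(
        image="./data/mnist_data/train-images-idx3-ubyte",
        label="./data/mnist_data/train-labels-idx1-ubyte",
        batch_size=256,
        shuffle=True)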
diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp
index d28ae33c639..ad564f665a8 100644
--- a/cpp-package/example/charRNN.cpp
+++ b/cpp-package/example/charRNN.cpp
@@ -43,8 +43,6 @@
 #include <chrono>
 #include "mxnet-cpp/MxNetCpp.h"
 
-
-using namespace std;
 using namespace mxnet::cpp;
 
 struct LSTMState {
@@ -65,7 +63,7 @@ bool TIME_MAJOR = true;
 LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state,
     const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) {
   auto input = dropout > 0? Dropout(indata, dropout) : indata;
-  auto prefix = string("t") + to_string(seqidx) + "_l" + to_string(layeridx);
+  auto prefix = std::string("t") + std::to_string(seqidx) + "_l" + std::to_string(layeridx);
   auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias,
       num_hidden * 4);
   auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias,
@@ -93,10 +91,10 @@ Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
   auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed);
   auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 0 : 1, true) : embed;
 
-  vector<LSTMState> last_states;
-  vector<LSTMParam> param_cells;
+  std::vector<LSTMState> last_states;
+  std::vector<LSTMParam> param_cells;
   for (int l = 0; l < num_lstm_layer; l++) {
-    string layer = "l" + to_string(l);
+    std::string layer = "l" + std::to_string(l);
     LSTMParam param;
     param.i2h_weight = Symbol::Variable(layer + "_i2h_weight");
     param.i2h_bias = Symbol::Variable(layer + "_i2h_bias");
@@ -109,7 +107,7 @@ Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
     last_states.push_back(state);
   }
 
-  vector<Symbol> hidden_all;
+  std::vector<Symbol> hidden_all;
   for (int i = 0; i < sequence_length; i++) {
     auto hidden = wordvec[i];
     for (int layer = 0; layer < num_lstm_layer; layer++) {
@@ -136,7 +134,7 @@ Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
   if (isTrain)
     return sm;
 
-  vector<Symbol> outputs = { sm };
+  std::vector<Symbol> outputs = { sm };
   for (auto& state : last_states) {
     outputs.push_back(state.C);
     outputs.push_back(state.h);
@@ -189,14 +187,14 @@ Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_d
 }
 
 class Shuffler {
-  vector<int> sequence;
+  std::vector<int> sequence;
  public:
   explicit Shuffler(int size) : sequence(size) {
     int* p = sequence.data();
     for (int i = 0; i < size; i++)
       *p++ = i;
   }
-  void shuffle(function<void(int, int)> lambda = nullptr) {
+  void shuffle(std::function<void(int, int)> lambda = nullptr) {
     random_shuffle(sequence.begin(), sequence.end());
     int n = 0;
     if (lambda != nullptr)
@@ -213,12 +211,12 @@ class BucketSentenceIter : public DataIter {
   int batch, current, end;
   unsigned int sequence_length;
   Context device;
-  vector<vector<mx_float>> sequences;
-  vector<wchar_t> index2chars;
-  unordered_map<wchar_t, mx_float> charIndices;
+  std::vector<std::vector<mx_float>> sequences;
+  std::vector<wchar_t> index2chars;
+  std::unordered_map<wchar_t, mx_float> charIndices;
 
  public:
-  BucketSentenceIter(string filename, int minibatch, Context context) : batch(minibatch),
+  BucketSentenceIter(std::string filename, int minibatch, Context context) : batch(minibatch),
   current(-1), device(context) {
     auto content = readContent(filename);
     buildCharIndex(content);
@@ -226,13 +224,13 @@ class BucketSentenceIter : public DataIter {
 
     int N = sequences.size() / batch * batch;  // total used samples
     sequences.resize(N);
-    sort(sequences.begin(), sequences.end(), [](const vector<mx_float>& a,
-        const vector<mx_float>& b) { return a.size() < b.size(); });
+    sort(sequences.begin(), sequences.end(), [](const std::vector<mx_float>& a,
+        const std::vector<mx_float>& b) { return a.size() < b.size(); });
 
     sequence_length = sequences.back().size();
     random = new Shuffler(N);
     // We still can get random results if call Reset() firstly
-//    vector<vector<mx_float>>* target = &sequences;
+//    std::vector<vector<mx_float>>* target = &sequences;
 //    random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); });
     end = N / batch;
   }
@@ -286,7 +284,7 @@ class BucketSentenceIter : public DataIter {
   }
   virtual std::vector<int> GetIndex(void) {
     const int* indices = random->data();
-    vector<int> list(indices + current * batch, indices + current * batch + batch);
+    std::vector<int> list(indices + current * batch, indices + current * batch + batch);
     return list;
   }
   virtual void BeforeFirst(void) {
@@ -294,17 +292,17 @@ class BucketSentenceIter : public DataIter {
     random->shuffle(nullptr);
   }
 
-  wstring readContent(const string file) {
-    wifstream ifs(file, ios::binary);
+  std::wstring readContent(const std::string file) {
+    std::wifstream ifs(file, std::ios::binary);
     if (ifs) {
-      wostringstream os;
+      std::wostringstream os;
       os << ifs.rdbuf();
       return os.str();
     }
     return L"";
   }
 
-  void buildCharIndex(const wstring& content) {
+  void buildCharIndex(const std::wstring& content) {
   // This version of buildCharIndex() is compatible with the Python char_rnn dictionary
     int n = 1;
     charIndices['\0'] = 0;  // padding character
@@ -318,7 +316,7 @@ class BucketSentenceIter : public DataIter {
 //  void buildCharIndex(wstring& content) {
 //    for (auto c : content)
 //      charIndices[c]++; // char-frequency map; then char-index map
-//    vector<tuple<wchar_t, mx_float>> characters;
+//    std::vector<tuple<wchar_t, mx_float>> characters;
 //    for (auto& iter : charIndices)
 //      characters.push_back(make_tuple(iter.first, iter.second));
 //    sort(characters.begin(), characters.end(), [](const tuple<wchar_t, mx_float>& a,
@@ -329,7 +327,7 @@ class BucketSentenceIter : public DataIter {
 //    for (auto& t : characters) {
 //      charIndices[get<0>(t)] = index++;
 //      index2chars.push_back(get<0>(t));
-//    }
+//    }
 //  }
 
   inline wchar_t character(int i) {
@@ -340,21 +338,21 @@ class BucketSentenceIter : public DataIter {
     return charIndices[c];
   }
 
-  void saveCharIndices(const string file) {
-    wofstream ofs(file, ios::binary);
+  void saveCharIndices(const std::string file) {
+    std::wofstream ofs(file, std::ios::binary);
     if (ofs) {
       ofs.write(index2chars.data() + 1, index2chars.size() - 1);
       ofs.close();
     }
   }
 
-  static tuple<unordered_map<wchar_t, mx_float>, vector<wchar_t>> loadCharIndices(
-      const string file) {
-    wifstream ifs(file, ios::binary);
-    unordered_map<wchar_t, mx_float> map;
-    vector<wchar_t> chars;
+  static std::tuple<std::unordered_map<wchar_t, mx_float>, std::vector<wchar_t>> loadCharIndices(
+      const std::string file) {
+    std::wifstream ifs(file, std::ios::binary);
+    std::unordered_map<wchar_t, mx_float> map;
+    std::vector<wchar_t> chars;
     if (ifs) {
-      wostringstream os;
+      std::wostringstream os;
       os << ifs.rdbuf();
       int n = 1;
       map[L'\0'] = 0;
@@ -365,15 +363,16 @@ class BucketSentenceIter : public DataIter {
       }
     }
     // Note: Can't use {} because this would hit the explicit constructor
-    return tuple<unordered_map<wchar_t, mx_float>, vector<wchar_t>>(map, chars);
+    return std::tuple<std::unordered_map<wchar_t, mx_float>, std::vector<wchar_t>>(map, chars);
   }
 
-  vector<vector<mx_float>> convertTextToSequences(const wstring& content, wchar_t spliter) {
-    vector<vector<mx_float>> sequences;
-    sequences.push_back(vector<mx_float>());
+  std::vector<std::vector<mx_float>>
+  convertTextToSequences(const std::wstring& content, wchar_t spliter) {
+    std::vector<std::vector<mx_float>> sequences;
+    sequences.push_back(std::vector<mx_float>());
     for (auto c : content)
       if (c == spliter && !sequences.back().empty())
-        sequences.push_back(vector<mx_float>());
+        sequences.push_back(std::vector<mx_float>());
       else
         sequences.back().push_back(charIndices[c]);
     return sequences;
@@ -381,7 +380,7 @@ class BucketSentenceIter : public DataIter {
 };
 
 void OutputPerplexity(NDArray* labels, NDArray* output) {
-  vector<mx_float> charIndices, a;
+  std::vector<mx_float> charIndices, a;
   labels->SyncCopyToCPU(&charIndices, 0L);  // 0L indicates all
   output->SyncCopyToCPU(&a, 0L)/*4128*84*/;
   mx_float loss = 0;
@@ -390,18 +389,18 @@ void OutputPerplexity(NDArray* labels, NDArray* output) {
   for (int n = 0; n < nSamples; n++) {
     int row = n % batchSize, column = n / batchSize, labelOffset = column +
         row * sequenceLength;  // Search based on column storage: labels.T
-    mx_float safe_value = max(1e-10f, a[vocabSize * n +
+    mx_float safe_value = std::max(1e-10f, a[vocabSize * n +
                                     static_cast<int>(charIndices[labelOffset])]);
     loss += -log(safe_value);  // Calculate negative log-likelihood
   }
   loss = exp(loss / nSamples);
-  cout << "Train-Perplexity=" << loss << endl;
+  std::cout << "Train-Perplexity=" << loss << std::endl;
 }
 
-void SaveCheckpoint(const string filepath, Symbol net, Executor* exe) {
-  map<string, NDArray> params;
+void SaveCheckpoint(const std::string filepath, Symbol net, Executor* exe) {
+  std::map<std::string, NDArray> params;
   for (auto iter : exe->arg_dict())
-    if (iter.first.find("_init_") == string::npos
+    if (iter.first.find("_init_") == std::string::npos
         && iter.first.rfind("data") != iter.first.length() - 4
         && iter.first.rfind("label") != iter.first.length() - 5)
       params.insert({"arg:" + iter.first, iter.second});
@@ -410,11 +409,11 @@ void SaveCheckpoint(const string filepath, Symbol net, Executor* exe) {
   NDArray::Save(filepath, params);
 }
 
-void LoadCheckpoint(const string filepath, Executor* exe) {
-  map<std::string, NDArray> params = NDArray::LoadToMap(filepath);
+void LoadCheckpoint(const std::string filepath, Executor* exe) {
+  std::map<std::string, NDArray> params = NDArray::LoadToMap(filepath);
   for (auto iter : params) {
-    string type = iter.first.substr(0, 4);
-    string name = iter.first.substr(4);
+    std::string type = iter.first.substr(0, 4);
+    std::string name = iter.first.substr(4);
     NDArray target;
     if (type == "arg:")
       target = exe->arg_dict()[name];
@@ -432,10 +431,10 @@ int num_embed = 256;
 int num_lstm_layer = 3;
 int num_hidden = 512;
 mx_float dropout = 0.2;
-void train(const string file, int batch_size, int max_epoch, int start_epoch) {
+void train(const std::string file, int batch_size, int max_epoch, int start_epoch) {
   Context device(DeviceType::kGPU, 0);
   BucketSentenceIter dataIter(file, batch_size, device);
-  string prefix = file.substr(0, file.rfind("."));
+  std::string prefix = file.substr(0, file.rfind("."));
   dataIter.saveCharIndices(prefix + ".dictionary");
 
   input_dim = static_cast<int>(dataIter.characterSize());
@@ -443,15 +442,15 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) {
 
   auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden,
       num_embed, dropout);
-  map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
   args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
   args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
   for (int i = 0; i < num_lstm_layer; i++) {
-    string key = "l" + to_string(i) + "_init_";
+    std::string key = "l" + std::to_string(i) + "_init_";
     args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false);
     args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false);
   }
-  vector<mx_float> zeros(batch_size * num_hidden, 0);
+  std::vector<mx_float> zeros(batch_size * num_hidden, 0);
   // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}});
   Executor* exe = RNN.SimpleBind(device, args_map);
 
@@ -460,7 +459,7 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) {
     for (auto &arg : exe->arg_dict())
       xavier(arg.first, &arg.second);
   } else {
-    LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe);
+    LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe);
   }
   start_epoch++;
 
@@ -474,13 +473,13 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) {
 
   for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
     dataIter.Reset();
-    auto tic = chrono::system_clock::now();
+    auto tic = std::chrono::system_clock::now();
     while (dataIter.Next()) {
       auto data_batch = dataIter.GetDataBatch();
       data_batch.data.CopyTo(&exe->arg_dict()["data"]);
       data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
       for (int l = 0; l < num_lstm_layer; l++) {
-        string key = "l" + to_string(l) + "_init_";
+        std::string key = "l" + std::to_string(l) + "_init_";
         exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros);
         exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros);
       }
@@ -494,11 +493,11 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) {
 
       NDArray::WaitAll();
     }
-    auto toc = chrono::system_clock::now();
-    cout << "Epoch[" << epoch << "] Time Cost:" <<
-        chrono::duration_cast<chrono::seconds>(toc - tic).count() << " seconds ";
+    auto toc = std::chrono::system_clock::now();
+    std::cout << "Epoch[" << epoch << "] Time Cost:" <<
+        std::chrono::duration_cast<std::chrono::seconds>(toc - tic).count() << " seconds ";
     OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
-    string filepath = prefix + "-" + to_string(epoch) + ".params";
+    std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
     SaveCheckpoint(filepath, RNN, exe);
   }
 }
@@ -520,10 +519,10 @@ class RNNXavier : public Xavier {
   }
 };
 
-void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int start_epoch) {
+void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch, int start_epoch) {
   Context device(DeviceType::kGPU, 0);
   BucketSentenceIter dataIter(file, batch_size, device);
-  string prefix = file.substr(0, file.rfind("."));
+  std::string prefix = file.substr(0, file.rfind("."));
   dataIter.saveCharIndices(prefix + ".dictionary");
 
   input_dim = static_cast<int>(dataIter.characterSize());
@@ -531,13 +530,13 @@ void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int
 
   auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden,
       num_embed, dropout);
-  map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
   args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
   // To avoid SwapAxis, batch_size is the second dimension.
   args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
   args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
   args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
-  vector<mx_float> zeros(batch_size * num_lstm_layer * num_hidden, 0);
+  std::vector<mx_float> zeros(batch_size * num_lstm_layer * num_hidden, 0);
   Executor* exe = RNN.SimpleBind(device, args_map);
 
   if (start_epoch == -1) {
@@ -545,7 +544,7 @@ void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int
     for (auto &arg : exe->arg_dict())
       xavier(arg.first, &arg.second);
   } else {
-    LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe);
+    LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe);
   }
   start_epoch++;
 
@@ -555,7 +554,7 @@ void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int
 
   for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
     dataIter.Reset();
-    auto tic = chrono::system_clock::now();
+    auto tic = std::chrono::system_clock::now();
     while (dataIter.Next()) {
       auto data_batch = dataIter.GetDataBatch();
       data_batch.data.CopyTo(&exe->arg_dict()["data"]);
@@ -571,30 +570,30 @@ void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int
       }
       NDArray::WaitAll();
     }
-    auto toc = chrono::system_clock::now();
-    cout << "Epoch[" << epoch << "] Time Cost:" <<
-        chrono::duration_cast<chrono::seconds>(toc - tic).count() << " seconds ";
+    auto toc = std::chrono::system_clock::now();
+    std::cout << "Epoch[" << epoch << "] Time Cost:" <<
+        std::chrono::duration_cast<std::chrono::seconds>(toc - tic).count() << " seconds ";
     OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
-    string filepath = prefix + "-" + to_string(epoch) + ".params";
+    std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
     SaveCheckpoint(filepath, RNN, exe);
   }
 }
 
-void predict(wstring* ptext, int sequence_length, const string param_file,
-    const string dictionary_file) {
+void predict(std::wstring* ptext, int sequence_length, const std::string param_file,
+    const std::string dictionary_file) {
   Context device(DeviceType::kGPU, 0);
   auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
-  auto dictionary = get<0>(results);
-  auto charIndices = get<1>(results);
+  auto dictionary = std::get<0>(results);
+  auto charIndices = std::get<1>(results);
   input_dim = static_cast<int>(charIndices.size());
   auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);
 
-  map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
   args_map["data"] = NDArray(Shape(1, 1), device, false);
   args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
-  vector<mx_float> zeros(1 * num_hidden, 0);
+  std::vector<mx_float> zeros(1 * num_hidden, 0);
   for (int l = 0; l < num_lstm_layer; l++) {
-    string key = "l" + to_string(l) + "_init_";
+    std::string key = "l" + std::to_string(l) + "_init_";
     args_map[key + "c"] = NDArray(Shape(1, num_hidden), device, false);
     args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false);
     args_map[key + "c"].SyncCopyFromCPU(zeros);
@@ -605,7 +604,7 @@ void predict(wstring* ptext, int sequence_length, const string param_file,
 
   mx_float index;
   wchar_t next = 0;
-  vector<mx_float> softmax;
+  std::vector<mx_float> softmax;
   softmax.resize(input_dim);
   for (auto c : *ptext) {
     exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
@@ -613,7 +612,7 @@ void predict(wstring* ptext, int sequence_length, const string param_file,
 
     exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
     for (int l = 0; l < num_lstm_layer; l++) {
-      string key = "l" + to_string(l) + "_init_";
+      std::string key = "l" + std::to_string(l) + "_init_";
       exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
       exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
     }
@@ -630,7 +629,7 @@ void predict(wstring* ptext, int sequence_length, const string param_file,
 
     exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
     for (int l = 0; l < num_lstm_layer; l++) {
-      string key = "l" + to_string(l) + "_init_";
+      std::string key = "l" + std::to_string(l) + "_init_";
       exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
       exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
     }
@@ -642,19 +641,19 @@ void predict(wstring* ptext, int sequence_length, const string param_file,
   }
 }
 
-void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string param_file,
-  const string dictionary_file) {
+void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file,
+  const std::string dictionary_file) {
   Context device(DeviceType::kGPU, 0);
   auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
-  auto dictionary = get<0>(results);
-  auto charIndices = get<1>(results);
+  auto dictionary = std::get<0>(results);
+  auto charIndices = std::get<1>(results);
   input_dim = static_cast<int>(charIndices.size());
   auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);
 
-  map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
   args_map["data"] = NDArray(Shape(1, 1), device, false);
   args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
-  vector<mx_float> zeros(1 * num_lstm_layer * num_hidden, 0);
+  std::vector<mx_float> zeros(1 * num_lstm_layer * num_hidden, 0);
   // To avoid SwapAxis, batch_size=1 is the second dimension.
   args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
   args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
@@ -665,7 +664,7 @@ void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string p
 
   mx_float index;
   wchar_t next = 0;
-  vector<mx_float> softmax;
+  std::vector<mx_float> softmax;
   softmax.resize(input_dim);
   for (auto c : *ptext) {
     exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
@@ -698,23 +697,23 @@ void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string p
 
 int main(int argc, char** argv) {
   if (argc < 5) {
-    cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}"
-            " {batch size} {max epoch} [{starting epoch}]" << endl;
-    cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}"
-            " {dictionary file} {beginning of text}" << endl;
-    cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor"
-            " are not compatible with each other." << endl;
+    std::cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}"
+            " {batch size} {max epoch} [{starting epoch}]" << std::endl;
+    std::cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}"
+            " {dictionary file} {beginning of text}" << std::endl;
+    std::cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor"
+            " are not compatible with each other." << std::endl;
     return 0;
   }
 
-  string task = argv[1];
-  bool builtIn = task.find("BuiltIn") != string::npos;
-  TIME_MAJOR = task.find("TimeMajor") != string::npos;
-  cout << "use BuiltIn cuDNN RNN: " << builtIn << endl
-         << "use data as TimeMajor: " << TIME_MAJOR << endl;
+  std::string task = argv[1];
+  bool builtIn = task.find("BuiltIn") != std::string::npos;
+  TIME_MAJOR = task.find("TimeMajor") != std::string::npos;
+  std::cout << "use BuiltIn cuDNN RNN: " << builtIn << std::endl
+         << "use data as TimeMajor: " << TIME_MAJOR << std::endl;
   if (task.find("train") == 0) {
-    cout << "train batch size:      " << argv[3] << endl
-           << "train max epoch:       " << argv[4] << endl;
+    std::cout << "train batch size:      " << argv[3] << std::endl
+           << "train max epoch:       " << argv[4] << std::endl;
     int start_epoch = argc > 5? atoi(argv[5]) : -1;
    // this function will generate the dictionary file and the params file.
     if (builtIn)
@@ -722,9 +721,9 @@ int main(int argc, char** argv) {
     else
       train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch);  // ditto
   } else if (task.find("predict") == 0) {
-    wstring text;  // = L"If there is anyone out there who still doubts ";
+    std::wstring text;  // = L"If there is anyone out there who still doubts ";
     // Considering extending to Chinese samples in the future, use wchar_t instead of char
-    for (char c : string(argv[4]))
+    for (char c : std::string(argv[4]))
       text.push_back((wchar_t) c);
    /*The Python version predicts text defaulting to random selections. Here I didn't write the random
    code and always choose the 'best' character. So the text length is reduced to 600. Longer size often
@@ -733,7 +732,7 @@ int main(int argc, char** argv) {
       predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]);
     else
       predict(&text, 600, argv[2], argv[3]);
-    wcout << text << endl;
+    std::wcout << text << std::endl;
   }
 
   MXNotifyShutdown();
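
For reference, the Train-Perplexity printed by OutputPerplexity near the top of this
file's diff is the exponential of the accumulated negative log-likelihood divided by
nSamples, i.e. in LaTeX

    \mathrm{PPL} = \exp\Big(\frac{1}{\text{nSamples}} \sum_i -\log p_i\Big)

where p_i (safe_value in the loop) is the probability the model assigned to the
correct character at position i. Lower is better; a perplexity of 1.0 would mean the
model predicts every character with certainty.
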
diff --git a/cpp-package/example/get_data.sh b/cpp-package/example/get_data.sh
new file mode 100755
index 00000000000..7f975222d0b
--- /dev/null
+++ b/cpp-package/example/get_data.sh
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+unameOut="$(uname -s)"
+case "${unameOut}" in
+    Linux*)     CMD='wget -O';;
+    Darwin*)    CMD='curl -o';;
+    CYGWIN*)    CMD='wget -O';;
+    MINGW*)     CMD='wget -O';;
+    *)          CMD=""
+esac
+
+if [ ! -d "./data" ]; then
+    mkdir data
+fi
+
+if [ ! -d "./data/mnist_data" ]; then
+  mkdir ./data/mnist_data
+
+  (cd data/mnist_data; $CMD train-images-idx3-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz)
+  (cd data/mnist_data; $CMD train-labels-idx1-ubyte.gz https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz)
+  (cd data/mnist_data; $CMD t10k-images-idx3-ubyte.gz  https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz)
+  (cd data/mnist_data; $CMD t10k-labels-idx1-ubyte.gz  https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz)
+  (cd data/mnist_data; $CMD mnist_train.csv.gz         http://data.mxnet.io/data/mnist_train.csv.gz)
+  (cd data/mnist_data; gzip -d *.gz)
+fi
+
+
+
diff --git a/cpp-package/example/get_mnist.sh b/cpp-package/example/get_mnist.sh
deleted file mode 100755
index 40379621025..00000000000
--- a/cpp-package/example/get_mnist.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if [ ! -d "./mnist_data" ]; then
-  mkdir mnist_data
-  (cd mnist_data; wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz)
-  (cd mnist_data; wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz)
-  (cd mnist_data; wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz)
-  (cd mnist_data; wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz)
-  (cd mnist_data; gzip -d *.gz)
-fi
-echo "Data downloaded"
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
index fe5dea6a1f5..ad9212c75be 100644
--- a/cpp-package/example/googlenet.cpp
+++ b/cpp-package/example/googlenet.cpp
@@ -22,9 +22,10 @@
 #include <string>
 #include <vector>
 #include <map>
+#include <fstream>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-
 using namespace mxnet::cpp;
 
 Symbol ConvFactory(Symbol data, int num_filter,
@@ -114,32 +115,34 @@ Symbol GoogleNetSymbol(int num_classes) {
 
 int main(int argc, char const *argv[]) {
   int batch_size = 50;
-  int max_epoch = 100;
+  int max_epoch = argc > 1 ? strtol(argv[1], NULL, 10) : 100;
   float learning_rate = 1e-4;
   float weight_decay = 1e-4;
 
+  auto ctx = Context::gpu();
+#if MXNET_USE_CPU
+  ctx = Context::cpu();
+#endif
+
   auto googlenet = GoogleNetSymbol(10);
   std::map<std::string, NDArray> args_map;
   std::map<std::string, NDArray> aux_map;
 
-  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu());
-  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
-  googlenet.InferArgsMap(Context::gpu(), &args_map, args_map);
-
-  auto train_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./train.lst")
-      .SetParam("path_imgrec", "./train.rec")
-      .SetParam("data_shape", Shape(3, 256, 256))
-      .SetParam("batch_size", batch_size)
-      .SetParam("shuffle", 1)
-      .CreateDataIter();
-
-  auto val_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./val.lst")
-      .SetParam("path_imgrec", "./_val.rec")
-      .SetParam("data_shape", Shape(3, 256, 256))
-      .SetParam("batch_size", batch_size)
-      .CreateDataIter();
+  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx);
+  args_map["data_label"] = NDArray(Shape(batch_size), ctx);
+  googlenet.InferArgsMap(ctx, &args_map, args_map);
+
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   Optimizer* opt = OptimizerRegistry::Find("ccsgd");
   opt->SetParam("momentum", 0.9)
@@ -149,7 +152,7 @@ int main(int argc, char const *argv[]) {
      ->SetParam("wd", weight_decay);
 
 
-  auto *exec = googlenet.SimpleBind(Context::gpu(), args_map);
+  auto *exec = googlenet.SimpleBind(ctx, args_map);
   auto arg_names = googlenet.ListArguments();
 
   for (int iter = 0; iter < max_epoch; ++iter) {
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp
index e6f47904e0e..c499df77e1e 100644
--- a/cpp-package/example/inception_bn.cpp
+++ b/cpp-package/example/inception_bn.cpp
@@ -21,10 +21,11 @@
  */
 #include <map>
 #include <string>
+#include <fstream>
 #include <vector>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-
 using namespace mxnet::cpp;
 
 Symbol ConvFactoryBN(Symbol data, int num_filter,
@@ -146,28 +147,30 @@ int main(int argc, char const *argv[]) {
   float learning_rate = 1e-4;
   float weight_decay = 1e-4;
 
+  auto ctx = Context::gpu();
+#if MXNET_USE_CPU
+  ctx = Context::cpu();
+#endif
+
   auto inception_bn_net = InceptionSymbol(10);
   std::map<std::string, NDArray> args_map;
   std::map<std::string, NDArray> aux_map;
 
-  args_map["data"] = NDArray(Shape(batch_size, 3, 224, 224), Context::gpu());
-  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
-  inception_bn_net.InferArgsMap(Context::gpu(), &args_map, args_map);
-
-  auto train_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./train.lst")
-      .SetParam("path_imgrec", "./train.rec")
-      .SetParam("data_shape", Shape(3, 224, 224))
-      .SetParam("batch_size", batch_size)
-      .SetParam("shuffle", 1)
-      .CreateDataIter();
-
-  auto val_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./val.lst")
-      .SetParam("path_imgrec", "./val.rec")
-      .SetParam("data_shape", Shape(3, 224, 224))
-      .SetParam("batch_size", batch_size)
-      .CreateDataIter();
+  args_map["data"] = NDArray(Shape(batch_size, 3, 224, 224), ctx);
+  args_map["data_label"] = NDArray(Shape(batch_size), ctx);
+  inception_bn_net.InferArgsMap(ctx, &args_map, args_map);
+
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   Optimizer* opt = OptimizerRegistry::Find("ccsgd");
   opt->SetParam("momentum", 0.9)
@@ -176,7 +179,7 @@ int main(int argc, char const *argv[]) {
      ->SetParam("lr", learning_rate)
      ->SetParam("wd", weight_decay);
 
-  auto *exec = inception_bn_net.SimpleBind(Context::gpu(), args_map);
+  auto *exec = inception_bn_net.SimpleBind(ctx, args_map);
   auto arg_names = inception_bn_net.ListArguments();
 
   for (int iter = 0; iter < max_epoch; ++iter) {
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
index 4c5a1f1165c..83c659c7082 100644
--- a/cpp-package/example/lenet.cpp
+++ b/cpp-package/example/lenet.cpp
@@ -23,18 +23,23 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <cstdlib>
 #include "mxnet-cpp/MxNetCpp.h"
 
-
-using namespace std;
 using namespace mxnet::cpp;
 
 class Lenet {
  public:
   Lenet()
       : ctx_cpu(Context(DeviceType::kCPU, 0)),
-        ctx_dev(Context(DeviceType::kGPU, 0)) {}
-  void Run() {
+#if MXNET_USE_CPU
+        ctx_dev(Context(DeviceType::kCPU, 0))
+#else
+        ctx_dev(Context(DeviceType::kGPU, 0))
+#endif
+        {}
+
+  void Run(int max_epoch) {
     /*
      * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
      * "Gradient-based learning applied to document recognition."
@@ -84,12 +89,11 @@ class Lenet {
     int W = 28;
     int H = 28;
     int batch_size = 42;
-    int max_epoch = 100000;
     float learning_rate = 1e-4;
     float weight_decay = 1e-4;
 
     /*prepare the data*/
-    vector<float> data_vec, label_vec;
+    std::vector<float> data_vec, label_vec;
     size_t data_count = GetData(&data_vec, &label_vec);
     const float *dptr = data_vec.data();
     const float *lptr = label_vec.data();
@@ -174,21 +178,21 @@ class Lenet {
  private:
   Context ctx_cpu;
   Context ctx_dev;
-  map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
   NDArray train_data;
   NDArray train_label;
   NDArray val_data;
   NDArray val_label;
 
-  size_t GetData(vector<float> *data, vector<float> *label) {
-    const char *train_data_path = "./train.csv";
-    ifstream inf(train_data_path);
-    string line;
+  size_t GetData(std::vector<float> *data, std::vector<float> *label) {
+    const char *train_data_path = "./data/mnist_data/mnist_train.csv";
+    std::ifstream inf(train_data_path);
+    std::string line;
     inf >> line;  // ignore the header
     size_t _N = 0;
     while (inf >> line) {
       for (auto &c : line) c = (c == ',') ? ' ' : c;
-      stringstream ss;
+      std::stringstream ss;
       ss << line;
       float _data;
       ss >> _data;
@@ -253,7 +257,7 @@ class Lenet {
 
 int main(int argc, char const *argv[]) {
   Lenet lenet;
-  lenet.Run();
+  lenet.Run(argc > 1 ? strtol(argv[1], NULL, 10) : 100000);
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
index 04f5cbca3a9..9869356be08 100644
--- a/cpp-package/example/lenet_with_mxdataiter.cpp
+++ b/cpp-package/example/lenet_with_mxdataiter.cpp
@@ -22,11 +22,12 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <fstream>
 #include <chrono>
+#include <cstdlib>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-
-using namespace std;
 using namespace mxnet::cpp;
 
 Symbol LenetSymbol() {
@@ -70,33 +71,36 @@ int main(int argc, char const *argv[]) {
   int W = 28;
   int H = 28;
   int batch_size = 128;
-  int max_epoch = 100;
+  int max_epoch = argc > 1 ? strtol(argv[1], NULL, 10) : 100;
   float learning_rate = 1e-4;
   float weight_decay = 1e-4;
-
+  auto dev_ctx = Context::gpu();
+#if MXNET_USE_CPU
+  dev_ctx = Context::cpu();
+#endif
   auto lenet = LenetSymbol();
-  std::map<string, NDArray> args_map;
+  std::map<std::string, NDArray> args_map;
 
-  args_map["data"] = NDArray(Shape(batch_size, 1, W, H), Context::gpu());
-  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
-  lenet.InferArgsMap(Context::gpu(), &args_map, args_map);
+  args_map["data"] = NDArray(Shape(batch_size, 1, W, H), dev_ctx);
+  args_map["data_label"] = NDArray(Shape(batch_size), dev_ctx);
+  lenet.InferArgsMap(dev_ctx, &args_map, args_map);
 
-  args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), Context::gpu());
+  args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), dev_ctx);
   NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]);
-  args_map["fc2_b"] = NDArray(Shape(10), Context::gpu());
+  args_map["fc2_b"] = NDArray(Shape(10), dev_ctx);
   args_map["fc2_b"] = 0;
 
-  auto train_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/train-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/train-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("shuffle", 1)
-      .SetParam("flat", 0)
-      .CreateDataIter();
-  auto val_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/t10k-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/t10k-labels-idx1-ubyte")
-      .CreateDataIter();
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   Optimizer* opt = OptimizerRegistry::Find("ccsgd");
   opt->SetParam("momentum", 0.9)
@@ -106,7 +110,7 @@ int main(int argc, char const *argv[]) {
      ->SetParam("wd", weight_decay);
 
 
-  auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
+  auto *exec = lenet.SimpleBind(dev_ctx, args_map);
   auto arg_names = lenet.ListArguments();
 
   // Create metrics
@@ -117,7 +121,7 @@ int main(int argc, char const *argv[]) {
       train_iter.Reset();
       train_acc.Reset();
 
-      auto tic = chrono::system_clock::now();
+      auto tic = std::chrono::system_clock::now();
 
      while (train_iter.Next()) {
       samples += batch_size;
@@ -142,8 +146,9 @@ int main(int argc, char const *argv[]) {
     }
 
      // one epoch of training is finished
-     auto toc = chrono::system_clock::now();
-     float duration = chrono::duration_cast<chrono::milliseconds>(toc - tic).count() / 1000.0;
+     auto toc = std::chrono::system_clock::now();
+     float duration = std::chrono::duration_cast<std::chrono::milliseconds>
+                      (toc - tic).count() / 1000.0;
      LG << "Epoch[" << iter << "] " << samples / duration \
          << " samples/sec " << "Train-Accuracy=" << train_acc.Get();;
 
diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
index e12e25d7cb4..595d75c67c0 100644
--- a/cpp-package/example/mlp.cpp
+++ b/cpp-package/example/mlp.cpp
@@ -25,8 +25,6 @@
 #include <string>
 #include "mxnet-cpp/MxNetCpp.h"
 
-
-using namespace std;
 using namespace mxnet::cpp;
 
 /*
@@ -48,7 +46,7 @@ void OutputAccuracy(mx_float* pred, mx_float* target) {
     }
     if (p_y == target[i]) right++;
   }
-  cout << "Accuracy: " << right / 128.0 << endl;
+  std::cout << "Accuracy: " << right / 128.0 << std::endl;
 }
 
 void MLP() {
@@ -56,20 +54,20 @@ void MLP() {
   auto sym_label = Symbol::Variable("label");
 
   const int nLayers = 2;
-  vector<int> layerSizes({512, 10});
-  vector<Symbol> weights(nLayers);
-  vector<Symbol> biases(nLayers);
-  vector<Symbol> outputs(nLayers);
+  std::vector<int> layerSizes({512, 10});
+  std::vector<Symbol> weights(nLayers);
+  std::vector<Symbol> biases(nLayers);
+  std::vector<Symbol> outputs(nLayers);
 
   Symbol null_sym;
   for (int i = 0; i < nLayers; i++) {
-    string istr = to_string(i);
-    weights[i] = Symbol::Variable(string("w") + istr);
-    biases[i] = Symbol::Variable(string("b") + istr);
-    Symbol fc = FullyConnected(string("fc") + istr,
+    std::string istr = std::to_string(i);
+    weights[i] = Symbol::Variable(std::string("w") + istr);
+    biases[i] = Symbol::Variable(std::string("b") + istr);
+    Symbol fc = FullyConnected(std::string("fc") + istr,
       i == 0? sym_x : outputs[i-1],
       weights[i], biases[i], layerSizes[i]);
-    outputs[i] = LeakyReLU(string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky);
+    outputs[i] = LeakyReLU(std::string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky);
   }
   auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);
 
@@ -141,18 +139,18 @@ void MLP() {
   grad_req_type.push_back(kNullOp);
   std::vector<NDArray> aux_states;
 
-  cout << "make the Executor" << endl;
+  std::cout << "make the Executor" << std::endl;
   Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
                                grad_req_type, aux_states);
 
-  cout << "Training" << endl;
+  std::cout << "Training" << std::endl;
   int max_iters = 20000;
   mx_float learning_rate = 0.0001;
   for (int iter = 0; iter < max_iters; ++iter) {
     exe->Forward(true);
 
     if (iter % 100 == 0) {
-      cout << "epoch " << iter << endl;
+      std::cout << "epoch " << iter << std::endl;
       std::vector<NDArray>& out = exe->outputs;
       float* cptr = new float[128 * 10];
       out[0].SyncCopyToCPU(cptr, 128 * 10);
diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp
index 051bad1bd26..dc1ab36727d 100644
--- a/cpp-package/example/mlp_cpu.cpp
+++ b/cpp-package/example/mlp_cpu.cpp
@@ -21,22 +21,22 @@
  * Xin Li yakumolx@gmail.com
  */
 #include <chrono>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-using namespace std;
 using namespace mxnet::cpp;
 
-Symbol mlp(const vector<int> &layers) {
+Symbol mlp(const std::vector<int> &layers) {
   auto x = Symbol::Variable("X");
   auto label = Symbol::Variable("label");
 
-  vector<Symbol> weights(layers.size());
-  vector<Symbol> biases(layers.size());
-  vector<Symbol> outputs(layers.size());
+  std::vector<Symbol> weights(layers.size());
+  std::vector<Symbol> biases(layers.size());
+  std::vector<Symbol> outputs(layers.size());
 
   for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + to_string(i));
-    biases[i] = Symbol::Variable("b" + to_string(i));
+    weights[i] = Symbol::Variable("w" + std::to_string(i));
+    biases[i] = Symbol::Variable("b" + std::to_string(i));
     Symbol fc = FullyConnected(
       i == 0? x : outputs[i-1],  // data
       weights[i],
@@ -50,30 +50,29 @@ Symbol mlp(const vector<int> &layers) {
 
 int main(int argc, char** argv) {
   const int image_size = 28;
-  const vector<int> layers{128, 64, 10};
+  const std::vector<int> layers{128, 64, 10};
   const int batch_size = 100;
   const int max_epoch = 10;
   const float learning_rate = 0.1;
   const float weight_decay = 1e-2;
 
-  auto train_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/train-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/train-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
-  auto val_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/t10k-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/t10k-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   auto net = mlp(layers);
 
   Context ctx = Context::cpu();  // Use CPU for training
 
-  std::map<string, NDArray> args;
+  std::map<std::string, NDArray> args;
   args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
   args["label"] = NDArray(Shape(batch_size), ctx);
   // Let MXNet infer shapes of other parameters such as weights
@@ -101,7 +100,7 @@ int main(int argc, char** argv) {
     int samples = 0;
     train_iter.Reset();
 
-    auto tic = chrono::system_clock::now();
+    auto tic = std::chrono::system_clock::now();
     while (train_iter.Next()) {
       samples += batch_size;
       auto data_batch = train_iter.GetDataBatch();
@@ -118,7 +117,7 @@ int main(int argc, char** argv) {
         opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
       }
     }
-    auto toc = chrono::system_clock::now();
+    auto toc = std::chrono::system_clock::now();
 
     Accuracy acc;
     val_iter.Reset();
@@ -130,7 +129,8 @@ int main(int argc, char** argv) {
       exec->Forward(false);
       acc.Update(data_batch.label, exec->outputs[0]);
     }
-    float duration = chrono::duration_cast<chrono::milliseconds>(toc - tic).count() / 1000.0;
+    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
+                     (toc - tic).count() / 1000.0;
     LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get();
   }
 
diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp
index 531afbb29db..67992a19f9f 100644
--- a/cpp-package/example/mlp_gpu.cpp
+++ b/cpp-package/example/mlp_gpu.cpp
@@ -21,22 +21,22 @@
  * Xin Li yakumolx@gmail.com
  */
 #include <chrono>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-using namespace std;
 using namespace mxnet::cpp;
 
-Symbol mlp(const vector<int> &layers) {
+Symbol mlp(const std::vector<int> &layers) {
   auto x = Symbol::Variable("X");
   auto label = Symbol::Variable("label");
 
-  vector<Symbol> weights(layers.size());
-  vector<Symbol> biases(layers.size());
-  vector<Symbol> outputs(layers.size());
+  std::vector<Symbol> weights(layers.size());
+  std::vector<Symbol> biases(layers.size());
+  std::vector<Symbol> outputs(layers.size());
 
   for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + to_string(i));
-    biases[i] = Symbol::Variable("b" + to_string(i));
+    weights[i] = Symbol::Variable("w" + std::to_string(i));
+    biases[i] = Symbol::Variable("b" + std::to_string(i));
     Symbol fc = FullyConnected(
       i == 0? x : outputs[i-1],  // data
       weights[i],
@@ -50,30 +50,29 @@ Symbol mlp(const vector<int> &layers) {
 
 int main(int argc, char** argv) {
   const int image_size = 28;
-  const vector<int> layers{128, 64, 10};
+  const std::vector<int> layers{128, 64, 10};
   const int batch_size = 100;
   const int max_epoch = 10;
   const float learning_rate = 0.1;
   const float weight_decay = 1e-2;
 
-  auto train_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/train-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/train-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
-  auto val_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/t10k-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/t10k-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   auto net = mlp(layers);
 
   Context ctx = Context::gpu();  // Use GPU for training
 
-  std::map<string, NDArray> args;
+  std::map<std::string, NDArray> args;
   args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
   args["label"] = NDArray(Shape(batch_size), ctx);
   // Let MXNet infer shapes of other parameters such as weights
@@ -107,7 +106,7 @@ int main(int argc, char** argv) {
     train_iter.Reset();
     train_acc.Reset();
 
-    auto tic = chrono::system_clock::now();
+    auto tic = std::chrono::system_clock::now();
     while (train_iter.Next()) {
       samples += batch_size;
       auto data_batch = train_iter.GetDataBatch();
@@ -130,8 +129,9 @@ int main(int argc, char** argv) {
       train_acc.Update(data_batch.label, exec->outputs[0]);
     }
     // one epoch of training is finished
-    auto toc = chrono::system_clock::now();
-    float duration = chrono::duration_cast<chrono::milliseconds>(toc - tic).count() / 1000.0;
+    auto toc = std::chrono::system_clock::now();
+    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
+                     (toc - tic).count() / 1000.0;
     LG << "Epoch[" << iter << "] " << samples/duration \
        << " samples/sec " << "Train-Accuracy=" << train_acc.Get();;
 
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
index 03b3d721764..bc86c0b6603 100644
--- a/cpp-package/example/resnet.cpp
+++ b/cpp-package/example/resnet.cpp
@@ -21,10 +21,12 @@
  */
 #include <map>
 #include <string>
+#include <fstream>
 #include <vector>
+#include <cstdlib>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-
 using namespace mxnet::cpp;
 
 Symbol ConvolutionNoBias(const std::string& symbol_name,
@@ -153,7 +155,7 @@ Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9,
 
 int main(int argc, char const *argv[]) {
   int batch_size = 50;
-  int max_epoch = 100;
+  int max_epoch = argc > 1 ? strtol(argv[1], NULL, 10) : 100;
   float learning_rate = 1e-4;
   float weight_decay = 1e-4;
 
@@ -161,24 +163,26 @@ int main(int argc, char const *argv[]) {
   std::map<std::string, NDArray> args_map;
   std::map<std::string, NDArray> aux_map;
 
-  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu());
-  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
-  resnet.InferArgsMap(Context::gpu(), &args_map, args_map);
-
-  auto train_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./sf1_train.lst")
-      .SetParam("path_imgrec", "./sf1_train.rec")
-      .SetParam("data_shape", Shape(3, 256, 256))
-      .SetParam("batch_size", batch_size)
-      .SetParam("shuffle", 1)
-      .CreateDataIter();
-
-  auto val_iter = MXDataIter("ImageRecordIter")
-      .SetParam("path_imglist", "./sf1_val.lst")
-      .SetParam("path_imgrec", "./sf1_val.rec")
-      .SetParam("data_shape", Shape(3, 256, 256))
-      .SetParam("batch_size", batch_size)
-      .CreateDataIter();
+  auto ctx = Context::gpu();
+#if MXNET_USE_CPU
+  ctx = Context::cpu();
+#endif
+
+  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx);
+  args_map["data_label"] = NDArray(Shape(batch_size), ctx);
+  resnet.InferArgsMap(ctx, &args_map, args_map);
+
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   Optimizer* opt = OptimizerRegistry::Find("ccsgd");
   opt->SetParam("lr", learning_rate)
@@ -187,7 +191,7 @@ int main(int argc, char const *argv[]) {
      ->SetParam("rescale_grad", 1.0 / batch_size)
      ->SetParam("clip_gradient", 10);
 
-  auto *exec = resnet.SimpleBind(Context::gpu(), args_map);
+  auto *exec = resnet.SimpleBind(ctx, args_map);
   auto arg_names = resnet.ListArguments();
 
   for (int iter = 0; iter < max_epoch; ++iter) {
diff --git a/cpp-package/example/test_optimizer.cpp b/cpp-package/example/test_optimizer.cpp
index bf465b78698..ee120122856 100644
--- a/cpp-package/example/test_optimizer.cpp
+++ b/cpp-package/example/test_optimizer.cpp
@@ -18,7 +18,6 @@
  */
 #include "mxnet-cpp/MxNetCpp.h"
 
-using namespace std;
 using namespace mxnet::cpp;
 
 int main(int argc, char** argv) {
diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp
index 254a6d242fd..f92560fe8f7 100644
--- a/cpp-package/example/test_score.cpp
+++ b/cpp-package/example/test_score.cpp
@@ -21,22 +21,22 @@
  * Xin Li yakumolx@gmail.com
  */
 #include <chrono>
+#include "utils.h"
 #include "mxnet-cpp/MxNetCpp.h"
 
-using namespace std;
 using namespace mxnet::cpp;
 
-Symbol mlp(const vector<int> &layers) {
+Symbol mlp(const std::vector<int> &layers) {
   auto x = Symbol::Variable("X");
   auto label = Symbol::Variable("label");
 
-  vector<Symbol> weights(layers.size());
-  vector<Symbol> biases(layers.size());
-  vector<Symbol> outputs(layers.size());
+  std::vector<Symbol> weights(layers.size());
+  std::vector<Symbol> biases(layers.size());
+  std::vector<Symbol> outputs(layers.size());
 
   for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + to_string(i));
-    biases[i] = Symbol::Variable("b" + to_string(i));
+    weights[i] = Symbol::Variable("w" + std::to_string(i));
+    biases[i] = Symbol::Variable("b" + std::to_string(i));
     Symbol fc = FullyConnected(
       i == 0? x : outputs[i-1],  // data
       weights[i],
@@ -49,33 +49,35 @@ Symbol mlp(const vector<int> &layers) {
 }
 
 int main(int argc, char** argv) {
-  const float MIN_SCORE = stof(argv[1]);
+  const float MIN_SCORE = std::stof(argv[1]);
 
   const int image_size = 28;
-  const vector<int> layers{128, 64, 10};
+  const std::vector<int> layers{128, 64, 10};
   const int batch_size = 100;
   const int max_epoch = 10;
   const float learning_rate = 0.1;
   const float weight_decay = 1e-2;
 
-  auto train_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/train-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/train-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
-  auto val_iter = MXDataIter("MNISTIter")
-      .SetParam("image", "./mnist_data/t10k-images-idx3-ubyte")
-      .SetParam("label", "./mnist_data/t10k-labels-idx1-ubyte")
-      .SetParam("batch_size", batch_size)
-      .SetParam("flat", 1)
-      .CreateDataIter();
+  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
+                                          "./data/mnist_data/train-labels-idx1-ubyte",
+                                          "./data/mnist_data/t10k-images-idx3-ubyte",
+                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
+                                        };
+
+  auto train_iter = MXDataIter("MNISTIter");
+  setDataIter(&train_iter, "Train", data_files, batch_size);
+
+  auto val_iter = MXDataIter("MNISTIter");
+  setDataIter(&val_iter, "Label", data_files, batch_size);
 
   auto net = mlp(layers);
 
   Context ctx = Context::gpu();  // Use GPU for training
+#if MXNET_USE_CPU
+  ctx = Context::cpu();
+#endif
 
-  std::map<string, NDArray> args;
+  std::map<std::string, NDArray> args;
   args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
   args["label"] = NDArray(Shape(batch_size), ctx);
   // Let MXNet infer shapes of other parameters such as weights
@@ -106,7 +108,7 @@ int main(int argc, char** argv) {
     int samples = 0;
     train_iter.Reset();
 
-    auto tic = chrono::system_clock::now();
+    auto tic = std::chrono::system_clock::now();
     while (train_iter.Next()) {
       samples += batch_size;
       auto data_batch = train_iter.GetDataBatch();
@@ -125,7 +127,7 @@ int main(int argc, char** argv) {
         opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
       }
     }
-    auto toc = chrono::system_clock::now();
+    auto toc = std::chrono::system_clock::now();
 
     Accuracy acc;
     val_iter.Reset();
@@ -138,7 +140,8 @@ int main(int argc, char** argv) {
       exec->Forward(false);
       acc.Update(data_batch.label, exec->outputs[0]);
     }
-    float duration = chrono::duration_cast<chrono::milliseconds>(toc - tic).count() / 1000.0;
+    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
+                     (toc - tic).count() / 1000.0;
     LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get();
     score = acc.Get();
   }
diff --git a/cpp-package/example/utils.h b/cpp-package/example/utils.h
new file mode 100644
index 00000000000..98b6472685b
--- /dev/null
+++ b/cpp-package/example/utils.h
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef CPP_PACKAGE_EXAMPLE_UTILS_H_
+#define CPP_PACKAGE_EXAMPLE_UTILS_H_
+
+#include <string>
+#include <fstream>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace mxnet::cpp;
+
+bool isFileExists(const std::string &filename) {
+  std::ifstream fhandle(filename.c_str());
+  return fhandle.good();
+}
+
+bool check_datafiles(const std::vector<std::string> &data_files) {
+  for (size_t index = 0; index < data_files.size(); index++) {
+    if (!isFileExists(data_files[index])) {
+      LG << "Error: File does not exist: " << data_files[index];
+      return false;
+    }
+  }
+  return true;
+}
+
+bool setDataIter(MXDataIter *iter, std::string useType,
+              const std::vector<std::string> &data_files, int batch_size) {
+    if (!check_datafiles(data_files))
+        return false;
+
+    iter->SetParam("batch_size", batch_size);
+    iter->SetParam("shuffle", 1);
+    iter->SetParam("flat", 1);
+
+    if (useType == "Train") {
+      iter->SetParam("image", data_files[0]);
+      iter->SetParam("label", data_files[1]);
+    } else if (useType == "Label") {
+      iter->SetParam("image", data_files[2]);
+      iter->SetParam("label", data_files[3]);
+    }
+
+    iter->CreateDataIter();
+    return true;
+}
+
+#endif  // CPP_PACKAGE_EXAMPLE_UTILS_H_
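
For clarity, a minimal sketch of how this new helper is consumed; it mirrors the
mlp_cpu.cpp changes above, but the batch size and the decision to check the boolean
return (which the examples in this diff ignore) are illustrative only:

  #include <string>
  #include <vector>
  #include "utils.h"
  #include "mxnet-cpp/MxNetCpp.h"

  using namespace mxnet::cpp;

  int main() {
    int batch_size = 100;  // illustrative value
    std::vector<std::string> data_files = {
        "./data/mnist_data/train-images-idx3-ubyte",  // "Train" image/label pair
        "./data/mnist_data/train-labels-idx1-ubyte",
        "./data/mnist_data/t10k-images-idx3-ubyte",   // "Label" (validation) pair
        "./data/mnist_data/t10k-labels-idx1-ubyte"};

    auto train_iter = MXDataIter("MNISTIter");
    if (!setDataIter(&train_iter, "Train", data_files, batch_size))
      return 1;  // a data file is missing; run get_data.sh first

    auto val_iter = MXDataIter("MNISTIter");
    if (!setDataIter(&val_iter, "Label", data_files, batch_size))
      return 1;

    while (train_iter.Next()) { /* consume training batches */ }
    return 0;
  }

Note that the helper hard-codes shuffle=1 and flat=1 for both the "Train" and
"Label" cases.
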
diff --git a/cpp-package/include/mxnet-cpp/operator.hpp b/cpp-package/include/mxnet-cpp/operator.hpp
index a0100cd601b..f4ce43d58d2 100644
--- a/cpp-package/include/mxnet-cpp/operator.hpp
+++ b/cpp-package/include/mxnet-cpp/operator.hpp
@@ -159,9 +159,11 @@ inline void Operator::Invoke(NDArray &output) {
 }
 
 inline Operator &Operator::SetInput(const std::string &name, Symbol symbol) {
-  input_keys_.push_back(name.c_str());
-  input_symbols_.push_back(symbol.GetHandle());
-  return *this;
+    if (symbol.GetHandle()) {
+      input_keys_.push_back(name.c_str());
+      input_symbols_.push_back(symbol.GetHandle());
+    }
+    return *this;
 }
 
 inline Operator &Operator::SetInput(const std::string &name, NDArray ndarray) {
diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h
index 127ef156eb6..1c825c1502a 100644
--- a/cpp-package/include/mxnet-cpp/symbol.h
+++ b/cpp-package/include/mxnet-cpp/symbol.h
@@ -138,7 +138,7 @@ class Symbol {
   /*!
   * \return the SymbolHandle
   */
-  SymbolHandle GetHandle() const { return blob_ptr_->handle_; }
+  SymbolHandle GetHandle() const { return (blob_ptr_) ? blob_ptr_->handle_ : NULL; }
   /*!
   * \brief construct an operator Symbol, with given input Symbol and config
   * \param name the name of the Symbol
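
These two guards work together: a default-constructed Symbol (such as the null_sym
passed to LeakyReLU in mlp.cpp earlier in this diff) previously dereferenced an empty
blob_ptr_ inside GetHandle(); it now yields NULL, and Operator::SetInput() silently
drops the optional input. A minimal sketch of the resulting behaviour, assuming only
the mxnet-cpp headers:

  #include <cassert>
  #include "mxnet-cpp/MxNetCpp.h"

  using namespace mxnet::cpp;

  int main() {
    Symbol null_sym;                       // blob_ptr_ is empty
    assert(null_sym.GetHandle() == NULL);  // was a null dereference before this change
    Operator op("LeakyReLU");
    op.SetInput("gamma", null_sym);        // now a no-op: NULL handles are skipped
    return 0;
  }
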
diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh
index 2042529ace0..57007f3a81d 100755
--- a/cpp-package/tests/ci_test.sh
+++ b/cpp-package/tests/ci_test.sh
@@ -22,9 +22,31 @@ export LD_LIBRARY_PATH=$(readlink -f ../../lib):$LD_LIBRARY_PATH
 echo $LD_LIBRARY_PATH
 ls -l ../../lib/
 
-cp ../../build/cpp-package/example/test_optimizer .
-./test_optimizer
+./get_data.sh
+
+cp ../../build/cpp-package/example/lenet .
+./lenet 10
+
+cp ../../build/cpp-package/example/alexnet .
+./alexnet 1
+
+cp ../../build/cpp-package/example/lenet_with_mxdataiter .
+./lenet_with_mxdataiter 5
+
+cp ../../build/cpp-package/example/resnet .
+./resnet 5
+
+cp ../../build/cpp-package/example/mlp .
+./mlp
+
+cp ../../build/cpp-package/example/mlp_cpu .
+./mlp_cpu
+
+cp ../../build/cpp-package/example/mlp_gpu .
+./mlp_gpu
+
+cp ../../build/cpp-package/example/test_optimizer .
+./test_optimizer
 
 cp ../../build/cpp-package/example/test_score .
-./get_mnist.sh
 ./test_score 0.93
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index b644423e9ed..ff0b3a8a000 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -26,8 +26,8 @@ <h3>A 60-minute Gluon Crash Course</h3>
         <a href="http://gluon-crash-course.mxnet.io/">Learn More</a>
       </div>
       <div class="col-lg-4 col-sm-12">
-        <h3>MXNet 1.2.0.rc0 Released</h3>
-        <p>We're excited to announce the release of MXNet 1.2.0.rc0! Check out the release notes for latest updates.</p>
+        <h3>MXNet 1.2.0 Released</h3>
+        <p>We're excited to announce the release of MXNet 1.2.0! Check out the release notes for latest updates.</p>
         <a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.2.0+Release+Notes">Learn More</a>
       </div>
       <div class="col-lg-4 col-sm-12">
diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html
index 218454aea77..8ea2f9f2161 100644
--- a/docs/_static/mxnet-theme/navbar.html
+++ b/docs/_static/mxnet-theme/navbar.html
@@ -14,7 +14,7 @@ <h1 id="logo-wrap">
             <li><a class="main-nav-link" href="http://gluon.mxnet.io">Tutorials</a></li>
           </ul>
         </span>
- 
+
         <span id="dropdown-menu-position-anchor">
           <a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">API <span class="caret"></span></a>
           <ul id="package-dropdown-menu" class="dropdown-menu navbar-menu">
@@ -43,8 +43,7 @@ <h1 id="logo-wrap">
           <a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Community <span class="caret"></span></a>
           <ul id="package-dropdown-menu-community" class="dropdown-menu navbar-menu">
             <li><a class="main-nav-link" href="http://discuss.mxnet.io">Forum</a></li>
-            <li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet">Github</a></li>
-            <li><a class="main-nav-link" href="{{url_root}}community/index.html">Community</a></li>
+            <li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet">Github</a></li>            
             <li><a class="main-nav-link" href="{{url_root}}community/contribute.html">Contribute</a></li>
             <li><a class="main-nav-link" href="{{url_root}}community/powered_by.html">Powered By</a></li>
           </ul>
diff --git a/docs/api/python/contrib/onnx.md b/docs/api/python/contrib/onnx.md
index 44aabaf4419..6fb546fc2b4 100644
--- a/docs/api/python/contrib/onnx.md
+++ b/docs/api/python/contrib/onnx.md
@@ -13,7 +13,7 @@ With ONNX format support for MXNet, developers can build and train models with a
 ```
 
 ### Installation Instructions
-- To use this module developers need to **install ONNX**, which requires protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). Once installed, you can go through the tutorials on how to use this module.
+- To use this module developers need to **install ONNX**, which requires the protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). **MXNet currently supports ONNX v1.1.1**. Once installed, you can go through the tutorials on how to use this module.
 
 
 This document describes all the ONNX-MXNet APIs.
@@ -23,6 +23,7 @@ This document describes all the ONNX-MXNet APIs.
     :nosignatures:
 
     mxnet.contrib.onnx.import_model
+    mxnet.contrib.onnx.get_model_metadata
 ```
 
 ## ONNX Tutorials
@@ -43,7 +44,8 @@ This document describes all the ONNX-MXNet APIs.
 ```eval_rst
 
 .. automodule:: mxnet.contrib.onnx
-    :members: import_model 
+    :members: import_model, get_model_metadata
 
 ```
 
diff --git a/docs/api/python/executor/executor.md b/docs/api/python/executor/executor.md
index ce920ff935d..65245a41308 100644
--- a/docs/api/python/executor/executor.md
+++ b/docs/api/python/executor/executor.md
@@ -3,6 +3,8 @@
 The executor and executor manager are internal classes for managing symbolic
 graph execution. This document is only intended for reference for advanced users.
 
+.. note:: Direct interactions with executor and executor manager are dangerous and not recommended.
+
 ## Executor
 
 ```eval_rst
diff --git a/docs/api/python/gluon/gluon.md b/docs/api/python/gluon/gluon.md
index f523e649a45..9bf866d21a1 100644
--- a/docs/api/python/gluon/gluon.md
+++ b/docs/api/python/gluon/gluon.md
@@ -9,10 +9,79 @@
 
 ## Overview
 
-Gluon package is a high-level interface for MXNet designed to be easy to use while
-keeping most of the flexibility of low level API. Gluon supports both imperative
-and symbolic programming, making it easy to train complex models imperatively
-in Python and then deploy with symbolic graph in C++ and Scala.
+The Gluon package is a high-level interface for MXNet designed to be easy to use, while keeping most of the flexibility of a low level API. Gluon supports both imperative and symbolic programming, making it easy to train complex models imperatively in Python and then deploy with a symbolic graph in C++ and Scala.
+
+Based on the [Gluon API specification](https://github.com/gluon-api/gluon-api), the Gluon API in Apache MXNet provides a clear, concise, and simple API for deep learning. It makes it easy to prototype, build, and train deep learning models without sacrificing training speed.
+
+**Advantages**
+
+1. Simple, Easy-to-Understand Code: Gluon offers a full set of plug-and-play neural network building blocks, including predefined layers, optimizers, and initializers.
+2. Flexible, Imperative Structure: Gluon does not require the neural network model to be rigidly defined, but rather brings the training algorithm and model closer together to provide flexibility in the development process.
+3. Dynamic Graphs: Gluon enables developers to define neural network models that are dynamic, meaning they can be built on the fly, with any structure, and using any of Python's native control flow.
+4. High Performance: Gluon provides all of the above benefits without impacting the training speed that the underlying engine provides.
+
+**Examples**
+
+*Simple, Easy-to-Understand Code*
+
+Use plug-and-play neural network building blocks, including predefined layers, optimizers, and initializers:
+
+```
+from mxnet import gluon
+
+num_outputs = 10  # e.g., 10 output classes
+
+net = gluon.nn.Sequential()
+# When instantiated, Sequential stores a chain of neural network layers.
+# Once presented with data, Sequential executes each layer in turn, using
+# the output of one layer as the input for the next
+with net.name_scope():
+    net.add(gluon.nn.Dense(256, activation="relu")) # 1st hidden layer (256 nodes)
+    net.add(gluon.nn.Dense(256, activation="relu")) # 2nd hidden layer
+    net.add(gluon.nn.Dense(num_outputs))            # output layer
+```
+
+*Flexible, Imperative Structure*
+
+Prototype, build, and train neural networks in a fully imperative manner using the MXNet `autograd` package and the Gluon `Trainer`:
+
+```
+epochs = 10
+
+for e in range(epochs):
+    for i, (data, label) in enumerate(train_data):
+        with autograd.record():
+            output = net(data)          # the forward pass
+            loss = softmax_cross_entropy(output, label)
+        loss.backward()                 # the backward pass
+        trainer.step(data.shape[0])     # update parameters, scaled by batch size
+```
+
+*Dynamic Graphs*
+
+Build neural networks on the fly for use cases where neural networks must change in size and shape during model training:
+
+```
+def forward(self, F, inputs, tree):
+    children_outputs = [self.forward(F, inputs, child)
+                        for child in tree.children]
+    # Recursively builds the neural network based on each input sentence's
+    # syntactic structure during the model definition and training process
+    ...
+```
+
+*High Performance*
+
+Easily cache the neural network to achieve high performance by defining your neural network with *HybridSequential* and calling the *hybridize* method:
+
+```
+net = nn.HybridSequential()
+with net.name_scope():
+    net.add(nn.Dense(256, activation="relu"))
+    net.add(nn.Dense(128, activation="relu"))
+    net.add(nn.Dense(2))
+    
+net.hybridize()
+```
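+
+Once hybridized, the network is initialized and called as usual (a minimal sketch; the input shape is illustrative):
+
+```
+import mxnet as mx
+
+net.initialize()
+x = mx.nd.random.uniform(shape=(1, 512))
+net(x)  # the first call triggers caching of the symbolic graph
+```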
+
+
+## Contents
 
 ```eval_rst
 .. toctree::
diff --git a/docs/api/python/gluon/model_zoo.md b/docs/api/python/gluon/model_zoo.md
index 950e2c02c2d..453fe8d4bdd 100644
--- a/docs/api/python/gluon/model_zoo.md
+++ b/docs/api/python/gluon/model_zoo.md
@@ -42,9 +42,12 @@ The following table summarizes the available models.
 | mobilenet0.5  | [MobileNet 0.5](https://arxiv.org/abs/1704.04861)                                     | 1,342,536    | 0.6307         | 0.8475         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
 | mobilenet0.75 | [MobileNet 0.75](https://arxiv.org/abs/1704.04861)                                    | 2,601,976    | 0.6738         | 0.8782         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
 | mobilenet1.0  | [MobileNet 1.0](https://arxiv.org/abs/1704.04861)                                     | 4,253,864    | 0.7105         | 0.9006         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
-| mobilenetv2_1.0 | [MobileNetV2 1.0](https://arxiv.org/abs/1801.04381)                                 | 3,539,136    | 0.7159         | 0.9047         | Trained with [script](https://github.com/dmlc/gluon-cv/blob/15ed8a4c71d411b878f0d71d1c7afdce6710c913/scripts/classification/imagenet/train_imagenet.py) |
-| resnet18_v1   | [ResNet-18 V1](http://arxiv.org/abs/1512.03385)                                       | 11,699,112   | 0.7039         | 0.8959         | Trained with [script](https://github.com/dmlc/gluon-cv/blob/15ed8a4c71d411b878f0d71d1c7afdce6710c913/scripts/classification/imagenet/train_imagenet.py) |
-| resnet34_v1   | [ResNet-34 V1](http://arxiv.org/abs/1512.03385)                                       | 21,814,696   | 0.7411         | 0.9184         | Trained with [script](https://github.com/dmlc/gluon-cv/blob/15ed8a4c71d411b878f0d71d1c7afdce6710c913/scripts/classification/imagenet/train_imagenet.py) |
+| mobilenetv2_1.0  | [MobileNetV2 1.0](https://arxiv.org/abs/1801.04381)                                | 3,539,136    | 0.7192         | 0.9056         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
+| mobilenetv2_0.75 | [MobileNetV2 0.75](https://arxiv.org/abs/1801.04381)                               | 2,653,864    | 0.6961         | 0.8895         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
+| mobilenetv2_0.5  | [MobileNetV2 0.5](https://arxiv.org/abs/1801.04381)                                | 1,983,104    | 0.6449         | 0.8547         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
+| mobilenetv2_0.25 | [MobileNetV2 0.25](https://arxiv.org/abs/1801.04381)                               | 1,526,856    | 0.5074         | 0.7456         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
+| resnet18_v1   | [ResNet-18 V1](http://arxiv.org/abs/1512.03385)                                       | 11,699,112   | 0.7039         | 0.8959         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
+| resnet34_v1   | [ResNet-34 V1](http://arxiv.org/abs/1512.03385)                                       | 21,814,696   | 0.7411         | 0.9184         | Trained with [script](https://gluon-cv.mxnet.io/model_zoo/index.html#image-classification)                                      |
 | resnet50_v1   | [ResNet-50 V1](http://arxiv.org/abs/1512.03385)                                       | 25,629,032   | 0.7540         | 0.9266         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
 | resnet101_v1  | [ResNet-101 V1](http://arxiv.org/abs/1512.03385)                                      | 44,695,144   | 0.7693         | 0.9334         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
 | resnet152_v1  | [ResNet-152 V1](http://arxiv.org/abs/1512.03385)                                      | 60,404,072   | 0.7727         | 0.9353         | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)              |
@@ -205,6 +208,9 @@ The following table summarizes the available models.
     mobilenet0_5
     mobilenet0_25
     mobilenet_v2_1_0
+    mobilenet_v2_0_75
+    mobilenet_v2_0_5
+    mobilenet_v2_0_25
 ```
 
 ```eval_rst
diff --git a/docs/api/python/gluon/nn.md b/docs/api/python/gluon/nn.md
index 1001f2055da..1791faf86f0 100644
--- a/docs/api/python/gluon/nn.md
+++ b/docs/api/python/gluon/nn.md
@@ -23,6 +23,8 @@ This document lists the neural network blocks in Gluon:
     LayerNorm
     Embedding
     Flatten
+    Lambda
+    HybridLambda
 ```
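+
+For instance, `Lambda` and `HybridLambda` wrap arbitrary functions as Gluon blocks (a minimal sketch, assuming a recent MXNet build that includes these blocks):
+
+```python
+from mxnet import nd
+from mxnet.gluon import nn
+
+# Lambda wraps a function of NDArrays; HybridLambda takes (F, x) so the
+# same block works on both NDArray and Symbol inputs when hybridized
+double = nn.Lambda(lambda x: x * 2)
+relu = nn.HybridLambda(lambda F, x: F.relu(x))
+
+print(double(nd.ones((2, 3))))
+print(relu(nd.array([-1.0, 1.0])))
+```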
 
 
diff --git a/docs/api/python/image/image.md b/docs/api/python/image/image.md
index a3e2a1697d3..82af4aa9b5c 100644
--- a/docs/api/python/image/image.md
+++ b/docs/api/python/image/image.md
@@ -88,6 +88,7 @@ A list of supporting augmenters
     image.CastAug
 ```
 
+#### Image Iterator for Object Detection
 Similar to `ImageIter`, `ImageDetIter` is designed for `Object Detection` tasks.
 ```eval_rst
 .. autosummary::
@@ -178,6 +179,7 @@ and a list of augmenters specific for `Object detection` is provided
 .. autoclass:: mxnet.image.RandomCropAug
 .. autoclass:: mxnet.image.RandomSizedCropAug
 .. autoclass:: mxnet.image.CenterCropAug
+.. autoclass:: mxnet.image.SequentialAug
 .. autoclass:: mxnet.image.RandomOrderAug
 .. autoclass:: mxnet.image.BrightnessJitterAug
 .. autoclass:: mxnet.image.ContrastJitterAug
diff --git a/docs/api/python/index.md b/docs/api/python/index.md
index b097e2045b1..420f4c9b72f 100644
--- a/docs/api/python/index.md
+++ b/docs/api/python/index.md
@@ -1,10 +1,14 @@
 # MXNet - Python API
 
-MXNet provides a rich Python API to serve a broad community of Python developers.
-In this section, we provide an in-depth discussion of the functionality provided by
-various MXNet Python packages. We have included code samples for most of the APIs
-for improved clarity. These code samples will run as-is as long as MXNet is first
-imported by running:
+MXNet provides a comprehensive and flexible Python API to serve a broad community of developers with different levels of experience and wide-ranging requirements. In this section, we provide an in-depth discussion of the functionality provided by various MXNet Python packages.
+
+MXNet's Python API has two primary high-level packages*: the Gluon API and the Module API. We recommend that new users start with the Gluon API, as it's more flexible and easier to debug. Underlying these high-level packages are the core packages of NDArray and Symbol.
+
+NDArray works with arrays in an imperative fashion, i.e., you define how arrays will be transformed to get to an end result. Symbol works with arrays in a declarative fashion, i.e., you define the end result that is required (via a symbolic graph) and the MXNet engine will use various optimizations to determine the steps required to obtain it. With NDArray you have a great deal of flexibility when composing operations (as you can use Python control flow), and you can easily step through your code and inspect the values of arrays, which helps with debugging. Unfortunately, this comes at a performance cost compared to Symbol, which can perform optimizations on the symbolic graph.
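+
+The contrast looks roughly like this (a minimal sketch; shapes and values are illustrative):
+
+```python
+import mxnet as mx
+
+# NDArray: imperative -- each operation executes immediately
+a = mx.nd.ones((2, 3))
+b = a * 2 + 1
+print(b.asnumpy())  # intermediate values can be inspected at any point
+
+# Symbol: declarative -- define the graph first, then execute it
+x = mx.sym.Variable('x')
+y = x * 2 + 1
+print(y.eval(ctx=mx.cpu(), x=mx.nd.ones((2, 3)))[0].asnumpy())
+```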
+
+The Module API is backed by Symbol, so although it's very performant, it's also a little more restrictive. With the Gluon API, you can get the best of both worlds: you can develop and test your model imperatively using NDArray, and then switch to Symbol for faster model training and inference (if Symbol equivalents exist for your operations).
+
+Code examples are placed throughout the API documentation and these can be run after importing MXNet as follows:
 
 ```python
 >>> import mxnet as mx
@@ -12,13 +16,15 @@ imported by running:
 
 ```eval_rst
 
-.. note:: A convenient way to execute examples is the ``%doctest_mode`` mode of
+.. note:: A convenient way to execute code examples is using the ``%doctest_mode`` mode of
     Jupyter notebook, which allows for pasting multi-line examples containing
     ``>>>`` while preserving indentation. Run ``%doctest_mode?`` in Jupyter notebook
     for more details.
 
 ```
 
+\* Some old references to the Model API may exist, but this API has been deprecated.
+
 ## NDArray API
 
 ```eval_rst
@@ -80,15 +86,6 @@ imported by running:
    gluon/contrib.md
 ```
 
-## KVStore API
-
-```eval_rst
-.. toctree::
-   :maxdepth: 1
-
-   kvstore/kvstore.md
-```
-
 ## IO API
 
 ```eval_rst
@@ -134,6 +131,15 @@ imported by running:
    metric/metric.md
 ```
 
+## Profiler API
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   profiler/profiler.md
+```
+
 ## Run-Time Compilation API
 
 ```eval_rst
diff --git a/docs/api/python/kvstore/kvstore.md b/docs/api/python/kvstore/kvstore.md
index 28297faca49..efd34bc724b 100644
--- a/docs/api/python/kvstore/kvstore.md
+++ b/docs/api/python/kvstore/kvstore.md
@@ -1,5 +1,7 @@
 # KVStore API
 
+.. note:: Direct interactions with ``KVStore`` are dangerous and not recommended.
+
 ## Basic Push and Pull
 
 Provides basic push and pull operations over multiple devices (GPUs) on a single machine, as shown in the example below.
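+
+For example, a single key can be initialized, pushed to, and pulled from as follows (a minimal sketch using the local store):
+
+```python
+import mxnet as mx
+
+kv = mx.kv.create('local')         # create a local key-value store
+shape = (2, 3)
+kv.init(3, mx.nd.ones(shape))      # initialize key 3 with ones
+kv.push(3, mx.nd.ones(shape) * 2)  # push a new value for key 3
+out = mx.nd.zeros(shape)
+kv.pull(3, out=out)                # pull the current value back
+print(out.asnumpy())
+```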
diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md
index 25cabed808e..b017c601208 100644
--- a/docs/api/python/ndarray/contrib.md
+++ b/docs/api/python/ndarray/contrib.md
@@ -45,6 +45,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib`
     MultiProposal
     PSROIPooling
     Proposal
+    ROIAlign
     count_sketch
     ctc_loss
     dequantize
diff --git a/docs/api/python/ndarray/sparse.md b/docs/api/python/ndarray/sparse.md
index 1f67e82194b..581a74f15e5 100644
--- a/docs/api/python/ndarray/sparse.md
+++ b/docs/api/python/ndarray/sparse.md
@@ -386,6 +386,8 @@ We summarize the interface for each class in the following sections.
     elemwise_add
     elemwise_sub
     elemwise_mul
+    broadcast_add
+    broadcast_sub
     broadcast_mul
     broadcast_div
     negative
@@ -497,7 +499,7 @@ We summarize the interface for each class in the following sections.
 
     make_loss
     stop_gradient
-    mxnet.ndarray.contrib.SparseEmbedding
+    Embedding
     LinearRegressionOutput
     LogisticRegressionOutput
 ```
diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index 1af18bbf86d..f2bb3f15dee 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -45,6 +45,7 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     MultiProposal
     PSROIPooling
     Proposal
+    ROIAlign
     count_sketch
     ctc_loss
     dequantize
diff --git a/docs/api/python/symbol/sparse.md b/docs/api/python/symbol/sparse.md
index a44ff150356..86191e352c3 100644
--- a/docs/api/python/symbol/sparse.md
+++ b/docs/api/python/symbol/sparse.md
@@ -97,6 +97,10 @@ In the rest of this document, we list sparse related routines provided by the
     elemwise_add
     elemwise_sub
     elemwise_mul
+    broadcast_add
+    broadcast_sub
+    broadcast_mul
+    broadcast_div
     negative
     dot
     add_n
@@ -193,7 +197,7 @@ In the rest of this document, we list sparse related routines provided by the
 
     make_loss
     stop_gradient
-    mxnet.symbol.contrib.SparseEmbedding
+    Embedding
     LinearRegressionOutput
     LogisticRegressionOutput
 ```
diff --git a/docs/build_version_doc/README.md b/docs/build_version_doc/README.md
index b297712ebcd..4fd2c10478a 100644
--- a/docs/build_version_doc/README.md
+++ b/docs/build_version_doc/README.md
@@ -7,85 +7,70 @@ This folder contains a variety of scripts to generate the MXNet.io website as we
 * [AddVersion.py](AddVersion.py) - MXNet.io site data massaging; injects the versions dropdown menu in the navigation bar
 * [build_site_tag.sh](build_site_tag.sh) - takes version tags as input and generates static html; calls `build_all_version.sh` and `update_all_version.sh`
 * [build_all_version.sh](build_all_version.sh) - takes version tags as input and builds the basic static html for MXNet.io
-* [build_doc.sh](build_doc.sh) - used by the CI system to generate MXNet.io; only triggered by new tags; not meant for manual runs or custom outputs
 * [Dockerfile](Dockerfile) - has all dependencies needed to build and update MXNet.io's static html
 * [update_all_version.sh](update_all_version.sh) - takes the output of `build_all_version.sh` then uses `AddVersion.py` and `AddPackageLink.py` to update the static html
 
-## CI Flow (WIP)
 
-* Refer to https://github.com/apache/incubator-mxnet/pull/10485
+## Setting Up a Docs Dev Server
 
-1. Docs build artifacts are deployed to the `asf-site` branch from the [incubator-mxnet-site](https://github.com/apache/incubator-mxnet-site).
-2. [MXNet.io](http://mxnet.io) should then show the new content.
+For these instructions, you will use an Ubuntu machine. This flow has been tested on a [Deep Learning Base AMI](https://aws.amazon.com/marketplace/pp/B077GCZ4GR), although you may use the full Deep Learning AMI or any other Ubuntu 16.04 system with some minor adjustments.
 
-## Manual Generation
+**Step 1:** Spin up your Ubuntu server and SSH in.
 
-Use Ubuntu and the setup defined below, or use the Dockerfile provided in this folder to spin up an Ubuntu image with all of the dependencies. Further info on Docker is provided later in this document. For a cloud image, this was tested on [Deep Learning AMI v5](https://aws.amazon.com/marketplace/pp/B077GCH38C?qid=1520359179176).
+**Step 2:** Create a Python 2.7 virtual environment (see note).
 
-**Note**: for AMI users or if you already have Conda, you might be stuck with the latest version and the docs build will have a conflict. To fix this, run `/home/ubuntu/anaconda3/bin/pip uninstall sphinx` and follow this with `pip install --user sphinx==1.5.6`.
+```bash
+sudo apt install virtualenv
+virtualenv -p python2.7 mxnet_docs
+source mxnet_docs/bin/activate
+```
 
-If you need to build <= v0.12.0, then use a Python 2 environment to avoid errors with `mxdoc.py`. This is a sphinx extension, that was not Python 3 compatible in the old versions. On the Deep Learning AMI, use `source activate mxnet_p27`, and then install the following dependencies.
+**Note:** Using a Python 2.7 environment is required to build older versions of the docs that have Python 3 incompatibilities. If you're only building the latest or version 1.0.0+, then you may use a Python 3 environment.
 
+**Step 3:** Clone the repo.
 
-### Dependencies
+```bash
+git clone --recursive https://github.com/apache/incubator-mxnet.git
+```
 
-These are the dependencies for docs generation for Ubuntu 16.04.
+**Step 4:** Install dependencies.
 
-This script is available for you to run directly on Ubuntu from the source repository.
-Run `./setup_docs_ubuntu.sh`.
+This script will install dependencies for you.
 
+```bash
+./incubator-mxnet/docs/build_version_doc/setup_docs_ubuntu.sh
 ```
-sudo apt-get update
-sudo apt-get install -y \
-    apt-transport-https \
-    ca-certificates \
-    curl \
-    doxygen \
-    git \
-    libjemalloc-dev \
-    pandoc \
-    software-properties-common
 
-# You made need to run `/home/ubuntu/anaconda3/bin/pip uninstall sphinx`
-# Recommonmark/Sphinx errors: https://github.com/sphinx-doc/sphinx/issues/3800
-# Recommonmark should be replaced so Sphinx can be upgraded
-# For now we remove other versions of Sphinx and pin it to v1.5.6
+**Step 5:** Make the docs.
 
-pip install --user \
-    beautifulsoup4 \
-    breathe \
-    CommonMark==0.5.4 \
-    h5py \
-    mock==1.0.1 \
-    pypandoc \
-    recommonmark==0.4.0 \
-    sphinx==1.5.6
+Here you have two options:
 
-# Setup scala
-echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
-sudo apt-get update
-sudo apt-get install -y \
-  sbt \
-  scala
+* Build the current version (master) with the following:
 
-# Optionally setup Apache2
-sudo apt-get install -y apache2
-sudo ufw allow 'Apache Full'
-# turn on mod_rewrite
-sudo a2enmod rewrite
+```bash
+cd incubator-mxnet
+make docs USE_OPENMP=1
+```
 
-echo 'To enable redirects you need to edit /etc/apache2/apache2.conf '
-echo '--> Change directives for Directory for /var/www/html using the following: '
-echo '       AllowOverride all '
-echo '--> Then restart apache with: '
-echo '       sudo systemctl restart apache2'
+* Build all versions, as seen on the production site. This build will include the versions dropdown and any post-build processing that generates site artifacts and other requirements for the production site.
 
-# Cleanup
-sudo apt autoremove -y
+The following script will build all of the latest versions, set the default site version to `master`, and use your dev server's IP or DNS for the navigation items.
+
+```bash
+./build_site_tag.sh '1.2.0;1.1.0;1.0.0;0.12.0;0.11.0;master' master http://your-ip-or-dns
 ```
 
-### Full Website Build
+**Final Step:** Serve and test.
+
+Refer to [Serving Your Development Version](#serving-your-development-version) for detailed instructions.
+
+
+**Troubleshooting:** For AMI users, or if you already have Conda, you might be stuck with the latest version of Sphinx, and the docs build will have a conflict. To fix this, run `/home/ubuntu/anaconda3/bin/pip uninstall sphinx`, and follow this with `pip install --user sphinx==1.5.6`.
+
+If you need to build <= v0.12.0, use a Python 2 environment to avoid errors with `mxdoc.py`. This is a Sphinx extension that was not Python 3 compatible in older versions. On the Deep Learning AMI, use `source activate mxnet_p27`, and then install the dependencies listed in the [Dependencies](#dependencies) section below.
+
+
+## Full Website Build
 The following three scripts will help you build multiple version tags and deploy a full site build with each available API version. If you just want to build master or your current fork's branch, skip ahead to the [Developer Instructions](#developer-instructions).
 
 The full site build scripts can be run stand-alone or in conjunction, but `build_all_version.sh` should be run first.
@@ -236,4 +221,63 @@ There are several manual and semi-automatic processes to be aware of, but the bo
 1. The root should have the current `.htaccess` file from master in `/docs/`. Make sure you've updated this in master and included the most recent version in your PR.
 2. The css file from master `/docs/_static/` will be needed. Be sure that the different versions of the site work. They might need the old version, but the newer version might fix bugs that were in the tags from the legacy versions.
 3. Pay attention to `mxdocs.py` as some docs modifications are happening there.
-4. Review Any other modifications to the legacy versions can be seen in
+
+
+## Dependencies
+
+These are the dependencies for docs generation for Ubuntu 16.04.
+
+This script is available for you to run directly on Ubuntu from the source repository.
+Run `./setup_docs_ubuntu.sh`.
+
+```
+sudo apt-get update
+sudo apt-get install -y \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    doxygen \
+    git \
+    libjemalloc-dev \
+    pandoc \
+    software-properties-common
+
+# You may need to run `/home/ubuntu/anaconda3/bin/pip uninstall sphinx`
+# Recommonmark/Sphinx errors: https://github.com/sphinx-doc/sphinx/issues/3800
+# Recommonmark should be replaced so Sphinx can be upgraded
+# For now we remove other versions of Sphinx and pin it to v1.5.6
+
+pip install \
+    beautifulsoup4 \
+    breathe \
+    CommonMark==0.5.4 \
+    h5py \
+    mock==1.0.1 \
+    pypandoc \
+    recommonmark==0.4.0 \
+    sphinx==1.5.6
+
+# Setup scala
+echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
+sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
+sudo apt-get update
+sudo apt-get install -y \
+  maven \
+  sbt \
+  scala
+
+# Optionally setup Apache2
+sudo apt-get install -y apache2
+sudo ufw allow 'Apache Full'
+# turn on mod_rewrite
+sudo a2enmod rewrite
+
+echo 'To enable redirects you need to edit /etc/apache2/apache2.conf '
+echo '--> Change directives for Directory for /var/www/html using the following: '
+echo '       AllowOverride all '
+echo '--> Then restart apache with: '
+echo '       sudo systemctl restart apache2'
+
+# Cleanup
+sudo apt autoremove -y
+```
diff --git a/docs/build_version_doc/artifacts/.htaccess b/docs/build_version_doc/artifacts/.htaccess
index 76d0893a528..5467448f521 100644
--- a/docs/build_version_doc/artifacts/.htaccess
+++ b/docs/build_version_doc/artifacts/.htaccess
@@ -3,3 +3,4 @@ RewriteRule ^get_started/why_mxnet.html$ /faq/why_mxnet.html [R=301,L]
 RewriteRule ^get_started.*$ /install/ [R=301,L]
 RewriteRule ^how_to.*$ /faq/ [R=301,L]
 RewriteRule ^api/python/symbol.html$ /api/python/symbol/symbol.html [R=301,L]
+RewriteRule ^community/index.html$ /community/contribute.html [R=301,L]
diff --git a/docs/community/contribute.md b/docs/community/contribute.md
index e3d10f527dc..a2d2f64616f 100644
--- a/docs/community/contribute.md
+++ b/docs/community/contribute.md
@@ -1,202 +1,153 @@
-# Contribute to MXNet
+# Contributing to MXNet
 
-MXNet has been developed and is used by a group of active community members.
-Please contribute to improve the project.
-After your patch has been merged, remember to add your name to [CONTRIBUTORS.md](https://github.com/apache/incubator-mxnet/blob/master/CONTRIBUTORS.md).
+Apache MXNet (incubating) is a community-led, open source deep learning project. We welcome new members and look forward to your contributions. Here you will find guidance on how to get started, along with links to detailed information on MXNet best practices and processes.
 
-## Code Contribution
 
-Before you start codingā€¦
+## Getting Started
 
-ā€¦ please make sure there is a JIRA issue that corresponds to your contribution. This is a general rule that the MXNet community follows for all code contributions, including bug fixes, improvements, or new features, with an exception for trivial hot fixes. If you would like to fix a bug that you found or if you would like to add a new feature or improvement to MXNet, please follow the [File a bug report or Propose an improvement or a new feature](http://mxnet.io/community/index.html) guidelines to open an issue in [MXNetā€™s JIRA](http://issues.apache.org/jira/browse/MXNet) before starting with the implementation.
-
-If the description of a JIRA issue indicates that its resolution will touch sensible parts of the code base, be sufficiently complex, or add significant amounts of new code, the MXNet community might request a design document (most contributions should not require a design document). The purpose of this document is to ensure that the overall approach to address the issue is sensible and agreed upon by the community. JIRA issues that require a design document are tagged with the requires-design-doc label. The label can be attached by any community member who feels that a design document is necessary. A good description helps to decide whether a JIRA issue requires a design document or not. The design document must be added or attached to or link from the JIRA issue and cover the following aspects:
-
-- Overview of the general approach<br/>
-- List of API changes (changed interfaces, new and deprecated configuration parameters, changed behavior, ā€¦)<br/>
-- Main components and classes to be touched<br/>
-- Known limitations of the proposed approach<br/>
-
-A design document can be added by anybody, including the reporter of the issue or the person working on it.<br/>
-
-Contributions for JIRA issues that require a design document will not be added to MXNetā€™s code base before a design document has been accepted by the community with lazy consensus. Please check if a design document is required before starting to code.
-
-
-### Core Library
-
-- Follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) for C++ code.
-- Use doxygen to document all of the interface code.
-- Use [RAII](http://en.cppreference.com/w/cpp/language/raii) to manage resources, including smart
- pointers like shared_ptr and unique_ptr as well as allocating in constructors and deallocating in
- destructors. Avoid explicit calls to new and delete when possible. Use make_shared and make_unique
-  instead.
-- To reproduce the linter checks, type ```make lint```. (You need to pip install pylint and cpplint
- before)
-
-### Python Package
-
-- Always add docstring to the new functions in numpydoc format.
-- To reproduce the linter checks, type ```make lint```.
-
-### R Package
-
-#### Code Style
-- Most of the C++ code in the R package relies heavily on [Rcpp](https://github.com/RcppCore/Rcpp).
-- We follow the Google C++ Style Guide for C++ code. This allows us to maintain consistency with the rest of the project. It also allows us to check style automatically with a linter.
-- To check the code style, type the following command at the root folder:
-```bash
-make rcpplint
-```
-- If necessary, disable the linter warning on certain lines with ```// NOLINT(*)``` comments.
-
-#### Auto-Generated API
-- Many MXNet APIs are exposed dynamically from Rcpp.
-- mxnet_generated.R is the auto-generated API and documents for these functions.
-- Remake the file by typing the following command at root folder:
-```bash
-make rcppexport
-```
-- Use this command only when there is an update to dynamic functions.
-
-#### API Document
-The document is generated using roxygen2. To remake the documents in the root folder, use the following command:
-```bash
-make roxygen.
-```
-
-#### R Markdown Vignettes
-R Markdown vignettes are located on GitHub in [R-package/vignettes](https://github.com/apache/incubator-mxnet/tree/master/R-package/vignettes).
-These R Markdown files aren't compiled. We host the compiled version on [doc/R-package](https://github.com/apache/incubator-mxnet/tree/master/R-package/).
-
-To add a new R Markdown vignettes:
-
-* Add the original R Markdown file to ```R-package/vignettes```
-* Modify ```doc/R-package/Makefile``` to add the Markdown files to be built.
-* Clone the [dmlc/web-data](https://github.com/dmlc/web-data) repo to  the  ```doc``` folder.
-* Type the following command for the ```doc/R-package``` package:
-```bash
-make the-markdown-to-make.md
-```
-* This generates the markdown and the figures and places them into ```doc/web-data/mxnet/knitr```.
-* Modify the ```doc/R-package/index.md``` to point to the generated markdown.
-* Add the generated figure to the ```dmlc/web-data``` repo.
-	* If you have already cloned the repo to doc, use ```git add```.
-* Create a pull request for both the markdown  and ```dmlc/web-data```.
-* You can also build the document locally with the following command: ```doc```
-```bash
-make html
-```
-
-### Test Cases
-
-* All of our tests can be found in the GitHub repo in [this directory](https://github.com/apache/incubator-mxnet/tree/master/tests).
-* We use Python nose for python test cases, and gtest for C++ unit tests.
-
-### Examples
-
-* Use cases and examples are on GitHub in [examples](https://github.com/apache/incubator-mxnet/tree/master/example)
-* If you write a blog post or tutorial about or using MXNet, please tell us by creating an issue
-in our github repo. We regularly feature high-quality contributed content from the community.
-
-## Standard for Contributing APIs
-
-Make sure to add documentation with any code you contribute. Follow these standards:
-
-### API Documentation
-* Document are created with Sphinx and [recommonmark](http://recommonmark.readthedocs.org/en/latest/).
-* Follow [numpy doc standards](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard) and
-some changes we made [MXNet doc standards](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard).
-* If an API is implemented in Python or has a wrapper defined, the documentation and the examples reside
-where the function is defined in `.py` file in [python/mxnet](https://github.com/apache/incubator-mxnet/tree/master/python/mxnet) folder. Same goes for other languages.
-* If the API is dynamically generated from the MXNet backend, the documentation is in the C++ code(.cc
-file) where the operator is registered in describe method of the `NNVM_REGISTER_OP`. The file and line
-number for the function is usually printed with the API documentation on mxnet.io.
-* A clear and concise description of the function and its behavior.
-* List and describe each parameter with the valid input values, whether it is required or optional,
-and the default value if the parameter is optional.
-* Add an example to help the user understand the API better. If the example can be language-neutral
-or is conceptual, add it in the C++ documentation. Make sure your example works
-by running a Python version of the example.
-  * If a concrete and simple language-specific example can further clarify the API and the API arguments, add the
-example in language-specific files.
-* Refer to these examples for guidance:- [Embedding](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.Embedding) , [ROIPooling](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.ROIPooling) , [Reshape](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.Reshape).
-
-### Testing and Rendering
-* Make sure not to break any coding standards. Run
-```bash
-make lint
-```
-* You can build documents locally to proof them.
-
-## Guidelines to submit a Pull Request
-* Before submitting your contribution, rebase your code on the most recent version of the master:
-
-```bash
-    git remote add upstream https://github.com/apache/incubator-mxnet
-    git fetch upstream
-    git rebase upstream/master
-```
-* If you have multiple small commits,
-   merge them into meaningful groups (use ```git rebase``` then ```squash```).
-* Send the pull request.
-* Fix problems reported by automatic checks.
-* If you are contributing a new module, consider adding a test case in [tests](https://github.com/apache/incubator-mxnet/tree/master/tests).
-
-### Resolving a Conflict with the Master
-
-* Rebase to the current master:
-
- ```bash
-    # The first two steps can be skipped after you do it once.
-    git remote add upstream https://github.com/apache/incubator-mxnet
-    git fetch upstream
-    git rebase upstream/master
- ```
-
-*  Git might show some conflicts that prevent merging, for example,  ```conflicted.py```.
-  * Manually modify the file to resolve the conflict.
-	* After you resolve the conflict, mark it as resolved by using:
-
-```bash
-git add conflicted.py.
-```
-
-* Continue rebasing by using this command:
-
- ```bash
-    git rebase --continue
- ```
-
-* Finally push to your fork. You might need to force the  push:
-
- ```bash
-    git push --force
- ```
-
-### Combining Multiple Commits
-If you are submitting multiple commits with later commits that are just fixes to previous ones, you can combine commits into meaningful groups before creating a push request.
-
-* Before doing so, configure Git's default editor if you haven't already done that:
-
-```bash
-git config core.editor the-editor-you-like
-```
-* Assuming that you want to merge last the last three commits, type the following commands:
-
-```bash
-git rebase -i HEAD~3
-```
+The following actions are recommended steps to get started with contributing to MXNet.
 
-* In the text editor that appears, set the first commit as ```pick```, and change later ones to ```squash```.
-
-* After you save the file, another text editor will appear and ask you to modify the combined commit message.
-
-* Push the changes to your fork by forcing a push:
-
-```bash
-git push --force.
-```
-
-### What Is the Consequence of Forcing a Push?
-The previous two tips require forcing a push because we altered the path of the commits.
-It's fine to force a push to your own fork, as long as only your commits are changed.
+| Action | Purpose |
+|---|---|
+| [Create a forum account](#forum) | Asking & answering MXNet usage questions |
+| [Join the dev communication channels](#mxnet-dev-communications) | Discussions about the direction of MXNet |
+| [Follow MXNet on Social Media](#social-media) | Get updates about new features and events |
+| [Create a JIRA account](#jira) | Tracking tasks & prioritizing issues |
+| [Check out the MXNet wiki](#confluence-wiki) | The wiki has detailed contributor information |
+| [Setup MXNet for development](#setup-mxnet-for-development) | Your very own fork for creating pull requests |
+| [Your first contribution](#your-first-contribution) | Complete a first contribution task |
+
+
+### FAQ
+
+* I found a bug. How do I report it?
+    * [Bug submission info](#file-a-bug-report)
+* I have a minor bug fix or docs update I'd like to submit. What do I do?
+    * [Minor fixes process](#minor-fixes)
+* I would like to submit a pull request for a significant update. What is the process?
+    * [Pull request process](#formal-pull-request-process)
+* I want to propose a new feature. What is the process for this?
+    * [New feature process](#new-feature-process)
+* What's coming next with MXNet, and how can I help?
+    * [Roadmap info](#roadmap)
+
+
+## MXNet Dev Communications
+
+### Forum
+
+If you need help with using MXNet, have questions about applying it to a particular kind of problem, or have a discussion topic, please use the discussion forum:
+* [discuss.mxnet.io](https://discuss.mxnet.io) <i class="fas fa-external-link-alt"></i>
+
+### Dev Mailing List
+
+Please join the contributor mailing list by sending a subscribe email to dev-subscribe@mxnet.apache.org.
+* <a href="mailto:dev-subscribe@mxnet.apache.org">subscribe</a> <i class="far fa-envelope"></i>
+* [archive](https://lists.apache.org/list.html?dev@mxnet.apache.org) <i class="fas fa-external-link-alt"></i>
+
+### Slack
+
+To join the MXNet Slack channel, send a request to the contributor mailing list.
+ * <a href="mailto:dev@mxnet.apache.org?subject=Requesting%20slack%20access">email</a> <i class="far fa-envelope"></i>
+ * [archive](https://the-asf.slackarchive.io/mxnet) <i class="fas fa-external-link-alt"></i>
+
+
+### Social Media
+
+Keep connected with the latest MXNet news and updates on [Twitter](https://twitter.com/apachemxnet) and [Reddit](https://reddit.com/r/mxnet). Also, subscribe to the [MXNet YouTube channel](https://www.youtube.com/channel/UCQua2ZAkbr_Shsgfk1LCy6A).
+
+<div class="g-ytsubscribe" data-channelid="UCQua2ZAkbr_Shsgfk1LCy6A" data-layout="full" data-count="hidden"></div>
+<br/><br/>
+<a href="https://twitter.com/apachemxnet?ref_src=twsrc%5Etfw" class="twitter-follow-button" data-show-count="false">Follow @apachemxnet</a><script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+<br/><br/>
+<a href="https://reddit.com/r/mxnet"><img src="https://www.redditstatic.com/spreddit5.gif" alt="reddit" border="0"/> r/mxnet</a>
+
+
+## JIRA
+
+MXNet uses Apache's JIRA to track issues and larger projects. Anyone can review open issues, but in order to create issues or view JIRA boards, you must create an account.
+
+* [Open JIRA Issues](https://issues.apache.org/jira/projects/MXNET/issues)
+* [JIRA boards](https://issues.apache.org/jira/secure/RapidBoard.jspa) <i class="fas fa-lock"></i>
+
+
+## Confluence Wiki
+
+The [MXNet Confluence Wiki](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Home) has detailed development environment setup info, design proposals, release process info, and more. This is generally where contributor information is maintained.
+
+* [MXNet Confluence Wiki](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Home) <i class="fas fa-external-link-alt"></i>
+
+
+## Setup MXNet for Development
+
+The process for setting up MXNet for development depends on several factors, and is constantly being improved and expanded for more development languages. Setup information is on the MXNet Confluence Wiki.
+
+* [MXNet Confluence Wiki: Development](https://cwiki.apache.org/confluence/display/MXNET/Development) <i class="fas fa-external-link-alt"></i>
+
+
+## Your First Contribution
+
+**Step 1**: Visit the project on GitHub and review the [calls for contribution](https://github.com/apache/incubator-mxnet/labels/Call%20for%20Contribution). Click the GitHub button:
+<a class="github-button" href="https://github.com/apache/incubator-mxnet/labels/Call%20for%20Contribution" data-size="large" data-show-count="true" aria-label="Issue apache/incubator-mxnet on GitHub">Call for Contribution</a>
+
+**Step 2**: Tackle a smaller issue or improve documentation to get familiar with the process. As part of your pull request, add your name to [CONTRIBUTORS.md](https://github.com/apache/incubator-mxnet/blob/master/CONTRIBUTORS.md).
+
+**Step 3**: Follow the [formal pull request (PR) process](#formal-pull-request-process) to submit your PR for review.
+
+**Important:** Keep an eye on your pull request, respond to comments and change requests, and rebase or resubmit your PR if it fails the Jenkins continuous integration tests. Ask for help in the [forum or Slack channel](#mxnet-dev-communications) if you get stuck.
+
+
+## File a bug report
+
+Please let us know if you experience a problem with MXNet. Provide detailed information about the problem you encountered and, if possible, add a description that helps to reproduce it. You have two alternatives for filing a bug report:
+<p><a href="http://issues.apache.org/jira/browse/MXNet"><i class="fas fa-bug"></i> JIRA</a></p>
+<p><a href="https://github.com/apache/incubator-mxnet/issues"><i class="fab fa-github"></i> GitHub</a></p>
+
+
+## Minor Fixes
+
+If you have found an issue and would like to contribute a bug fix or documentation update, follow these guidelines:
+
+* If it is trivial, just create a [pull request](https://github.com/apache/incubator-mxnet/pulls).
+* If it is non-trivial, you should follow the [formal pull request process](#formal-pull-request-process) described in the next section.
+
+
+## Formal Pull Request Process
+
+Any new features or improvements that are non-trivial should follow the complete flow of:
+
+1. [Review the contribution standards](https://cwiki.apache.org/confluence/display/MXNET/Development+Process) for your type of submission.
+1. [Create a JIRA issue](https://issues.apache.org/jira/secure/CreateIssue!default.jspa).
+1. [Create the PR on GitHub](https://github.com/apache/incubator-mxnet/pulls) and add the JIRA issue ID to the PR's title.
+
+Further details on this process can be found on the [Wiki](https://cwiki.apache.org/confluence/display/MXNET/Development).
+
+
+## New Feature Process
+
+Our community is constantly looking for feedback to improve Apache MXNet. If you have an idea for how to improve MXNet, or have a new feature in mind that would be beneficial for MXNet users, please open an issue in [MXNet's JIRA](http://issues.apache.org/jira/browse/MXNet). The improvement or new feature should be described in appropriate detail and include the scope and its requirements if possible. Detailed information is important for a few reasons:<br/>
+- It ensures your requirements are met when the improvement or feature is implemented.<br/>
+- It helps to estimate the effort and to design a solution that addresses your needs. <br/>
+- It allows for constructive discussions that might arise around this issue.
+
+Detailed information is also required if you plan to contribute the proposed improvement or feature yourself. Please read the [contributions](http://mxnet.io/community/contribute.html) guide in this case as well.
+
+
+## Roadmap
+
+MXNet is evolving fast. To see what's next and what the community is currently working on, check out the Roadmap issues on GitHub and the JIRA Boards:
+
+<a class="github-button" href="https://github.com/apache/incubator-mxnet/labels/Roadmap" data-size="large" data-show-count="true" aria-label="Issue apache/incubator-mxnet on GitHub">Roadmap</a>
+<br/>
+[JIRA boards](https://issues.apache.org/jira/secure/RapidBoard.jspa) <i class="fas fa-lock"></i>
+
+
+<script defer src="https://use.fontawesome.com/releases/v5.0.12/js/all.js" integrity="sha384-Voup2lBiiyZYkRto2XWqbzxHXwzcm4A5RfdfG6466bu5LqjwwrjXCMBQBLMWh7qR" crossorigin="anonymous"></script>
+<script async defer src="https://buttons.github.io/buttons.js"></script>
+<script src="https://apis.google.com/js/platform.js"></script>
+
+
+## Contributors
+MXNet has been developed by and is used by a group of active community members. Contribute to improving it!
+
+<i class="fab fa-github"></i> [Contributors and Committers](https://github.com/apache/incubator-mxnet/blob/master/CONTRIBUTORS.md)
diff --git a/docs/community/index.md b/docs/community/index.md
deleted file mode 100644
index aa20c393e96..00000000000
--- a/docs/community/index.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# MXNet Community
-## Questions about Using MXNet
-If you need help with using MXNet, have questions about applying it to a particular kind of problem, or have a discussion topic, please use our [forum](https://discuss.mxnet.io).
-
-## File a bug report
-Please let us know if you experienced a problem with MXNet and file a bug report. Open [MXNetā€™s JIRA](http://issues.apache.org/jira/browse/MXNet) and click on the blue `Create` button at the top. Please give detailed information about the problem you encountered and, if possible, add a description that helps to reproduce the problem.<p/>
-Issues may also be entered on github: [mxnet/issues](https://github.com/apache/incubator-mxnet/issues).  Github issues are synced to JIRA periodically. Thank you very much.
-
-## Propose an improvement or a new feature
-Our community is constantly looking for feedback to improve Apache MXNet. If you have an idea how to improve MXNet or have a new feature in mind that would be beneficial for MXNet users, please open an issue in [MXNetā€™s JIRA](http://issues.apache.org/jira/browse/MXNet). The improvement or new feature should be described in appropriate detail and include the scope and its requirements if possible. Detailed information is important for a few reasons:<br/>
-- It ensures your requirements are met when the improvement or feature is implemented.<br/> 
-- It helps to estimate the effort and to design a solution that addresses your needs. <br/>
-- It allows for constructive discussions that might arise around this issue.
-
-Detailed information is also required, if you plan to contribute the improvement or feature you proposed yourself. Please read the [contributions](http://mxnet.io/community/contribute.html) guide in this case as well.
-
-## Contributors
-MXNet has been developed and is used by a group of active community members. Contribute to improving it! For more information, see [contributions](http://mxnet.io/community/contribute.html).
-
-Please join the contributor mailing list. <a href="mailto:dev-subscribe@mxnet.apache.org">[subscribe]</a>  [archive](https://lists.apache.org/list.html?dev@mxnet.apache.org)
-
-To join the MXNet slack channel send request to the contributor mailing list. 
-<a href="mailto:dev-subscribe@mxnet.apache.org">[subscribe]</a> [archive](https://the-asf.slackarchive.io/mxnet)
-
-## Roadmap
-
-MXNet is evolving fast. To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://github.com/apache/incubator-mxnet/labels/Roadmap).
-
-## Social Media
-
-- Subscribe to the [MXNet Youtube channel](https://www.youtube.com/channel/UCQua2ZAkbr_Shsgfk1LCy6A)
-- Join the discussion on reddit at [r/mxnet](https://reddit.com/r/mxnet)
-- Follow us on twitter [@apachemxnet](https://twitter.com/apachemxnet)!
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 3f42023fad5..ebe16e2bb99 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -5,6 +5,9 @@ They also include workflow questions, e.g., how to visualize a neural network co
 These answers are fairly focused. For more didactic, self-contained introductions to neural networks
 and full working examples, visit the [tutorials section](../tutorials/index.md).
 
+## API
+
+* [What's the difference between the Module and Gluon APIs for Python?](http://mxnet.io/api/python/index.html)
 
 ## Modeling
 * [How do I fine-tune pre-trained models to a new dataset?](http://mxnet.io/faq/finetune.html)
diff --git a/docs/faq/perf.md b/docs/faq/perf.md
index b5d73f69a03..ad81b5dafc1 100644
--- a/docs/faq/perf.md
+++ b/docs/faq/perf.md
@@ -29,65 +29,70 @@ Note that _MXNet_ treats all CPUs on a single machine as a single device.
 So whether you specify `cpu(0)` or `cpu()`, _MXNet_ will use all CPU cores on the machine.
 
 ### Scoring results
-The following table shows performance,
+The following tables show the performance of [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz),
 namely the number of images that can be predicted per second.
 We used [example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
 to measure the performance on different AWS EC2 machines.
 
-AWS EC2 C4.8xlarge:
-
-| Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-| --- | --- | --- | --- | --- | --- | --- |
-|   1 |  119.57 | 34.23 |  111.36 |  54.42 |  42.83 | 19.51 |
-|   2 | 210.58 | 51.63 |  137.10 |  67.30 |  57.54 | 23.56 |
-|   4 | 318.54 | 70.00 |  187.21 |  76.53 |  63.64 | 25.80 |
-|   8 | 389.34 | 77.39 |  211.90 |  84.26 |  63.89 | 28.11 |
-|  16 | 489.12 | 85.26 |  220.52 |  82.00 |  63.93 | 27.08 |
-|  32 | 564.04 | 87.15 |  208.21 |  83.05 |  62.19 | 25.76 |
-
-AWS EC2 C4.4xlarge:
-
-| Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-| --- | --- | --- | --- | --- | --- | --- |
-|   1 |  109.96 | 23.00 |  71.82 |  28.10 |  30.66 | 11.81 |
-|   2 | 124.56 | 24.86 |  81.61 |  31.32 |  32.73 | 12.82 |
-|   4 | 157.01 | 26.60 |  86.77 |  32.94 |  33.32 | 13.16 |
-|   8 | 178.40 | 30.67 |  88.58 |  33.52 |  33.32 | 13.32 |
-|  16 | 189.52 | 35.61 |  90.36 |  33.63 |  32.94 | 13.18 |
-|  32 | 196.61 | 38.98 |  105.27 |  33.77 |  32.65 | 13.00 |
-
-AWS EC2 C4.2xlarge:
-
-| Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-| --- | --- | --- | --- | --- | --- | --- |
-|   1 |  70.75 | 12.87 |  42.86 |  16.53 |  18.14 | 7.01 |
-|   2 | 71.53 | 13.08 |  45.66 |  17.38 |  18.53 | 7.18 |
-|   4 | 84.72 | 15.38 |  47.50 |  17.80 |  18.96 | 7.35 |
-|   8 | 93.44 | 18.33 |  48.08 |  17.93 |  18.99 | 7.40 |
-|  16 | 97.03 | 20.12 |  55.73 |  18.00 |  18.91 | 7.36 |
-|  32 | 113.90 | 21.10 |  62.54 |  17.98 |  18.80 | 7.33 |
-
-AWS EC2 C4.xlarge:
-
-| Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-| --- | --- | --- | --- | --- | --- | --- |
-|   1 |  37.92 | 6.57 |  23.09 |  8.79 |  9.65 | 3.73 |
-|   2 | 36.77 | 7.31 |  24.00 |  9.00 |  9.84 | 3.78 |
-|   4 | 43.18 | 8.94 |  24.42 |  9.12 |  9.91 | 3.83 |
-|   8 | 47.05 | 10.01 |  28.32 |  9.13 |  9.88 | 3.83 |
-|  16 | 55.74 | 10.61 |  31.96 |  9.14 |  9.86 | 3.80 |
-|  32 | 65.05 | 10.91 |  33.86 |  9.34 |  10.31 | 3.86 |
-
-AWS EC2 C4.large:
-
-| Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-| --- | --- | --- | --- | --- | --- | --- |
-|   1 |  19.86 | 3.67 |  12.20 |  4.59 |  5.11 | 1.97 |
-|   2 | 19.37 | 4.24 |  12.41 |  4.64 |  5.15 | 1.98 |
-|   4 | 22.64 | 4.89 |  14.34 |  4.66 |  5.16 | 2.00 |
-|   8 | 27.19 | 5.25 |  16.17 |  4.66 |  5.16 | 1.99 |
-|  16 | 31.82 | 5.46 |  17.24 |  4.76 |  5.35 | OOM |
-|  32 | 34.67 | 5.55 |  17.64 |  4.88 |  OOM | OOM |
+AWS EC2 C5.18xlarge:
+
+| Batch | Alexnet | VGG 16    | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1     | 390.53  | 81.57  | 124.13       | 62.26        | 76.22     | 32.92      |
+| 2     | 596.45  | 100.84 | 206.58       | 93.36        | 119.55    | 46.80      |
+| 4     | 710.77  | 119.04 | 275.55       | 127.86       | 148.62    | 59.36      |
+| 8     | 921.40  | 120.38 | 380.82       | 157.11       | 167.95    | 70.78      |
+| 16    | 1018.43 | 115.30 | 411.67       | 168.71       | 178.54    | 75.13      |
+| 32    | 1290.31 | 107.19 | 483.34       | 179.38       | 193.47    | 85.86      |
+
+
+AWS EC2 C5.9xlarge:
+
+| Batch | Alexnet | VGG 16   | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1     | 257.77  | 50.61 | 130.99       | 66.95        | 75.38     | 32.33      |
+| 2     | 410.60  | 63.02 | 195.14       | 87.84        | 102.67    | 41.57      |
+| 4     | 462.59  | 62.64 | 263.15       | 109.87       | 127.15    | 50.69      |
+| 8     | 573.79  | 63.95 | 309.99       | 121.36       | 140.84    | 59.01      |
+| 16    | 709.47  | 67.79 | 350.19       | 128.26       | 147.41    | 64.15      |
+| 32    | 831.46  | 69.58 | 354.91       | 129.92       | 149.18    | 64.25      |
+
+
+AWS EC2 C5.4xlarge:
+
+| Batch | Alexnet | VGG 16   | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1     | 214.15  | 29.32 | 114.97       | 47.96        | 61.01     | 23.92      |
+| 2     | 310.04  | 34.81 | 150.09       | 60.89        | 71.16     | 27.92      |
+| 4     | 330.69  | 34.56 | 186.63       | 74.15        | 86.86     | 34.37      |
+| 8     | 378.88  | 35.46 | 204.89       | 77.05        | 91.10     | 36.93      |
+| 16    | 424.00  | 36.49 | 211.55       | 78.39        | 91.23     | 37.34      |
+| 32    | 481.95  | 37.23 | 213.71       | 78.23        | 91.68     | 37.26      |
+
+
+AWS EC2 C5.2xlarge:
+
+| Batch | Alexnet | VGG 16   | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|-------|--------------|--------------|-----------|------------|
+| 1     | 131.01  | 15.67 | 78.75        | 31.12        | 37.30     | 14.75      |
+| 2     | 182.29  | 18.01 | 98.59        | 39.13        | 45.98     | 17.84      |
+| 4     | 189.31  | 18.25 | 110.26       | 41.35        | 49.21     | 19.32      |
+| 8     | 211.75  | 18.57 | 115.46       | 42.53        | 49.98     | 19.81      |
+| 16    | 236.06  | 19.11 | 117.18       | 42.59        | 50.20     | 19.92      |
+| 32    | 261.13  | 19.46 | 116.20       | 42.72        | 49.95     | 19.80      |
+
+
+AWS EC2 C5.xlarge:
+
+| Batch | Alexnet | VGG 16  | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|------|--------------|--------------|-----------|------------|
+| 1     | 36.64   | 3.93 | 27.06        | 10.09        | 12.98     | 5.06       |
+| 2     | 49.21   | 4.49 | 29.67        | 10.80        | 12.94     | 5.14       |
+| 4     | 50.12   | 4.50 | 30.31        | 10.83        | 13.17     | 5.19       |
+| 8     | 54.71   | 4.58 | 30.22        | 10.89        | 13.19     | 5.20       |
+| 16    | 60.23   | 4.70 | 30.20        | 10.91        | 13.23     | 5.19       |
+| 32    | 66.37   | 4.76 | 30.10        | 10.90        | 13.22     | 5.15       |
+
 
 ## Other CPU
 
@@ -101,88 +106,114 @@ We suggest always checking to make sure that a recent cuDNN version is used.
 
 Setting the environment variable via `export MXNET_CUDNN_AUTOTUNE_DEFAULT=1` sometimes also helps.
 
-We show results when using various GPUs including K80 (EC2 p2.2xlarge), M40,
-and P100 (DGX-1).
+We show results when using various GPUs including K80 (EC2 p2.2xlarge), M60 (EC2 g3.4xlarge),
+and V100 (EC2 p3.2xlarge).
 
 ### Scoring results
 
 Based on
 [example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
-and MXNet commit `0a03417`, with cuDNN 5.1
+and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with cuDNN 7.0.5.
 
 - K80 (single GPU)
 
-  | Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-  | --- | --- | --- | --- | --- | --- | --- |
-  |   1 | 202.66  | 70.76 | 74.91  | 42.61  | 70.94 | 24.87 |
-  |   2 | 233.76  | 63.53 | 119.60  | 60.09  | 92.28 | 34.23 |
-  |   4 | 367.91  | 78.16 | 164.41  | 72.30  | 116.68 | 44.76 |
-  |   8 | 624.14  | 119.06 | 195.24  | 79.62  | 129.37 | 50.96 |
-  |  16 | 1071.19 | 195.83 | 256.06  | 99.38  | 160.40 | 66.51 |
-  |  32 | 1443.90 | 228.96 | 287.93  | 106.43  | 167.12 | 69.73 |
-
-- M40
-
-  | Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-  | --- | --- | --- | --- | --- | --- | --- |
-  |   1 | 412.09 | 142.10 | 115.89  | 64.40  | 126.90 | 46.15 |
-  |   2 | 743.49 | 212.21 | 205.31  | 108.06  | 202.17 | 75.05 |
-  |   4 | 1155.43 | 280.92 | 335.69  | 161.59  | 266.53 | 106.83 |
-  |   8 | 1606.87 | 332.76 | 491.12  | 224.22  | 317.20 | 128.67 |
-  |  16 | 2070.97 | 400.10 | 618.25  | 251.87  | 335.62 | 134.60 |
-  |  32 | 2694.91 | 466.95 | 624.27  | 258.59  | 373.35 | 152.71 |
-
-- P100
-
-  | Batch | Alexnet | VGG | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
-  | --- | --- | --- | --- | --- | --- | --- |
-  |   1 | 624.84 | 294.6 | 139.82  | 80.17  | 162.27 | 58.99 |
-  |   2 | 1226.85 | 282.3 | 267.41  | 142.63  | 278.02 | 102.95 |
-  |   4 | 1934.97 | 399.3 | 463.38  | 225.56  | 423.63 | 168.91 |
-  |   8 | 2900.54 | 522.9 | 709.30  | 319.52  | 529.34 | 210.10 |
-  |  16 | 4063.70 | 755.3 | 949.22  | 444.65  | 647.43 | 270.07 |
-  |  32 | 4883.77 | 854.4 | 1197.74  | 493.72  | 713.17 | 294.17 |
+| Batch | Alexnet | VGG 16    | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1     | 243.93  | 43.59  | 68.62        | 35.52        | 67.41     | 23.65      |
+| 2     | 338.16  | 49.14  | 113.41       | 56.29        | 93.35     | 33.88      |
+| 4     | 478.92  | 53.44  | 159.61       | 74.43        | 119.18    | 45.23      |
+| 8     | 683.52  | 70.50  | 190.49       | 86.23        | 131.32    | 50.54      |
+| 16    | 1004.66 | 109.01 | 254.20       | 105.70       | 155.40    | 62.55      |
+| 32    | 1238.55 | 114.98 | 285.49       | 116.79       | 159.42    | 64.99      |
+| 64 | 1346.72 | 123.56 | 308.73 | 122.21 | 167.58 | 70.21 |
+| 128 | 1416.91 | OOM | 320.98 | 123.11 | 171.55 | 71.85 |
+| 256 | 1462.97 | OOM | 329.16 | 127.53 | 153.01 | 57.23 |
+
+- M60
+
+| Batch | Alexnet | VGG 16    | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1     | 243.49  | 59.95  | 101.97       | 48.30        | 95.46     | 39.29      |
+| 2     | 491.04  | 69.14  | 170.35       | 80.27        | 142.61    | 60.17      |
+| 4     | 711.54  | 78.94  | 257.89       | 123.09       | 182.36    | 76.51      |
+| 8     | 1077.73 | 109.34 | 343.42       | 152.82       | 208.74    | 87.27      |
+| 16    | 1447.21 | 144.93 | 390.25       | 166.32       | 220.73    | 92.41      |
+| 32    | 1797.66 | 151.86 | 416.69       | 176.56       | 230.19    | 97.03      |
+| 64 | 1779.38 | 150.18 | 427.51 | 183.47 | 239.12 | 101.59 |
+| 128 | 1787.36 | OOM | 439.04 | 185.29 | 243.31 | 103.39 |
+| 256 | 1899.10 | OOM | 450.22 | 183.42 | 242.36 | 100.98 |
+
+
+- V100
+
+| Batch | Alexnet | VGG 16    | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+|-------|---------|--------|--------------|--------------|-----------|------------|
+| 1     | 659.51  | 205.16 | 157.37 | 87.71 | 162.15    | 61.38      |
+| 2     | 1248.21 | 265.40 | 297.34 | 159.24 | 293.74    | 116.30     |
+| 4     | 2122.41 | 333.97 | 520.91 | 279.84 | 479.14    | 195.17     |
+| 8     | 3894.30 | 420.26 | 898.09 | 455.03 | 699.39    | 294.19     |
+| 16    | 5815.58 | 654.16 | 1430.97 | 672.54 | 947.45    | 398.79     |
+| 32    | 7906.09 | 708.43 | 1847.26 | 814.59 | 1076.81   | 451.82     |
+| 64 | 9486.26 | 701.59 | 2134.89 | 899.01 | 1168.37 | 480.44 |
+| 128 | 10177.84 | 703.30 | 2318.32 | 904.33 | 1233.15 | 511.79 |
+| 256 | 10990.46 | 473.62 | 2425.28 | 960.20 | 1155.07 | 449.35 |
+
+Below are the performance results on V100 using float16 precision.
+
+| Batch | VGG 16  | Inception-BN | Inception-v3 | Resnet 50 | Resnet 152 |
+| ----- | ------- | ------------ | ------------ | --------- | ---------- |
+| 1     | 276.29  | 155.53       | 150.99       | 270.89    | 96.79      |
+| 2     | 476.91  | 296.45       | 282.02       | 493.99    | 176.88     |
+| 4     | 711.92  | 525.05       | 492.45       | 851.15    | 321.52     |
+| 8     | 1047.11 | 900.26       | 807.94       | 1282.36   | 517.66     |
+| 16    | 1299.88 | 1441.41      | 1192.21      | 1722.97   | 724.57     |
+| 32    | 1486.63 | 1854.30      | 1512.08      | 2085.51   | 887.34     |
+| 64    | 1219.65 | 2138.61      | 1687.35      | 2341.67   | 1002.90    |
+| 128   | 1169.81 | 2317.39      | 1818.26      | 2355.04   | 1046.98    |
+| 256   | 764.16  | 2425.16      | 1653.74      | 1991.88   | 976.73     |
 
 ### Training results
 
 Based on
 [example/image-classification/train_imagenet.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_imagenet.py)
-and MXNet commit `0a03417`, with CUDNN 5.1. The benchmark script is available at
+and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with CUDNN 7.0.5. The benchmark script is available at
 [here](https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh),
-where the batch size for Alexnet is increased by 8x.
+where the batch size for Alexnet is increased by 16x.
 
 - K80 (single GPU)
 
-  | Batch | Alexnet(\*8) | Inception-v3 | Resnet 50 |
+  | Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
   | --- | --- | --- | --- |
-  |   1 | 230.69 | 9.81  | 13.83 |
-  |   2 | 348.10 | 15.31 | 21.85 |
-  |   4 | 457.28 | 20.48 | 29.58 |
-  |   8 | 533.51 | 24.47 | 36.83 |
-  |  16 | 582.36 | 28.46 | 43.60 |
-  |  32 | 483.37 | 29.62 | 45.52 |
+  |   1 | 300.30 | 10.48 | 15.61 |
+  |   2 | 406.08 | 16.00 | 23.88 |
+  |   4 | 461.01 | 22.10 | 32.26 |
+  |   8 | 484.00 | 26.80 | 39.42 |
+  |  16 | 490.45 | 31.62 | 46.69 |
+  |  32 | 414.72 | 33.78 | 49.48 |
 
-- M40
+- M60
 
-  | Batch | Alexnet(\*8) | Inception-v3 | Resnet 50 |
+  | Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
   | --- | --- | --- | --- |
-  |   1 | 405.17  | 14.35 | 21.56 |
-  |   2 | 606.32  | 23.96 | 36.48 |
-  |   4 | 792.66  | 37.38 | 52.96 |
-  |   8 | 1016.51 | 52.69 | 70.21 |
-  |  16 | 1105.18 | 62.35 | 83.13 |
-  |  32 | 1046.23 | 68.87 | 90.74 |
+  |   1 | 380.96 | 14.06 | 20.55 |
+  |   2 | 530.53 | 21.90 | 32.65 |
+  |   4 | 600.17 | 31.96 | 45.57 |
+  |   8 | 633.60 | 40.58 | 54.92 |
+  |  16 | 639.37 | 46.88 | 64.44 |
+  |  32 | 576.54 | 50.05 | 68.34 |
 
-- P100
+- V100
 
-  | Batch | Alexnet(\*8) | Inception-v3 | Resnet 50 |
+  | Batch | Alexnet(\*16) | Inception-v3 | Resnet 50 |
   | --- | --- | --- | --- |
-  |   1 | 809.94  | 15.14  | 27.20  |
-  |   2 | 1202.93 | 30.34  | 49.55  |
-  |   4 | 1631.37 | 50.59  | 78.31  |
-  |   8 | 1882.74 | 77.75  | 122.45 |
-  |  16 | 2012.04 | 111.11 | 156.79 |
-  |  32 | 1869.69 | 129.98 | 181.53 |
+  |   1 | 1629.52 | 21.83 | 34.54 |
+  |   2 | 2359.73 | 40.11 | 65.01 |
+  |   4 | 2687.89 | 72.79 | 113.49 |
+  |   8 | 2919.02 | 118.43 | 174.81 |
+  |  16 | 2994.32 | 173.15 | 251.22 |
+  |  32 | 2585.61 | 214.48 | 298.51 |
+  | 64 | 1984.21 | 247.43 | 343.19 |
+  | 128 | OOM | 253.68 | 363.69 |
 
 ## Multiple Devices
 
diff --git a/docs/install/download.md b/docs/install/download.md
index 1d6d6d477db..ad3762ea9fa 100644
--- a/docs/install/download.md
+++ b/docs/install/download.md
@@ -2,11 +2,12 @@
 
 These source archives are generated from tagged releases. Updates and patches will not have been applied. For any updates refer to the corresponding branches in the [GitHub repository](https://github.com/apache/incubator-mxnet). Choose your flavor of download from the following links:
 
-| Version | Source                                                                                                      | PGP                                                                                                             | SHA                                                                                                                | MD5                                                                                                             |
-|---------|-------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|
-| 1.1.0   | [Download](https://www.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz)      | [Download](https://www.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz.asc)      | [Download](https://www.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz.sha512)      | [Download](https://www.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz.md5)      |
-| 1.0.0   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz)   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz.asc)   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz.sha512)   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-0.12.1-incubating.tar.gz.md5)  |
-| 0.12.1  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz.sha512) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz.md5) |
-| 0.12.0  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz.sha512) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz.md5) |
-| 0.11.0  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz.sha512) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz.md5) |
+| Version | Source                                                                                                      | PGP                                                                                                             | SHA                                                                                                                |
+|---------|-------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|
+| 1.2.0   | [Download](https://github.com/apache/incubator-mxnet/releases/download/1.2.0/apache-mxnet-src-1.2.0-incubating.tar.gz)   | [Download](https://github.com/apache/incubator-mxnet/releases/download/1.2.0/apache-mxnet-src-1.2.0-incubating.tar.gz.asc)    | [Download](https://github.com/apache/incubator-mxnet/releases/download/1.2.0/apache-mxnet-src-1.2.0-incubating.tar.gz.sha512)      |
+| 1.1.0   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz)      | [Download](http://archive.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz.asc)      | [Download](http://archive.apache.org/dist/incubator/mxnet/1.1.0/apache-mxnet-src-1.1.0-incubating.tar.gz.sha512)     |
+| 1.0.0   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz)   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz.asc)   | [Download](http://archive.apache.org/dist/incubator/mxnet/1.0.0/apache-mxnet-src-1.0.0-incubating.tar.gz.sha512)   |
+| 0.12.1  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.1/apache-mxnet-src-0.12.1-incubating.tar.gz.sha512) |
+| 0.12.0  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.12.0/apache-mxnet-src-0.12.0-incubating.tar.gz.sha512) |
+| 0.11.0  | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz.asc) | [Download](http://archive.apache.org/dist/incubator/mxnet/0.11.0/apache-mxnet-src-0.11.0-incubating.tar.gz.sha512) |
 
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md
index 09a39e2c469..07027ad7457 100755
--- a/docs/install/windows_setup.md
+++ b/docs/install/windows_setup.md
@@ -34,9 +34,9 @@ To build and install MXNet yourself, you need the following dependencies. Instal
 
 After you have installed all of the required dependencies, build the MXNet source code:
 
-1. Download the MXNet source code from [GitHub](https://github.com/dmlc/mxnet). Don't forget to pull the submodules:
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules:
 ```
-    git clone https://github.com/apache/incubator-mxnet.git ~/mxnet --recursive
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
 ```
 2. Start a Visual Studio command prompt.
 3. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the 
diff --git a/docs/tutorials/gluon/pretrained_models.md b/docs/tutorials/gluon/pretrained_models.md
new file mode 100644
index 00000000000..0de5fdd0b44
--- /dev/null
+++ b/docs/tutorials/gluon/pretrained_models.md
@@ -0,0 +1,375 @@
+
+# Using pre-trained models in MXNet
+
+
+In this tutorial we will see how to use multiple pre-trained models with Apache MXNet. First, let's download three image classification models from the Apache MXNet [Gluon model zoo](https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html).
+* **DenseNet-121** ([research paper](https://arxiv.org/abs/1608.06993)), which improved the state of the art on the [ImageNet dataset](http://image-net.org/challenges/LSVRC) in 2016.
+* **MobileNet** ([research paper](https://arxiv.org/abs/1704.04861)), a family of networks based on a streamlined architecture that uses depth-wise separable convolutions to build lightweight deep neural networks, suitable for mobile applications.
+* **ResNet-18** ([research paper](https://arxiv.org/abs/1512.03385v1)); the -152 version of this architecture was the 2015 winner in multiple categories.
+
+Why would you want to try multiple models? Why not just pick the one with the best accuracy? As we will see later in the tutorial, even though these models have been trained on the same dataset and optimized for maximum accuracy, they do behave slightly differently on specific images. In addition, prediction speed and memory footprints can vary, and that is an important factor for many applications. By trying a few pretrained models, you have an opportunity to find a model that can be a good fit for solving your business problem.
+
+
+```python
+import json
+
+import matplotlib.pyplot as plt
+import mxnet as mx
+from mxnet import gluon, nd
+from mxnet.gluon.model_zoo import vision
+import numpy as np
+%matplotlib inline
+```
+
+## Loading the model
+
+The [Gluon Model Zoo](https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html) provides a collection of off-the-shelf models. You can get an ImageNet pre-trained model by using `pretrained=True`.
+If you want to train on your own classification problem from scratch, you can get an untrained network with a specific number of classes using the `classes` parameter: for example `net = vision.resnet18_v1(classes=10)`. However, note that you cannot use the `pretrained` and `classes` parameters at the same time. If you want to use pre-trained weights as the initialization of your network except for the last layer, have a look at the last section of this tutorial.
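+
+For instance, here is a minimal sketch of creating an untrained 10-class network (the class count is arbitrary and for illustration only):
+
+```python
+# An untrained ResNet-18 with 10 output classes; remember that
+# `classes` cannot be combined with `pretrained=True`.
+net = vision.resnet18_v1(classes=10)
+net.initialize(mx.init.Xavier())
+```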
+
+We can specify the *context* where we want to run the model: the default behavior is to use a CPU context. There are two reasons for this:
+* First, this will allow you to test the notebook even if your machine is not equipped with a GPU :)
+* Second, we're going to predict a single image and we don't have any specific performance requirements. For production applications where you'd want to predict large batches of images with the best possible throughput, a GPU could definitely be the way to go.
+* If you do want to use a GPU, make sure you have pip-installed a GPU-enabled version of mxnet, or you will get an error when using the `mx.gpu()` context. Refer to the [install instructions](http://mxnet.incubator.apache.org/install/index.html).
+
+
+```python
+# We set the context to CPU; switch to GPU if you have one and a compatible version of MXNet installed
+ctx = mx.cpu() 
+```
+
+
+```python
+# We load the three models
+densenet121 = vision.densenet121(pretrained=True, ctx=ctx)
+mobileNet = vision.mobilenet0_5(pretrained=True, ctx=ctx)
+resnet18 = vision.resnet18_v1(pretrained=True, ctx=ctx)
+```
+
+As an example, we can look at the description of the MobileNet network, which has a relatively simple yet deep architecture:
+
+
+```python
+print(mobileNet)
+```
+
+    MobileNet(
+      (features): HybridSequential(
+        (0): Conv2D(3 -> 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        (1): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
+        (2): Activation(relu)
+        (3): Conv2D(1 -> 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
+        (4): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=16)
+        (5): Activation(relu)
+        (6): Conv2D(16 -> 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (7): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
+        (8): Activation(relu)
+        (9): Conv2D(1 -> 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
+        (10): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=32)
+        (11): Activation(relu)
+        (12): Conv2D(32 -> 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (13): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
+        (14): Activation(relu)
+        (15): Conv2D(1 -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
+        (16): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
+        (17): Activation(relu)
+        (18): Conv2D(64 -> 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (19): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
+        (20): Activation(relu)
+        (21): Conv2D(1 -> 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
+        (22): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=64)
+        (23): Activation(relu)
+        (24): Conv2D(64 -> 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (25): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
+        (26): Activation(relu)
+        (27): Conv2D(1 -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=128, bias=False)
+        (28): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
+        (29): Activation(relu)
+        (30): Conv2D(128 -> 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (31): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
+        (32): Activation(relu)
+        (33): Conv2D(1 -> 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=128, bias=False)
+        (34): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
+        (35): Activation(relu)
+        (36): Conv2D(128 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (37): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (38): Activation(relu)
+        (39): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
+        (40): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (41): Activation(relu)
+        (42): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (43): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (44): Activation(relu)
+        (45): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
+        (46): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (47): Activation(relu)
+        (48): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (49): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (50): Activation(relu)
+        (51): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
+        (52): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (53): Activation(relu)
+        (54): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (55): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (56): Activation(relu)
+        (57): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
+        (58): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (59): Activation(relu)
+        (60): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (61): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (62): Activation(relu)
+        (63): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=False)
+        (64): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (65): Activation(relu)
+        (66): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (67): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (68): Activation(relu)
+        (69): Conv2D(1 -> 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256, bias=False)
+        (70): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
+        (71): Activation(relu)
+        (72): Conv2D(256 -> 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (73): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
+        (74): Activation(relu)
+        (75): Conv2D(1 -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=512, bias=False)
+        (76): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
+        (77): Activation(relu)
+        (78): Conv2D(512 -> 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
+        (79): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=512)
+        (80): Activation(relu)
+        (81): GlobalAvgPool2D(size=(1, 1), stride=(1, 1), padding=(0, 0), ceil_mode=True)
+        (82): Flatten
+      )
+      (output): Dense(512 -> 1000, linear)
+    )
+
+
+Let's have a closer look at the first convolution layer:
+
+
+```python
+print(mobileNet.features[0].params)
+```
+
+`mobilenet1_conv0_ (Parameter mobilenet1_conv0_weight (shape=(16, 3, 3, 3), dtype=<class 'numpy.float32'>))`<!--notebook-skip-line-->
+
+
+The first layer applies **`16`** different convolutional masks, each of size **`InputChannels x 3 x 3`**. For the first convolution, there are **`3`** input channels, the `R`, `G`, `B` channels of the input image. That gives us a weight tensor of shape **`16 x 3 x 3 x 3`**. There is no bias applied in this convolution.
+
+Let's have a look at the output layer now:
+
+
+```python
+print(mobileNet.output)
+```
+
+`Dense(512 -> 1000, linear)`<!--notebook-skip-line-->
+
+
+Did you notice the shape of this layer? The weight matrix is **1000 x 512**. This layer contains 1,000 neurons: each of them stores an activation representative of the probability of the image belonging to a specific category. Each neuron is also fully connected to all 512 neurons in the previous layer.
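+
+As a quick sanity check (using the `mobileNet` model loaded above), we can inspect this weight shape directly:
+
+```python
+# Dense layer weights are stored as (number of neurons, number of inputs)
+print(mobileNet.output.weight.shape)  # expected: (1000, 512)
+```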
+
+OK, enough exploring! Now let's use these models to classify our own images.
+
+## Loading the data
+All three models have been pre-trained on the ImageNet data set, which includes over 1.2 million pictures of objects and animals sorted into 1,000 categories.
+We download the ImageNet list of labels, which gives us the mapping from category index to name: when the model predicts, for example, category index `4`, we know it is predicting `hammerhead, hammerhead shark`.
+
+
+```python
+mx.test_utils.download('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/image_net_labels.json')
+categories = np.array(json.load(open('image_net_labels.json', 'r')))
+print(categories[4])
+```
+
+`hammerhead, hammerhead shark` <!--notebook-skip-line-->
+
+
+Get a test image
+
+
+```python
+filename = mx.test_utils.download('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/onnx/images/dog.jpg?raw=true', fname='dog.jpg')
+```
+
+If you want to use your own image for the test, copy the image to the same folder that contains the notebook and change the following line:
+
+
+```python
+filename = 'dog.jpg'
+```
+
+Load the image as an NDArray:
+
+
+```python
+image = mx.image.imread(filename)
+plt.imshow(image.asnumpy())
+```
+
+![png](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/onnx/images/dog.jpg?raw=true)<!--notebook-skip-line-->
+
+
+Neural networks expect input in a specific format. Images usually come in the `Width x Height x Channels` format, where the channels are the RGB channels.
+This network accepts images in the `BatchSize x 3 x 224 x 224` format. `224 x 224` is the image resolution the model was trained with, and `3` is the number of channels: Red, Green and Blue (in this order). In this case we use a `BatchSize` of `1` since we are predicting one image at a time.
+Here are the transformation steps:
+* Read the image: this will return an NDArray shaped as (image height, image width, 3), with the three channels in RGB order.
+* Resize the shorter edge of the image to 224.
+* Crop, using a size of 224x224 from the center of the image.
+* Shift the mean and standard deviation of our color channels to match the ones of the dataset the network has been trained on.
+* Transpose the array from (Height, Width, 3) to (3, Height, Width).
+* Add a fourth dimension, the batch dimension.
+
+
+
+```python
+def transform(image):
+    resized = mx.image.resize_short(image, 224)  # resize the shorter edge to 224 pixels
+    cropped, crop_info = mx.image.center_crop(resized, (224, 224))
+    normalized = mx.image.color_normalize(cropped.astype(np.float32)/255,
+                                      mean=mx.nd.array([0.485, 0.456, 0.406]),
+                                      std=mx.nd.array([0.229, 0.224, 0.225])) 
+    # the network expects batches of the form (N, 3, 224, 224)
+    transposed = normalized.transpose((2,0,1))  # Transposing from (224, 224, 3) to (3, 224, 224)
+    batchified = transposed.expand_dims(axis=0) # change the shape from (3, 224, 224) to (1, 3, 224, 224)
+    return batchified
+```
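+
+As a quick check, the transformed image should now match the expected batch shape:
+
+```python
+transformed = transform(image)
+print(transformed.shape)  # expected: (1, 3, 224, 224)
+```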
+
+## Testing the different networks
+We run the image through each pre-trained network. Each model outputs an *NDArray* holding 1,000 activation values, corresponding to the 1,000 categories it has been trained on, which we convert to probabilities using the `softmax()` function. The output prediction NDArray has only one row since the batch size is 1.
+
+
+```python
+predictions = resnet18(transform(image)).softmax()
+print(predictions.shape)
+```
+
+`(1, 1000)`<!--notebook-skip-line-->
+
+
+We then take the top `k` predictions for our image, here the top `3`.
+
+
+```python
+top_pred = predictions.topk(k=3)[0].asnumpy()
+```
+
+And we print the categories predicted with their corresponding probabilities:
+
+
+```python
+for index in top_pred:
+    probability = predictions[0][int(index)]
+    category = categories[int(index)]
+    print("{}: {:.2f}%".format(category, probability.asscalar()*100))
+```
+
+
+`boxer: 93.03%` <!--notebook-skip-line-->
+
+`bull mastiff: 5.73%`<!--notebook-skip-line-->
+
+`Staffordshire bullterrier, Staffordshire bull terrier: 0.58%` <!--notebook-skip-line-->
+
+
+Let's turn this into a function. Our parameters are a model, an image, a list of categories and the number of top categories we'd like to print.
+
+
+```python
+def predict(model, image, categories, k):
+    predictions = model(transform(image)).softmax()
+    top_pred = predictions.topk(k=k)[0].asnumpy()
+    for index in top_pred:
+        probability = predictions[0][int(index)]
+        category = categories[int(index)]
+        print("{}: {:.2f}%".format(category, probability.asscalar()*100))
+    print('')
+```
+
+### DenseNet121
+
+
+```python
+%%time
+predict(densenet121, image, categories, 3)
+```
+
+`boxer: 94.77%`<!--notebook-skip-line-->
+
+`bull mastiff: 2.26%`<!--notebook-skip-line-->
+
+`American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier: 1.69%`<!--notebook-skip-line-->
+    
+
+`CPU times: user 360 ms, sys: 0 ns, total: 360 ms`<!--notebook-skip-line-->
+
+`Wall time: 165 ms`<!--notebook-skip-line-->
+
+
+### MobileNet
+
+
+```python
+%%time
+predict(mobileNet, image, categories, 3)
+```
+
+`boxer: 84.02%` <!--notebook-skip-line-->
+
+`bull mastiff: 13.63%` <!--notebook-skip-line-->
+
+`Rhodesian ridgeback: 0.66%` <!--notebook-skip-line-->
+
+
+`CPU times: user 72 ms, sys: 0 ns, total: 72 ms` <!--notebook-skip-line-->
+
+`Wall time: 31.2 ms` <!--notebook-skip-line-->
+
+
+### Resnet-18
+
+
+```python
+%%time
+predict(resnet18, image, categories, 3)
+```
+
+`boxer: 93.03%` <!--notebook-skip-line-->
+
+`bull mastiff: 5.73%` <!--notebook-skip-line-->
+
+`Staffordshire bullterrier, Staffordshire bull terrier: 0.58%` <!--notebook-skip-line-->
+
+
+`CPU times: user 156 ms, sys: 0 ns, total: 156 ms` <!--notebook-skip-line-->
+
+`Wall time: 77.1 ms` <!--notebook-skip-line-->
+
+
+As you can see, the pre-trained networks produce slightly different predictions and have different runtimes. In this case, MobileNet is almost **5 times faster** than DenseNet!
+
+## Fine-tuning pre-trained models
+
+You can replace the output layer of your pre-trained model so that it fits the number of classes in your own image classification task, for example 10 classes:
+
+
+```python
+NUM_CLASSES = 10
+with resnet18.name_scope():
+    resnet18.output = gluon.nn.Dense(NUM_CLASSES)
+```
+
+
+```python
+print(resnet18.output)
+```
+
+`Dense(None -> 10, linear)` <!--notebook-skip-line-->
+
+
+Now you can train your model on your new data, using the pre-trained weights as the initialization. This is called transfer learning, and it has proved very useful, especially when you only have access to a small dataset: your network will already have learned how to perform general pattern detection and feature extraction on the larger dataset. A minimal training sketch is shown after the tutorial links below.
+You can learn more about transfer learning and fine-tuning with MXNet in these tutorials:
+- [Transferring knowledge through fine-tuning](http://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html)
+- [Fine Tuning an ONNX Model](https://mxnet.incubator.apache.org/tutorials/onnx/fine_tuning_gluon.html)
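+
+Here is a minimal training sketch under two assumptions: `resnet18` is the model with the replaced 10-class output layer from above, and `train_data` is a hypothetical `DataLoader` over your own labeled images that you would need to provide:
+
+```python
+# Only the new output layer needs fresh weights; the remaining
+# layers keep their pre-trained ImageNet parameters.
+resnet18.output.initialize(mx.init.Xavier(), ctx=ctx)
+
+trainer = gluon.Trainer(resnet18.collect_params(), 'sgd', {'learning_rate': 0.01})
+loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
+
+for data, label in train_data:  # train_data: a hypothetical DataLoader you provide
+    data, label = data.as_in_context(ctx), label.as_in_context(ctx)
+    with mx.autograd.record():
+        loss = loss_fn(resnet18(data), label)
+    loss.backward()
+    trainer.step(data.shape[0])  # normalizes the gradient by the batch size
+```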
+
+
+That's it! Explore the model zoo and have fun with pre-trained models!
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 94ea050b986..5cdd2d7aade 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -1,269 +1,156 @@
 # Tutorials
 
-MXNet has two primary high-level interfaces for its deep learning engine: the Gluon API and the Module API. Tutorials for each are provided below.
+MXNet tutorials can be found in this section. A variety of language bindings are available for MXNet (including Python, Scala, C++ and R) and we have a different tutorial section for each language.
 
-`TL;DR:` If you are new to deep learning or MXNet, you should start with the Gluon tutorials.
+Are you new to MXNet and don't have a language preference? We currently recommend starting with Python, specifically the Gluon APIs (rather than the Module APIs), as they're more flexible and easier to debug.
 
-The difference between the two is an imperative versus symbolic programming style. Gluon makes it easy to prototype, build, and train deep learning models without sacrificing training speed by enabling both (1) intuitive imperative Python code development and (2) faster execution by automatically generating a symbolic execution graph using the hybridization feature.
+Another great resource for learning MXNet is our [examples section](https://github.com/apache/incubator-mxnet/tree/master/example), which includes a wide variety of models (from basic to state-of-the-art) for tasks including object detection, style transfer, reinforcement learning, and many others.
 
-The Gluon and Module tutorials are in Python, but you can also find a variety of other MXNet tutorials, such as R, Scala, and C++ in the [Other Languages API Tutorials](#other-mxnet-api-tutorials) section below.
+<hr>
 
-[Example scripts and applications](#example-scripts-and-applications) as well as [contribution](#contributing-tutorials) info is below.
+## Python Tutorials
 
-<script type="text/javascript" src='../_static/js/options.js'></script>
+We have two types of API available for Python: Gluon APIs and Module APIs. [See here](/api/python/gluon/gluon.html) for a comparison.
 
+A comprehensive introduction to Gluon can be found at [The Straight Dope](http://gluon.mxnet.io/). Structured like a book, it builds up from the first principles of deep learning and takes a theoretical walkthrough of progressively more complex models using the Gluon API. Also check out the [60-Minute Gluon Crash Course](http://gluon-crash-course.mxnet.io/) if you're short on time or have used other deep learning frameworks before.
 
-## Python API Tutorials
+Use the tutorial selector below to filter to the relevant tutorials. You might see a download link in the top right corner of some tutorials. Use this to download a Jupyter Notebook version of the tutorial, and re-run and adjust the code as you wish.
+
+<script type="text/javascript" src='../_static/js/options.js'></script>
 
 <!-- Gluon vs Module -->
+Select API:&nbsp;
 <div class="btn-group opt-group" role="group">
   <button type="button" class="btn btn-default opt active" style="font-size:22px">Gluon</button>
   <button type="button" class="btn btn-default opt"   style="font-size:22px">Module</button>
 </div>
-
-
-<!-- Levels -->
-<div class="gluon module">
-<div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Introduction</button>
-  <button type="button" class="btn btn-default opt">Applications</button>
-</div>
-</div>
-
-
-<!-- introduction Topics -->
-<div class="introduction">
-<div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Basics</button>
-  <button type="button" class="btn btn-default opt">Neural Networks</button>
-  <button type="button" class="btn btn-default opt">Advanced</button>
-</div>
-</div>
-
-
-<!-- Intermediate Topics
-<div class="intermediate">
-<div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Image Recognition</button>
-  <button type="button" class="btn btn-default opt">Human Language</button>
-  <button type="button" class="btn btn-default opt">Recommender Systems</button>
-  <button type="button" class="btn btn-default opt">Customization</button>
-</div>
-</div>
--->
-
-<!-- Advanced Topics
-<div class="advanced">
-<div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Distributed Training</button>
-  <button type="button" class="btn btn-default opt">Optimization</button>
-  <button type="button" class="btn btn-default opt">Adversarial Networks</button>
-</div>
-</div>
--->
 <!-- END - Main Menu -->
-<hr>
-
+<br>
 <div class="gluon">
-<div class="introduction">
-
-
-<div class="basics">
-
-- [Manipulate data the MXNet way with ndarray](http://gluon.mxnet.io/chapter01_crashcourse/ndarray.html)
-
-- [Automatic differentiation with autograd](http://gluon.mxnet.io/chapter01_crashcourse/autograd.html)
-
-- [Linear regression with gluon](http://gluon.mxnet.io/chapter02_supervised-learning/linear-regression-gluon.html)
-
-- [Serialization - saving, loading and checkpointing](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html)
-
-- [Gluon Datasets and DataLoaders](http://mxnet.incubator.apache.org/tutorials/gluon/datasets.html)
-
-</div>
-
-
-<div class="neural-networks">
-
-- [Multilayer perceptrons in gluon](http://gluon.mxnet.io/chapter03_deep-neural-networks/mlp-gluon.html)
-
-- [Multi-class object detection using CNNs in gluon](http://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-gluon.html)
-
-- [Advanced RNNs with gluon](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html)
-
-</div>
-
-
-<div class="advanced">
-
-- [Plumbing: A look under the hood of gluon](http://gluon.mxnet.io/chapter03_deep-neural-networks/plumbing.html)
-
-- [Designing a custom layer with gluon](/tutorials/gluon/custom_layer.html)
-
-- [Block and Parameter naming](/tutorials/gluon/naming.html)
-
-- [Fast, portable neural networks with Gluon HybridBlocks](http://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html)
-
-- [Training on multiple GPUs with gluon](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html)
-
-- [Applying data augmentation](/tutorials/gluon/data_augmentation.html)
-
-</div>
-
-</div> <!--end of introduction-->
-
-
-<div class="applications">
-
-- [Creating custom operators with numpy](/tutorials/gluon/customop.html)
-
-- [Handwritten digit recognition (MNIST)](/tutorials/gluon/mnist.html)
-
-- [Hybrid network example](/tutorials/gluon/hybrid.html)
-
-- [Neural network building blocks with gluon](/tutorials/gluon/gluon.html)
-
-- [Simple autograd example](/tutorials/gluon/autograd.html)
-
-- [Data Augmentation with Masks (for Object Segmentation)](/tutorials/python/data_augmentation_with_masks.html)
-
-- [Inference using an ONNX model](/tutorials/onnx/inference_on_onnx_model.html)
-
-- [Fine-tuning an ONNX model on Gluon](/tutorials/onnx/fine_tuning_gluon.html)
-
-</div> <!--end of applications-->
 
+* Getting Started
+    * [60-Minute Gluon Crash Course](http://gluon-crash-course.mxnet.io/) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+    * [MNIST Handwritten Digit Classification](/tutorials/gluon/mnist.html)
+* Models
+    * [Model Zoo: using pre-trained models](/tutorials/gluon/pretrained_models.html)
+    * [Linear Regression](http://gluon.mxnet.io/chapter02_supervised-learning/linear-regression-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+    * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+    * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+* Practitioner Guides
+    * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+    * [Checkpointing and Model Serialization (a.k.a. saving and loading)](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+    * [Inference using an ONNX model](/tutorials/onnx/inference_on_onnx_model.html)
+    * [Fine-tuning an ONNX model on Gluon](/tutorials/onnx/fine_tuning_gluon.html)
+    * [Visualizing Decisions of Convolutional Neural Networks](/tutorials/vision/cnn_visualization.html)
+* API Guides
+    * Core APIs
+        * NDArray
+            * [NDArray API](/tutorials/gluon/ndarray.html) ([Alternative](http://gluon.mxnet.io/chapter01_crashcourse/ndarray.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>)
+            * [Advanced NDArray API](/tutorials/basic/ndarray.html)
+            * [NDArray Indexing](https://mxnet.incubator.apache.org/tutorials/basic/ndarray_indexing.html)
+            * Sparse NDArray
+                * [Sparse Gradient Updates (RowSparseNDArray)](/tutorials/sparse/row_sparse.html)
+                * [Compressed Sparse Row Storage Format (CSRNDArray)](/tutorials/sparse/csr.html)
+                * [Linear Regression with Sparse Symbols](/tutorials/sparse/train.html)
+        * Symbol
+            * [Symbol API](/tutorials/basic/symbol.html) (Caution: written before Gluon existed)
+        * KVStore
+            * [Key-Value Store API](/tutorials/python/kvstore.html)
+    * Gluon APIs
+        * Blocks and Operators
+            * [Blocks](/tutorials/gluon/gluon.html) ([Alternative](http://gluon.mxnet.io/chapter03_deep-neural-networks/plumbing.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>)
+            * [Custom Blocks](/tutorials/gluon/custom_layer.html) ([Alternative](http://gluon.mxnet.io/chapter03_deep-neural-networks/custom-layer.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>)
+            * [HybridBlocks](/tutorials/gluon/hybrid.html) ([Alternative](http://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>)
+            * [Block Naming](/tutorials/gluon/naming.html)
+            * [Custom Operators](/tutorials/gluon/customop.html)
+        * Autograd
+            * [AutoGrad API](/tutorials/gluon/autograd.html)
+            * [AutoGrad API with chain rule](http://gluon.mxnet.io/chapter01_crashcourse/autograd.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+            * [AutoGrad API with Python control flow](http://gluon-crash-course.mxnet.io/autograd.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
+        * Data
+            * [Datasets and DataLoaders](/tutorials/gluon/datasets.html)
+            * [Applying Data Augmentation](/tutorials/gluon/data_augmentation.html)
+            * [Data Augmentation with Masks (for Object Segmentation)](https://mxnet.incubator.apache.org/tutorials/python/data_augmentation_with_masks.html)
 </div> <!--end of gluon-->
 
-
 <div class="module">
 
-
-<div class="introduction">
-
-
-<div class="basics">
-
-- [Imperative tensor operations on CPU/GPU](/tutorials/basic/ndarray.html)
-
-- [NDArray Indexing](/tutorials/basic/ndarray_indexing.html)
-
-- [Symbol API](/tutorials/basic/symbol.html)
-
-- [Module API](/tutorials/basic/module.html)
-
-- [Iterators - Loading data](/tutorials/basic/data.html)
-
-</div>
-
-
-<div class="neural-networks">
-
-- [Linear regression](/tutorials/python/linear-regression.html)
-
-- [MNIST - handwriting recognition](/tutorials/python/mnist.html)
-
-- [Large scale image classification](/tutorials/vision/large_scale_classification.html)
-
-<!-- broken #9532
-- [Image recognition](/tutorials/python/predict_image.html)
--->
-</div>
-
-
-<div class="advanced">
-
-- [NDArray in Compressed Sparse Row storage format](/tutorials/sparse/csr.html)
-
-- [Sparse gradient updates](/tutorials/sparse/row_sparse.html)
-
-- [Train a linear regression model with sparse symbols](/tutorials/sparse/train.html)
-
-- [Applying data augmentation](/tutorials/python/data_augmentation.html)
-
-- [Types of data augmentation](/tutorials/python/types_of_data_augmentation.html)
-
-</div>
-
-</div> <!--end of introduction-->
-
-
-<div class="applications">
-
-- [Connectionist Temporal Classification](../tutorials/speech_recognition/ctc.html)
-
-- [Distributed key-value store](/tutorials/python/kvstore.html)
-
-- [Fine-tuning a pre-trained ImageNet model with a new dataset](/faq/finetune.html)
-
-- [Generative Adversarial Networks](/tutorials/unsupervised_learning/gan.html)
-
-- [Matrix factorization in recommender systems](/tutorials/python/matrix_factorization.html)
-
-- [Text classification (NLP) on Movie Reviews](/tutorials/nlp/cnn.html)
-
-- [Importing an ONNX model into MXNet](http://mxnet.incubator.apache.org/tutorials/onnx/super_resolution.html) 
-
-</div> <!--end of applications-->
-
+* Getting Started
+    * [Module API](/tutorials/basic/module.html)
+    * [MNIST Handwritten Digit Classification](/tutorials/python/mnist.html)
+* Models
+    * [Linear Regression](/tutorials/python/linear-regression.html)
+    * [Linear Regression with Sparse Symbols](/tutorials/sparse/train.html)
+    * [MNIST Handwritten Digit Classification](/tutorials/python/mnist.html)
+    * [Movie Review Classification using Convolutional Networks](/tutorials/nlp/cnn.html)
+    * [Generative Adversarial Networks (GANs)](/tutorials/unsupervised_learning/gan.html)
+    * [Recommender Systems using Matrix Factorization](/tutorials/python/matrix_factorization.html)
+    * [Speech Recognition with Connectionist Temporal Classification Loss](/tutorials/speech_recognition/ctc.html)
+* Practitioner Guides
+    * [Predicting on new images using a pre-trained ImageNet model](/tutorials/python/predict_image.html)
+    * [Fine-Tuning a pre-trained ImageNet model with a new dataset](/faq/finetune.html)
+    * [Large-Scale Multi-Host Multi-GPU Image Classification](/tutorials/vision/large_scale_classification.html)
+    * [Importing an ONNX model into MXNet](/tutorials/onnx/super_resolution.html)
+* API Guides
+    * Core APIs
+        * NDArray
+            * [NDArray API](/tutorials/gluon/ndarray.html)
+            * [Advanced NDArray API](/tutorials/basic/ndarray.html)
+            * [NDArray Indexing](/tutorials/basic/ndarray_indexing.html)
+            * Sparse NDArray
+                * [Sparse Gradient Updates (RowSparseNDArray)](/tutorials/sparse/row_sparse.html)
+                * [Compressed Sparse Row Storage Format (CSRNDArray)](/tutorials/sparse/csr.html)
+                * [Linear Regression with Sparse Symbols](/tutorials/sparse/train.html)
+        * Symbol
+            * [Symbol API](/tutorials/basic/symbol.html)
+        * KVStore
+            * [Key-Value Store API](/tutorials/python/kvstore.html)
+    * Module APIs
+        * [Module API](/tutorials/basic/module.html)
+        * Data
+            * [Data Iterators](/tutorials/basic/data.html)
+            * [Applying Data Augmentation](/tutorials/python/data_augmentation.html)
+            * [Types of Data Augmentation](/tutorials/python/types_of_data_augmentation.html)
 </div> <!--end of module-->
 
-
 <hr>
 
-## Other Languages API Tutorials
+## Scala Tutorials
 
+* Getting Started
+    * [MXNet and JetBrain's IntelliJ](/tutorials/scala/mxnet_scala_on_intellij.html)
+* Models
+    * [MNIST Handwritten Digit Recognition with Fully Connected Network](/tutorials/scala/mnist.html)
+    * [Barack Obama speech generation with Character-level LSTM](/tutorials/scala/char_lstm.html)
 
-<div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">C</button>
-  <button type="button" class="btn btn-default opt">Scala</button>
-  <button type="button" class="btn btn-default opt">R</button>
-</div>
 <hr>
 
-<div class="c">
-
-- [MNIST with the MXNet C++ API](/tutorials/c%2B%2B/basics.html)
-</div> <!--end of c++-->
-
-
-<div class="r">
+## C++ Tutorials
 
-- [NDArray: Vectorized Tensor Computations on CPUs and GPUs with R](/tutorials/r/ndarray.html)
-- [Symbol API with R](/tutorials/r/symbol.html)
-- [Custom Iterator](/tutorials/r/CustomIterator.html)
-- [Callback Function](/tutorials/r/CallbackFunction.html)
-- [Five minute neural network](/tutorials/r/fiveMinutesNeuralNetwork.html)
-- [MNIST with R](/tutorials/r/mnistCompetition.html)
-- [Classify images via R with a pre-trained model](/tutorials/r/classifyRealImageWithPretrainedModel.html)
-- [Char RNN Example with R](/tutorials/r/charRnnModel.html)
-- [Custom loss functions in R](/tutorials/r/CustomLossFunction.html)
-
-
-</div> <!--end of r-->
-
-
-<div class="scala">
-
-- [Setup your MXNet with Scala on IntelliJ](/tutorials/scala/mxnet_scala_on_intellij.html)
-- [MNIST with the Scala API](/tutorials/scala/mnist.html)
-- [Use Scala to build a Long Short-Term Memory network that generates Barack Obama's speech patterns](/tutorials/scala/char_lstm.html)
-
-</div> <!--end of scala-->
+* Models
+    * [MNIST Handwritten Digit Recognition with Fully Connected Network](/tutorials/c%2B%2B/basics.html)
 
 <hr>
 
-
-## Example Scripts and Applications
-
-More tutorials and examples are available in the [GitHub repository](https://github.com/apache/incubator-mxnet/tree/master/example).
-
-
-## Learn More About Gluon!
-
-Most of the Gluon tutorials are hosted on [gluon.mxnet.io](http://gluon.mxnet.io), and you may want to follow the chapters on directly the Gluon site.
-
-
+## R Tutorials
+
+* Getting Started
+    * [Basic Classification & Regression](/tutorials/r/fiveMinutesNeuralNetwork.html)
+    * [Using a pre-trained model for Image Classification](/tutorials/r/classifyRealImageWithPretrainedModel.html)
+* Models
+    * [MNIST Handwritten Digit Classification with Convolutional Network](/tutorials/r/mnistCompetition.html)
+    * [Shakespeare generation with Character-level RNN](/tutorials/r/charRnnModel.html)
+* API Guides
+    * [NDArray API](/tutorials/r/ndarray.html)
+    * [Symbol API](/tutorials/r/symbol.html)
+    * [Callbacks](/tutorials/r/CallbackFunction.html)
+    * [Custom Data Iterators](/tutorials/r/CustomIterator.html)
+    * [Custom Loss Functions](/tutorials/r/CustomLossFunction.html)
+ 
+<hr>
+ 
 ## Contributing Tutorials
 
-Want to contribute an MXNet tutorial? To get started, [review these details](https://github.com/apache/incubator-mxnet/tree/master/example#contributing) on example and tutorial writing.
+We really appreciate contributions, and tutorials are a great way to share your knowledge and help the community. After you have followed [these steps](https://github.com/apache/incubator-mxnet/tree/master/example#contributing), please submit a pull request on GitHub.
+
+If you have any feedback on this section, please raise an issue on GitHub.
diff --git a/docs/tutorials/onnx/fine_tuning_gluon.md b/docs/tutorials/onnx/fine_tuning_gluon.md
index fc940fc36f9..07d8bdf0aa4 100644
--- a/docs/tutorials/onnx/fine_tuning_gluon.md
+++ b/docs/tutorials/onnx/fine_tuning_gluon.md
@@ -40,7 +40,7 @@ logging.basicConfig(level=logging.INFO)
 
 
 ### Downloading supporting files
-These are images and a vizualisation script
+These are images and a visualization script:
 
 
 ```python
@@ -59,12 +59,12 @@ from utils import *
 
 ## Downloading a model from the ONNX model zoo
 
-We download a pre-trained model, in our case the [vgg16](https://arxiv.org/abs/1409.1556) model, trained on [ImageNet](http://www.image-net.org/) from the [ONNX model zoo](https://github.com/onnx/models). The model comes packaged in an archive `tar.gz` file containing an `model.onnx` model file and some sample input/output data.
+We download a pre-trained model, in our case the [GoogleNet](https://arxiv.org/abs/1409.4842) model, trained on [ImageNet](http://www.image-net.org/), from the [ONNX model zoo](https://github.com/onnx/models). The model comes packaged as a `tar.gz` archive containing a `model.onnx` model file.
 
 
 ```python
-base_url = "https://s3.amazonaws.com/download.onnx/models/"
-current_model = "vgg16"
+base_url = "https://s3.amazonaws.com/download.onnx/models/opset_3/"
+current_model = "bvlc_googlenet"
 model_folder = "model"
 archive_file = "{}.tar.gz".format(current_model)
 archive_path = os.path.join(model_folder, archive_file)
@@ -230,15 +230,15 @@ sym.get_internals()
 
 
 
-```<Symbol group [gpu_0/data_0, gpu_0/conv1_w_0, gpu_0/conv1_b_0, convolution0, relu0, lrn0, pad0, pooling0, gpu_0/conv2_w_0, gpu_0/conv2_b_0, convolution1, relu1, lrn1, pad1, pooling1, gpu_0/conv3_w_0, gpu_0/conv3_b_0, convolution2, relu2, gpu_0/conv4_w_0, gpu_0/conv4_b_0, convolution3, relu3, gpu_0/conv5_w_0, gpu_0/conv5_b_0, convolution4, relu4, pad2, pooling2, flatten0, gpu_0/fc6_w_0, linalg_gemm20, gpu_0/fc6_b_0, _mulscalar0, broadcast_add0, relu5, flatten1, gpu_0/fc7_w_0, linalg_gemm21, gpu_0/fc7_b_0, _mulscalar1, broadcast_add1, relu6, flatten2, gpu_0/fc8_w_0, linalg_gemm22, gpu_0/fc8_b_0, _mulscalar2, broadcast_add2, softmax0]>```<!--notebook-skip-line-->
+```<Symbol group [data_0, pad0, conv1/7x7_s2_w_0, conv1/7x7_s2_b_0, convolution0, relu0, pad1, pooling0, lrn0, pad2, conv2/3x3_reduce_w_0, conv2/3x3_reduce_b_0, convolution1, relu1, pad3, conv2/3x3_w_0, conv2/3x3_b_0, convolution2, relu2, lrn1, pad4, pooling1, pad5, inception_3a/1x1_w_0, inception_3a/1x1_b_0, convolution3, relu3, pad6, .................................................................................inception_5b/pool_proj_b_0, convolution56, relu56, concat8, pad70, pooling13, dropout0, flatten0, loss3/classifier_w_0, linalg_gemm20, loss3/classifier_b_0, _mulscalar0, broadcast_add0, softmax0]>```<!--notebook-skip-line-->
 
 
 
-We get the network until the output of the `relu6` layer
+We take the network up to the output of the `flatten0` layer:
 
 
 ```python
-new_sym, new_arg_params, new_aux_params = get_layer_output(sym, arg_params, aux_params, 'relu6')
+new_sym, new_arg_params, new_aux_params = get_layer_output(sym, arg_params, aux_params, 'flatten0')
 ```
 
 ### Fine-tuning in gluon
@@ -258,7 +258,7 @@ We create a symbol block that is going to hold all our pre-trained layers, and a
 
 
 ```python
-pre_trained = gluon.nn.SymbolBlock(outputs=new_sym, inputs=mx.sym.var('gpu_0/data_0'))
+pre_trained = gluon.nn.SymbolBlock(outputs=new_sym, inputs=mx.sym.var('data_0'))
 net_params = pre_trained.collect_params()
 for param in new_arg_params:
     if param in net_params:
@@ -299,7 +299,7 @@ Initialize trainer with common training parameters
 
 
 ```python
-LEARNING_RATE = 0.001
+LEARNING_RATE = 0.0005
 WDECAY = 0.00001
 MOMENTUM = 0.9
 ```
@@ -349,7 +349,7 @@ print("Untrained network Test Accuracy: {0:.4f}".format(evaluate_accuracy_gluon(
 
 ```python
 val_accuracy = 0
-for epoch in range(20):
+for epoch in range(5):
     for i, (data, label) in enumerate(dataloader_train):
         data = data.astype(np.float32).as_in_context(ctx)
         label = label.as_in_context(ctx)
@@ -430,4 +430,4 @@ plot_predictions(caltech101_images_test, result, categories, TOP_P)
 
 **Great!** The network classified these images correctly after being fine-tuned on a dataset that contains images of `wrench`, `dolphin` and `lotus`
 
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/onnx/inference_on_onnx_model.md b/docs/tutorials/onnx/inference_on_onnx_model.md
index f342dad9bea..b2522ad0c1f 100644
--- a/docs/tutorials/onnx/inference_on_onnx_model.md
+++ b/docs/tutorials/onnx/inference_on_onnx_model.md
@@ -51,12 +51,12 @@ from utils import *
 
 ## Downloading a model from the ONNX model zoo
 
-We download a pre-trained model, in our case the [vgg16](https://arxiv.org/abs/1409.1556) model, trained on [ImageNet](http://www.image-net.org/) from the [ONNX model zoo](https://github.com/onnx/models). The model comes packaged in an archive `tar.gz` file containing an `model.onnx` model file and some sample input/output data.
+We download a pre-trained model, in our case the [GoogleNet](https://arxiv.org/abs/1409.4842) model, trained on [ImageNet](http://www.image-net.org/) from the [ONNX model zoo](https://github.com/onnx/models). The model comes packaged in a `tar.gz` archive containing a `model.onnx` model file.
 
 
 ```python
-base_url = "https://s3.amazonaws.com/download.onnx/models/" 
-current_model = "vgg16"
+base_url = "https://s3.amazonaws.com/download.onnx/models/opset_3/" 
+current_model = "bvlc_googlenet"
 model_folder = "model"
 archive = "{}.tar.gz".format(current_model)
 archive_file = os.path.join(model_folder, archive)
@@ -97,29 +97,37 @@ We get the symbol and parameter objects
 sym, arg_params, aux_params = onnx_mxnet.import_model(onnx_path)
 ```
 
-We pick a context, GPU if available, otherwise CPU
+We pick a context. CPU is fine for inference; switch to `mx.gpu()` if you want to use your GPU.
 
 
 ```python
-ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
+ctx = mx.cpu()
 ```
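+
+If a GPU is available, the context can instead be picked automatically:
+
+```python
+ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
+```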
 
-We obtain the data names of the inputs to the model, by listing all the inputs to the symbol graph and excluding the argument and auxiliary parameters from that list:
+We obtain the data names of the inputs to the model by using the model metadata API: 
 
 ```python
-data_names = [graph_input for graph_input in sym.list_inputs()
-                      if graph_input not in arg_params and graph_input not in aux_params]
-print(data_names)
+model_metadata = onnx_mxnet.get_model_metadata(onnx_path)
+print(model_metadata)
 ```
 
+```
+{'output_tensor_data': [(u'gpu_0/softmax_1', (1L, 1000L))],
+ 'input_tensor_data': [(u'gpu_0/data_0', (1L, 3L, 224L, 224L))]}
+```
+
+```python
+data_names = [inputs[0] for inputs in model_metadata.get('input_tensor_data')]
+print(data_names)
+```
 
-```['gpu_0/data_0']```
 
+```[u'data_0']```<!--notebook-skip-line-->
 
 And load them into an MXNet Gluon symbol block.
 
 ```python
-net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('gpu_0/data_0'))
+net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('data_0'))
 net_params = net.collect_params()
 for param in arg_params:
     if param in net_params:
@@ -137,30 +145,6 @@ We can now cache the computational graph through [hybridization](https://mxnet.i
 net.hybridize()
 ```
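+
+Hybridization is lazy: the graph is built and cached on the first forward pass. A minimal sketch, using the `(1, 3, 224, 224)` input shape reported by the model metadata above:
+
+```python
+dummy = mx.nd.ones((1, 3, 224, 224), ctx=ctx)
+net(dummy)  # the first call builds the cached graph; later calls reuse it
+```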
 
-## Test using sample inputs and outputs
-The model comes with sample input/output we can use to test that whether model is correctly loaded
-
-
-```python
-numpy_path = os.path.join(model_folder, current_model, 'test_data_0.npz')
-sample = np.load(numpy_path, encoding='bytes')
-inputs = sample['inputs']
-outputs = sample['outputs']
-```
-
-
-```python
-print("Input format: {}".format(inputs[0].shape))
-print("Output format: {}".format(outputs[0].shape))
-```
-
-`Input format: (1, 3, 224, 224)` <!--notebook-skip-line-->
-
-
-`Output format: (1, 1000)` <!--notebook-skip-line-->
-    
-
-
 We can visualize the network (requires graphviz installed)
 
 
@@ -169,9 +153,7 @@ mx.visualization.plot_network(sym,  node_attrs={"shape":"oval","fixedsize":"fals
 ```
 
 
-
-
-![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/network.png?raw=true)<!--notebook-skip-line-->
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/network2.png?raw=true)<!--notebook-skip-line-->
 
 
 
@@ -187,21 +169,6 @@ def run_batch(net, data):
     return np.array(results)
 ```
 
-
-```python
-result = run_batch(net, nd.array([inputs[0]], ctx))
-```
-
-
-```python
-print("Loaded model and sample output predict the same class: {}".format(np.argmax(result) == np.argmax(outputs[0])))
-```
-
-Loaded model and sample output predict the same class: True <!--notebook-skip-line-->
-
-
-Good the sample output and our prediction match, now we can run against real data
-
 ## Test using real images
 
 
@@ -265,4 +232,4 @@ We show that in our next tutorial:
 
 - [Fine-tuning an ONNX Model using the modern imperative MXNet/Gluon](http://mxnet.incubator.apache.org/tutorials/onnx/fine_tuning_gluon.html)
     
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/python/predict_image.md b/docs/tutorials/python/predict_image.md
index c78a5d5a98b..3e68be07fa7 100644
--- a/docs/tutorials/python/predict_image.md
+++ b/docs/tutorials/python/predict_image.md
@@ -1,7 +1,6 @@
 # Predict with pre-trained models
 
-This tutorial explains how to recognize objects in an image with a
-pre-trained model, and how to perform feature extraction.
+This tutorial explains how to recognize objects in an image with a pre-trained model, and how to perform feature extraction.
 
 ## Prerequisites
 
@@ -9,25 +8,21 @@ To complete this tutorial, we need:
 
 - MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html)
 
-- [Python Requests](http://docs.python-requests.org/en/master/), [Matplotlib](https://matplotlib.org/) and [Jupyter Notebook](http://jupyter.org/index.html).
+- [Matplotlib](https://matplotlib.org/) and [Jupyter Notebook](http://jupyter.org/index.html).
 
 ```
-$ pip install requests matplotlib jupyter opencv-python
+$ pip install matplotlib
 ```
 
 ## Loading
 
-We first download a pre-trained ResNet 152 layer that is trained on the full
-ImageNet dataset with over 10 million images and 10 thousand classes. A
-pre-trained model contains two parts, a json file containing the model
-definition and a binary file containing the parameters. In addition, there may be
-a text file for the labels.
+We first download a pre-trained ResNet 18 model that is trained on the ImageNet dataset with over 1 million images and one thousand classes. A pre-trained model contains two parts, a json file containing the model definition and a binary file containing the parameters. In addition, there may be a `synset.txt` text file for the labels.
 
 ```python
 import mxnet as mx
-path='http://data.mxnet.io/models/imagenet-11k/'
-[mx.test_utils.download(path+'resnet-152/resnet-152-symbol.json'),
- mx.test_utils.download(path+'resnet-152/resnet-152-0000.params'),
+path='http://data.mxnet.io/models/imagenet/'
+[mx.test_utils.download(path+'resnet/18-layers/resnet-18-0000.params'),
+ mx.test_utils.download(path+'resnet/18-layers/resnet-18-symbol.json'),
  mx.test_utils.download(path+'synset.txt')]
 ```
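+
+The `synset.txt` file downloaded above holds the human-readable labels; a minimal sketch of loading it (the prediction code below relies on this mapping):
+
+```python
+with open('synset.txt', 'r') as f:
+    labels = [l.rstrip() for l in f]
+print(labels[:2])
+```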
 
@@ -39,7 +34,7 @@ ctx = mx.cpu()
 ```
 
 ```python
-sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-152', 0)
+sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-18', 0)
 mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
 mod.bind(for_training=False, data_shapes=[('data', (1,3,224,224))], 
          label_shapes=mod._label_shapes)
@@ -56,7 +51,6 @@ prediction:
 ```python
 %matplotlib inline
 import matplotlib.pyplot as plt
-import cv2
 import numpy as np
 # define a simple data batch
 from collections import namedtuple
@@ -65,23 +59,22 @@ Batch = namedtuple('Batch', ['data'])
 def get_image(url, show=False):
     # download and show the image
     fname = mx.test_utils.download(url)
-    img = cv2.cvtColor(cv2.imread(fname), cv2.COLOR_BGR2RGB)
+    img = mx.image.imread(fname)
     if img is None:
-         return None
+        return None
     if show:
-         plt.imshow(img)
-         plt.axis('off')
+        plt.imshow(img.asnumpy())
+        plt.axis('off')
     # convert into format (batch, RGB, width, height)
-    img = cv2.resize(img, (224, 224))
-    img = np.swapaxes(img, 0, 2)
-    img = np.swapaxes(img, 1, 2)
-    img = img[np.newaxis, :]
+    img = mx.image.imresize(img, 224, 224) # resize
+    img = img.transpose((2, 0, 1)) # Channel first
+    img = img.expand_dims(axis=0) # batchify
     return img
 
 def predict(url):
     img = get_image(url, show=True)
     # compute the predict probabilities
-    mod.forward(Batch([mx.nd.array(img)]))
+    mod.forward(Batch([img]))
     prob = mod.get_outputs()[0].asnumpy()
     # print the top-5
     prob = np.squeeze(prob)
@@ -96,19 +89,27 @@ Now, we can perform prediction with any downloadable URL:
 predict('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true')
 ```
 
+`probability=0.249607, class=n02119022 red fox, Vulpes vulpes` <!--notebook-skip-line-->
+
+`probability=0.172868, class=n02119789 kit fox, Vulpes macrotis` <!--notebook-skip-line-->
+
+![](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true) <!--notebook-skip-line-->
+
 ```python
 predict('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true')
 ```
 
+`probability=0.873920, class=n02110958 pug, pug-dog` <!--notebook-skip-line-->
+
+`probability=0.102659, class=n02108422 bull mastiff` <!--notebook-skip-line-->
+
+![](https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true) <!--notebook-skip-line-->
+
 ## Feature extraction
 
-By feature extraction, we mean presenting the input images by the output of an
-internal layer rather than the last softmax layer. These outputs, which can be
-viewed as the feature of the raw input image, can then be used by other
-applications such as object detection.
+By feature extraction, we mean presenting the input images by the output of an internal layer rather than the last softmax layer. These outputs, which can be viewed as the feature of the raw input image, can then be used by other applications such as object detection.
 
-We can use the ``get_internals`` method to get all internal layers from a
-Symbol.
+We can use the ``get_internals`` method to get all internal layers from a Symbol.
 
 ```python
 # list the last 10 layers
@@ -116,11 +117,20 @@ all_layers = sym.get_internals()
 all_layers.list_outputs()[-10:]
 ```
 
-An often used layer for feature extraction is the one before the last fully
-connected layer. For ResNet, and also Inception, it is the flattened layer with
-name `flatten0` which reshapes the 4-D convolutional layer output into 2-D for
-the fully connected layer. The following source code extracts a new Symbol which
-outputs the flattened layer and creates a model.
+```
+['bn1_moving_var',
+ 'bn1_output',
+ 'relu1_output',
+ 'pool1_output',
+ 'flatten0_output',
+ 'fc1_weight',
+ 'fc1_bias',
+ 'fc1_output',
+ 'softmax_label',
+ 'softmax_output']
+```
+
+An often used layer for feature extraction is the one before the last fully connected layer. For ResNet, and also Inception, it is the flattened layer with name `flatten0` which reshapes the 4-D convolutional layer output into 2-D for the fully connected layer. The following source code extracts a new Symbol which outputs the flattened layer and creates a model.
 
 ```python
 fe_sym = all_layers['flatten0_output']
@@ -133,10 +143,11 @@ We can now invoke `forward` to obtain the features:
 
 ```python
 img = get_image('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true')
-fe_mod.forward(Batch([mx.nd.array(img)]))
-features = fe_mod.get_outputs()[0].asnumpy()
-print(features)
-assert features.shape == (1, 2048)
+fe_mod.forward(Batch([img]))
+features = fe_mod.get_outputs()[0]
+print('Shape', features.shape)
+print(features.asnumpy())
+assert features.shape == (1, 512)
 ```
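+
+A minimal sketch of putting the extracted features to use, reusing `get_image`, `fe_mod`, and `Batch` from above to compare the cat and dog images by cosine similarity:
+
+```python
+f1 = features.asnumpy().ravel()  # copy the cat features before running a second image
+img2 = get_image('https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true')
+fe_mod.forward(Batch([img2]))
+f2 = fe_mod.get_outputs()[0].asnumpy().ravel()
+print('Cosine similarity:', float(np.dot(f1, f2) / (np.linalg.norm(f1) * np.linalg.norm(f2))))
+```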
 
 <!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/python/types_of_data_augmentation.md b/docs/tutorials/python/types_of_data_augmentation.md
index 4ec461d68ec..4308932bf48 100644
--- a/docs/tutorials/python/types_of_data_augmentation.md
+++ b/docs/tutorials/python/types_of_data_augmentation.md
@@ -302,7 +302,7 @@ Some shortcut functions are provided to perform multiple augmentation in a singl
 ```python
 # A random crop, with a random resizing, and random aspect ratio jitter
 example_image_copy = example_image.copy()
-aug = mx.image.RandomSizedCropAug(size=(100, 100), min_area=0.1, ratio=(1.0, 1.5))
+aug = mx.image.RandomSizedCropAug(size=(100, 100), area=0.1, ratio=(1.0, 1.5))
 aug_image = aug(example_image_copy)
 plot_mx_array(aug_image)
 
diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md
index c93d99c4504..8cdf9249b89 100644
--- a/docs/tutorials/scala/mxnet_scala_on_intellij.md
+++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md
@@ -132,7 +132,7 @@ After clicking Finish, you will be presented with the project's first view.
 The project's `pom.xml` will be open for editing.
 
 **Step 3.** Setup project properties:
-  - Specify project properties in `pom.xml` by pasting the following content in the `<properties>` tag. You will be overwriting the `<scala.version>` tag in the process, upgrading from `2.11.5` to `2.11.8`.
+  - Specify project properties in `pom.xml` by pasting the following content in the `properties` tag. You will be overwriting the `scala.version` tag in the process, upgrading from `2.11.5` to `2.11.8`.
 
 ```xml
 <properties>
@@ -143,7 +143,7 @@ The project's `pom.xml` will be open for editing.
 
 **Step 4.** Setup project profiles and platforms:
 
-  - Specify project profiles and platforms in `pom.xml` by pasting the following content below the `</properties>` tag:
+  - Specify project profiles and platforms in `pom.xml` by pasting the following content below the closing `properties` tag:
 
 ```xml
 <profiles>
@@ -170,7 +170,7 @@ The project's `pom.xml` will be open for editing.
 
 **Step 5.** Setup project dependencies:
 
-  - Specify project dependencies in `pom.xml` adding the dependencies listed below. Place them inside the `<dependencies>` tag:
+  - Specify project dependencies in `pom.xml` adding the dependencies listed below. Place them inside the `dependencies` tag:
 
 ```xml
 <dependencies>
@@ -233,7 +233,7 @@ The project's `pom.xml` will be open for editing.
 
 ![project 2](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/scala/intellij-project-2.png)
 
-Note the `<systemPath>` tag and update it to match the file path to the jar file that was created when you built the MXNet-Scala package. It can be found in the `mxnet-incubator/scala-package/assembly/{platform}/target` directory, and is named with the pattern `mxnet-full_${scala.binary.version}-${platform}-{version-SNAPSHOT}.jar`.
+Note the `systemPath` tag and update it to match the file path to the jar file that was created when you built the MXNet-Scala package. It can be found in the `mxnet-incubator/scala-package/assembly/{platform}/target` directory, and is named with the pattern `mxnet-full_${scala.binary.version}-${platform}-{version-SNAPSHOT}.jar`.
 
 **Step 6.** Import dependencies with Maven:
 
@@ -246,7 +246,7 @@ Click "Import Changes" in this prompt.
 **Step 7.** Build the project:
 - To build the project, from the menu choose Build, and then choose Build Project.
 
-**Note**: During the build you may experience `[ERROR] scalac error: bad option: '-make:transitive'`. You can fix this by deleting or commenting this out in your `pom.xml`. This line in question is: `<arg>-make:transitive</arg>`.
+**Note**: During the build you may experience `[ERROR] scalac error: bad option: '-make:transitive'`. You can fix this by deleting the offending line or commenting it out in your `pom.xml`. The line in question is the `arg` tag containing `-make:transitive`.
 
 **Step 8.** Run the Hello World App:
 
@@ -306,5 +306,5 @@ The build generates a new jar file in the `target` folder called `scalaInference
 For more information about MXNet Scala resources, see the following:
 
 * [Scala API](http://mxnet.io/api/scala/)
-* [More Scala Examples](https://github.com/incubator-mxnet/tree/master/scala-package/examples/)
+* [More Scala Examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/)
 * [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
diff --git a/docs/tutorials/vision/cnn_visualization.md b/docs/tutorials/vision/cnn_visualization.md
new file mode 100644
index 00000000000..ea027dff09a
--- /dev/null
+++ b/docs/tutorials/vision/cnn_visualization.md
@@ -0,0 +1,245 @@
+# Visualizing Decisions of Convolutional Neural Networks
+
+Convolutional Neural Networks have made a lot of progress in Computer Vision. Their accuracy is as good as that of humans in some tasks. However, it remains hard to explain the predictions of convolutional neural networks, as they lack the interpretability offered by other models, for example decision trees.
+
+It is often helpful to be able to explain why a model made the prediction it did. For example, when a model misclassifies an image, it is hard to say why without visualizing the network's decision.
+
+<img align="right" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/volcano_barn_spider.png" alt="Explaining the misclassification of volcano as spider" width=500px/>
+
+Visualizations also help build confidence in the predictions of a model. For example, even if a model correctly predicts birds as birds, we would want to confirm that the model bases its decision on the features of the bird and not on the features of some other object that tends to occur together with birds in the dataset (like leaves).
+
+In this tutorial, we show how to visualize the predictions made by convolutional neural networks using [Gradient-weighted Class Activation Mapping](https://arxiv.org/abs/1610.02391). Unlike many other visualization methods, Grad-CAM can be used on a wide variety of CNN model families - CNNs with fully connected layers, CNNs used for structured outputs (e.g. captioning), CNNs used in tasks with multi-modal input (e.g. VQA) or reinforcement learning - without architectural changes or re-training.
+
+In the rest of this notebook, we will explain how to visualize predictions made by [VGG-16](https://arxiv.org/abs/1409.1556). We begin by importing the required dependencies. The `gradcam` module contains the implementation of the visualization techniques used in this notebook.
+
+```python
+from __future__ import print_function
+
+import mxnet as mx
+from mxnet import gluon
+
+from matplotlib import pyplot as plt
+import numpy as np
+
+gradcam_file = "gradcam.py" 
+base_url = "https://raw.githubusercontent.com/indhub/mxnet/cnnviz/example/cnn_visualization/{}?raw=true"
+mx.test_utils.download(base_url.format(gradcam_file), fname=gradcam_file)
+import gradcam
+```
+
+## Building the network to visualize
+
+Next, we build the network we want to visualize. For this example, we will use the [VGG-16](https://arxiv.org/abs/1409.1556) network. This code was taken from the Gluon [model zoo](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/model_zoo/vision/alexnet.py) and refactored to make it easy to switch between `gradcam`'s and Gluon's implementation of ReLU and Conv2D. The same code can be used for both training and visualization with a minor (one line) change.
+
+Notice that we import ReLU and Conv2D from the `gradcam` module instead of `mxnet.gluon.nn`.
+- We use a modified ReLU because we use guided backpropagation for visualization, and guided backprop requires the ReLU layers to block the backward flow of negative gradients corresponding to the neurons which decrease the activation of the higher layer unit we aim to visualize. Check [this](https://arxiv.org/abs/1412.6806) paper to learn more about guided backprop.
+- We use a modified Conv2D (a wrapper on top of Gluon's Conv2D) because we want to capture the output of a given convolutional layer and its gradients. This is needed to implement Grad-CAM. Check [this](https://arxiv.org/abs/1610.02391) paper to learn more about Grad-CAM.
+
+When you train the network, you could just import `Activation` and `Conv2D` from `gluon.nn` instead. No other part of the code needs any change to switch between training and visualization.
+
+```python
+import os
+from mxnet.gluon.model_zoo import model_store
+
+from mxnet.initializer import Xavier
+from mxnet.gluon.nn import MaxPool2D, Flatten, Dense, Dropout, BatchNorm
+from gradcam import Activation, Conv2D
+
+class VGG(mx.gluon.HybridBlock):
+    def __init__(self, layers, filters, classes=1000, **kwargs):
+        super(VGG, self).__init__(**kwargs)
+        assert len(layers) == len(filters)
+        with self.name_scope():
+            self.features = self._make_features(layers, filters)
+            self.features.add(Dense(4096, activation='relu',
+                                       weight_initializer='normal',
+                                       bias_initializer='zeros'))
+            self.features.add(Dropout(rate=0.5))
+            self.features.add(Dense(4096, activation='relu',
+                                       weight_initializer='normal',
+                                       bias_initializer='zeros'))
+            self.features.add(Dropout(rate=0.5))
+            self.output = Dense(classes,
+                                   weight_initializer='normal',
+                                   bias_initializer='zeros')
+
+    def _make_features(self, layers, filters):
+        featurizer = mx.gluon.nn.HybridSequential(prefix='')
+        for i, num in enumerate(layers):
+            for _ in range(num):
+                featurizer.add(Conv2D(filters[i], kernel_size=3, padding=1,
+                                         weight_initializer=Xavier(rnd_type='gaussian',
+                                                                   factor_type='out',
+                                                                   magnitude=2),
+                                         bias_initializer='zeros'))
+                featurizer.add(Activation('relu'))
+            featurizer.add(MaxPool2D(strides=2))
+        return featurizer
+
+    def hybrid_forward(self, F, x):
+        x = self.features(x)
+        x = self.output(x)
+        return x
+```
+
+## Loading pretrained weights
+
+We'll use pre-trained weights (trained on ImageNet) from the model zoo instead of training the model from scratch.
+
+```python
+# Number of convolution layers and number of filters for each VGG configuration.
+# Check the VGG [paper](https://arxiv.org/abs/1409.1556) for more details on the different architectures.
+vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+
+def get_vgg(num_layers, ctx=mx.cpu(), root=os.path.join('~', '.mxnet', 'models'), **kwargs):
+
+    # Get the number of convolution layers and filters
+    layers, filters = vgg_spec[num_layers]
+
+    # Build the VGG network
+    net = VGG(layers, filters, **kwargs)
+
+    # Load pretrained weights from model zoo
+    from mxnet.gluon.model_zoo.model_store import get_model_file
+    net.load_params(get_model_file('vgg%d' % num_layers, root=root), ctx=ctx)
+
+    return net
+
+def vgg16(**kwargs):
+    return get_vgg(16, **kwargs)
+```
+
+## Preprocessing and other helpers
+
+We'll resize the input image to 224x224 before feeding it to the network. We normalize the images using the same parameters that were used to normalize the ImageNet dataset when the pretrained model was created. These parameters are published [here](https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html). We use `transpose` to convert the image to channel-first format.
+
+Note that we do not hybridize the network. This is because we want `gradcam.Activation` and `gradcam.Conv2D` to behave differently at different times during the execution. For example, `gradcam.Activation` will do the regular backpropagation while computing the gradient of the topmost convolutional layer but will do guided backpropagation when computing the gradient of the image.
+
+```python
+image_sz = (224, 224)
+
+def preprocess(data):
+    data = mx.image.imresize(data, image_sz[0], image_sz[1])
+    data = data.astype(np.float32)
+    data = data/255
+    data = mx.image.color_normalize(data,
+                                    mean=mx.nd.array([0.485, 0.456, 0.406]),
+                                    std=mx.nd.array([0.229, 0.224, 0.225]))
+    data = mx.nd.transpose(data, (2,0,1))
+    return data
+
+network = vgg16(ctx=mx.cpu())
+```
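+
+A quick sanity check on the preprocessing, using a dummy array in place of a decoded image:
+
+```python
+dummy = mx.nd.zeros((300, 400, 3), dtype=np.uint8)
+print(preprocess(dummy).shape)  # (3, 224, 224): channel-first, ready for a batch dimension
+```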
+
+We define a helper to display multiple images in a row in Jupyter notebook.
+
+```python
+def show_images(pred_str, images):
+    titles = [pred_str, 'Grad-CAM', 'Guided Grad-CAM', 'Saliency Map']
+    num_images = len(images)
+    fig=plt.figure(figsize=(15,15))
+    rows, cols = 1, num_images
+    for i in range(num_images):
+        fig.add_subplot(rows, cols, i+1)
+        plt.xlabel(titles[i])
+        plt.imshow(images[i], cmap='gray' if i==num_images-1 else None)
+    plt.show()
+```
+
+Given an image, the network predicts a probability distribution over all categories. The most probable category can be found by applying the `argmax` operation. This gives an integer corresponding to the category. We still need to convert this to a human readable category name to know which category the network predicted. The [synset](http://data.mxnet.io/models/imagenet/synset.txt) file contains the mapping between ImageNet category indices and category names. We'll download the synset file and load it into a list to convert category indices to human readable category names.
+
+```python
+synset_url = "http://data.mxnet.io/models/imagenet/synset.txt"
+synset_file_name = "synset.txt"
+mx.test_utils.download(synset_url, fname=synset_file_name)
+synset = []
+with open('synset.txt', 'r') as f:
+    synset = [l.rstrip().split(' ', 1)[1].split(',')[0] for l in f]
+    
+def get_class_name(cls_id):
+    return "%s (%d)" % (synset[cls_id], cls_id)
+
+def run_inference(net, data):
+    out = net(data)
+    return out.argmax(axis=1).asnumpy()[0].astype(int)
+```
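+
+For example, index 94 maps to the hummingbird category used later in this tutorial:
+
+```python
+print(get_class_name(94))  # hummingbird (94)
+```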
+
+## Visualizing CNN decisions
+
+Next, we'll write a method to get an image, preprocess it, predict category and visualize the prediction. We'll use `gradcam.visualize()` to create the visualizations. `gradcam.visualize` returns a tuple with the following visualizations:
+
+1. **Grad-CAM:** This is a heatmap superimposed on the input image showing which part(s) of the image contributed most to the CNN's decision.
+2. **Guided Grad-CAM:** Guided Grad-CAM shows which exact pixels contributed the most to the CNN's decision.
+3. **Saliency map:** Saliency map is a monochrome image showing which pixels contributed the most to the CNN's decision. Sometimes, it is easier to see the areas in the image that most influence the output in a monochrome image than in a color image.
+
+```python
+def visualize(net, img_path, conv_layer_name):
+    orig_img = mx.img.imread(img_path)
+    preprocessed_img = preprocess(orig_img)
+    preprocessed_img = preprocessed_img.expand_dims(axis=0)
+    
+    pred_str = get_class_name(run_inference(net, preprocessed_img))
+    
+    orig_img = mx.image.imresize(orig_img, image_sz[0], image_sz[1]).asnumpy()
+    vizs = gradcam.visualize(net, preprocessed_img, orig_img, conv_layer_name)
+    return (pred_str, (orig_img, *vizs))
+```
+
+Next, we need to get the name of the last convolutional layer that extracts features from the image. We use the gradient information flowing into the last convolutional layer of the CNN to understand the importance of each neuron for a decision of interest. We are interested in the last convolutional layer because convolutional features naturally retain spatial information which is lost in fully connected layers. So, we expect the last convolutional layer to have the best compromise between high level semantics and detailed spatial information. The neurons in this layer look for semantic class specific information in the image (like object parts).
+
+In our network, feature extractors are added to a HybridSequential block named `features`. You can list the layers in that block by just printing `network.features`. You can see that the topmost convolutional layer is at index 28. `network.features[28]._name` will give the name of the layer.
+
+```python
+last_conv_layer_name = network.features[28]._name
+print(last_conv_layer_name)
+```
+`vgg0_conv2d12` <!--notebook-skip-line-->
+
+Let's download some images we can use for visualization.
+
+```python
+images = ["hummingbird.jpg", "jellyfish.jpg", "snow_leopard.jpg", "volcano.jpg"]
+base_url = "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/{}?raw=true"
+for image in images:
+    mx.test_utils.download(base_url.format(image), fname=image)
+```
+
+We now have everything we need to start visualizing. Let's visualize the CNN decision for the images we downloaded.
+
+```python
+show_images(*visualize(network, "hummingbird.jpg", last_conv_layer_name))
+```
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/hummingbird.png" alt="Visualizing CNN decision"/><!--notebook-skip-line-->
+
+```python
+show_images(*visualize(network, "jellyfish.jpg", last_conv_layer_name))
+```
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/jellyfish.png" alt="Visualizing CNN decision"/><!--notebook-skip-line-->
+
+```python
+show_images(*visualize(network, "snow_leopard.jpg", last_conv_layer_name))
+```
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/snow_leopard.png" alt="Visualizing CNN decision"/><!--notebook-skip-line-->
+
+Shown above are some images the network was able to predict correctly. We can see that the network is basing its decision on the appropriate features. Now, let's look at an example where the network gets the prediction wrong and visualize why.
+
+```python
+show_images(*visualize(network, "volcano.jpg", last_conv_layer_name))
+```
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/volcano.png" alt="Visualizing CNN decision"/><!--notebook-skip-line-->
+
+While it is not immediately evident why the network thinks this volcano is a spider, after looking at the Grad-CAM visualization, it is hard to look at the volcano and not see the spider!
+
+Being able to visualize why a CNN predicts specific classes is a powerful tool to diagnose prediction failures. Even when the network is making correct predictions, visualizing activations is an important step to verify that the network is making its decisions based on the right features and not some correlation which happens to exist in the training data.
+
+The visualization method demonstrated in this tutorial applies to a wide variety of network architectures and a wide variety of tasks beyond classification, like VQA and image captioning. Any type of differentiable output can be used to create the visualizations shown above. Visualization techniques like these solve (at least partially) the long-standing problem of interpretability of neural networks.
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
+
diff --git a/example/MXNetTutorialTemplate.ipynb b/example/MXNetTutorialTemplate.ipynb
index 2ec9b8562a4..851a87f1824 100644
--- a/example/MXNetTutorialTemplate.ipynb
+++ b/example/MXNetTutorialTemplate.ipynb
@@ -32,7 +32,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A brief explanation of how the reader can use the tutorial. Can the reader copy each code snippet into a Python or other environment? Or can the reader run <filename> before or after reading through the explanations to understand how the code works?"
+    "A brief explanation of how the reader can use the tutorial. Can the reader copy each code snippet into a Python or other environment? Or can the reader run `<filename>` before or after reading through the explanations to understand how the code works?"
    ]
   },
   {
@@ -70,10 +70,10 @@
    "source": [
     "To complete this tutorial, you need:\n",
     "\n",
-    "- [MXNet](//http://mxnet.io/get_started/setup.html#overview)\n",
-    "- [Language](http://)\n",
-    "- [Tool](http://)\n",
-    "- Familiarity with concept or tool"
+    "- [MXNet](https://mxnet.incubator.apache.org/install/#overview)\n",
+    "- [Language](https://mxnet.incubator.apache.org/tutorials/)\n",
+    "- [Tool](https://mxnet.incubator.apache.org/api/python/index.html)\n",
+    "- [Familiarity with concept or tool](https://gluon.mxnet.io/)\n"
    ]
   },
   {
@@ -96,10 +96,9 @@
    "source": [
     "You can download the data used in this tutorial from the [Site Name](http://) site. To download the data:\n",
     "\n",
-    "1. At the <language> prompt, type:\n",
-    "\n",
-    "    ``command``\n",
+    "1. At the `<language>` prompt, type:\n",
     "\n",
+    "    `<command>`\n",
     "2. Second task.\n",
     "\n",
     "3. Last task."
@@ -109,7 +108,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Briefly describe key aspects of the data. If there are two or more aspects of the data that require involved discussion, use subheads (### <Concept or Sub-component Name>). To include a graphic, introduce it with a brief description and use the image linking tool to include it. Store the graphic in GitHub and use the following format: <img width=\"517\" alt=\"screen shot 2016-05-06 at 10 13 16 pm\" src=\"https://cloud.githubusercontent.com/assets/5545640/15089697/d6f4fca0-13d7-11e6-9331-7f94fcc7b4c6.png\">. You do not need to provide a title for your graphics. "
+    "Briefly describe key aspects of the data. If there are two or more aspects of the data that require involved discussion, use subheads (### `<Concept or Sub-component Name>`). To include a graphic, introduce it with a brief description and use the image linking tool to include it. Store the graphic in GitHub and use the following format: <img width=\"517\" alt=\"screen shot 2016-05-06 at 10 13 16 pm\" src=\"https://cloud.githubusercontent.com/assets/5545640/15089697/d6f4fca0-13d7-11e6-9331-7f94fcc7b4c6.png\">. You do not need to provide a title for your graphics. "
    ]
   },
   {
@@ -343,7 +342,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To *perform the task*, *provide explanation here.*"
+    "To *fperform the task*, *provide explanation here.*"
    ]
   },
   {
diff --git a/example/cnn_visualization/README.md b/example/cnn_visualization/README.md
new file mode 100644
index 00000000000..10b91492600
--- /dev/null
+++ b/example/cnn_visualization/README.md
@@ -0,0 +1,17 @@
+# Visualizing CNN decisions
+
+This folder contains an MXNet Gluon implementation of [Grad-CAM](https://arxiv.org/abs/1610.02391) that helps visualize CNN decisions.
+
+A tutorial on how to use this from Jupyter notebook is available [here](https://mxnet.incubator.apache.org/tutorials/vision/cnn_visualization.html).
+
+You can also do the visualization from terminal:
+```
+$ python gradcam_demo.py hummingbird.jpg
+Predicted category  : hummingbird (94)
+Original Image      : hummingbird_orig.jpg
+Grad-CAM            : hummingbird_gradcam.jpg
+Guided Grad-CAM     : hummingbird_guided_gradcam.jpg
+Saliency Map        : hummingbird_saliency.jpg
+```
+
+![Output of gradcam_demo.py](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/hummingbird_filenames.png)
diff --git a/example/cnn_visualization/gradcam.py b/example/cnn_visualization/gradcam.py
new file mode 100644
index 00000000000..a8708f78758
--- /dev/null
+++ b/example/cnn_visualization/gradcam.py
@@ -0,0 +1,263 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+
+import mxnet as mx
+import mxnet.ndarray as nd
+
+from mxnet import gluon
+from mxnet import autograd
+from mxnet.gluon import nn
+
+import numpy as np
+import cv2
+
+class ReluOp(mx.operator.CustomOp):
+    """Modified ReLU as described in section 3.4 in https://arxiv.org/abs/1412.6806.
+    This is used for guided backpropagation to get gradients of the image w.r.t activations.
+    This Operator will do a regular backpropagation if `guided_backprop` is set to False
+and a guided backpropagation if `guided_backprop` is set to True. Check gradcam_demo.py
+    for an example usage."""
+
+    guided_backprop = False
+
+    def forward(self, is_train, req, in_data, out_data, aux):
+        x = in_data[0]
+        y = nd.maximum(x, nd.zeros_like(x))
+        self.assign(out_data[0], req[0], y)
+
+    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+        if ReluOp.guided_backprop:
+            # Get output and gradients of output
+            y = out_data[0]
+            dy = out_grad[0]
+            # Zero out the negatives in the gradients of the output
+            dy_positives = nd.maximum(dy, nd.zeros_like(dy))
+            # What output values were greater than 0?
+            y_ones = y.__gt__(0)
+            # Mask out the values for which at least one of dy or y is negative
+            dx = dy_positives * y_ones
+            self.assign(in_grad[0], req[0], dx)
+        else:
+            # Regular backward for ReLU
+            x = in_data[0]
+            x_gt_zero = x.__gt__(0)
+            dx = out_grad[0] * x_gt_zero
+            self.assign(in_grad[0], req[0], dx)
+
+def set_guided_backprop(mode=True):
+    ReluOp.guided_backprop = mode
+
+@mx.operator.register("relu")
+class ReluProp(mx.operator.CustomOpProp):
+    def __init__(self):
+        super(ReluProp, self).__init__(True)
+
+    def infer_shape(self, in_shapes):
+        data_shape = in_shapes[0]
+        output_shape = data_shape
+        return (data_shape,), (output_shape,), ()
+
+    def create_operator(self, ctx, in_shapes, in_dtypes):
+        return ReluOp()  
+
+class Activation(mx.gluon.HybridBlock):
+    @staticmethod
+    def set_guided_backprop(mode=False):
+        ReluOp.guided_backprop = mode
+
+    def __init__(self, act_type, **kwargs):
+        assert act_type == 'relu'
+        super(Activation, self).__init__(**kwargs)
+
+    def hybrid_forward(self, F, x):
+        return F.Custom(x, op_type='relu')
+
+class Conv2D(mx.gluon.HybridBlock):
+    """Wrapper on top of gluon.nn.Conv2D to capture the output and gradients of output of a Conv2D
+    layer in a network. Use `set_capture_layer_name` to select the layer
+    whose outputs and gradients of outputs need to be captured. After the backward pass,
+    `conv_output` will contain the output and `conv_output.grad` will contain the
+    output's gradients. Check gradcam_demo.py for example usage."""
+
+    conv_output = None
+    capture_layer_name = None
+
+    def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
+                 dilation=(1, 1), groups=1, layout='NCHW',
+                 activation=None, use_bias=True, weight_initializer=None,
+                 bias_initializer='zeros', in_channels=0, **kwargs):
+        super(Conv2D, self).__init__(**kwargs)
+        self.conv = nn.Conv2D(channels, kernel_size, strides=strides, padding=padding,
+                             dilation=dilation, groups=groups, layout=layout,
+                             activation=activation, use_bias=use_bias, weight_initializer=weight_initializer,
+                             bias_initializer=bias_initializer, in_channels=in_channels)
+
+    def hybrid_forward(self, F, x):
+        out = self.conv(x)
+        name = self._prefix[:-1]
+        if name == Conv2D.capture_layer_name:
+            out.attach_grad()
+            Conv2D.conv_output = out
+        return out
+
+def set_capture_layer_name(name):
+    Conv2D.capture_layer_name = name
+
+def _get_grad(net, image, class_id=None, conv_layer_name=None, image_grad=False):
+    """This is an internal helper function that can be used for either of these
+    but not both at the same time:
+    1. Record the output and gradient of output of an intermediate convolutional layer.
+    2. Record the gradients of the image.
+
+    Parameters
+    ----------
+    net : Block
+        Network to run the image through.
+    image : NDArray
+        Image to visualize. This is an NDArray with the preprocessed image.
+    class_id : int
+        Category ID this image belongs to. If not provided,
+        network's prediction will be used.
+    conv_layer_name: str
+        Name of the convolutional layer whose output and output's gradients need to be captured.
+    image_grad: bool
+        Whether to capture gradients of the image."""
+
+    if image_grad:
+        image.attach_grad()
+        Conv2D.capture_layer_name = None
+        Activation.set_guided_backprop(True)
+    else:
+        # Tell convviz.Conv2D which layer's output and gradient needs to be recorded
+        Conv2D.capture_layer_name = conv_layer_name
+        Activation.set_guided_backprop(False)
+    
+    # Run the network
+    with autograd.record(train_mode=False):
+        out = net(image)
+    
+    # If user didn't provide a class id, we'll use the class that the network predicted
+    if class_id is None:
+        model_output = out.asnumpy()
+        class_id = np.argmax(model_output)
+
+    # Create a one-hot target with class_id and backprop with the created target
+    one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000)
+    out.backward(one_hot_target, train_mode=False)
+
+    if image_grad:
+        return image.grad[0].asnumpy()
+    else:
+        # Return the recorded convolution output and gradient
+        conv_out = Conv2D.conv_output
+        return conv_out[0].asnumpy(), conv_out.grad[0].asnumpy()
+
+def get_conv_out_grad(net, image, class_id=None, conv_layer_name=None):
+    """Get the output and gradients of output of a convolutional layer.
+
+    Parameters:
+    ----------
+    net: Block
+        Network to use for visualization.
+    image: NDArray
+        Preprocessed image to use for visualization.
+    class_id: int
+        Category ID this image belongs to. If not provided,
+        network's prediction will be used.
+    conv_layer_name: str
+        Name of the convolutional layer whose output and output's gradients need to be captured."""
+    return _get_grad(net, image, class_id, conv_layer_name, image_grad=False)
+
+def get_image_grad(net, image, class_id=None):
+    """Get the gradients of the image.
+
+    Parameters:
+    ----------
+    net: Block
+        Network to use for visualization.
+    image: NDArray
+        Preprocessed image to use for visualization.
+    class_id: int
+        Category ID this image belongs to. If not provided,
+        network's prediction will be used."""
+    return _get_grad(net, image, class_id, image_grad=True)
+
+def grad_to_image(gradient):
+    """Convert gradients of image obtained using `get_image_grad`
+    into image. This shows parts of the image that is most strongly activating
+    the output neurons."""
+    gradient = gradient - gradient.min()
+    gradient /= gradient.max()
+    gradient = np.uint8(gradient * 255).transpose(1, 2, 0)
+    gradient = gradient[..., ::-1]
+    return gradient
+
+def get_cam(imggrad, conv_out):
+    """Compute CAM. Refer section 3 of https://arxiv.org/abs/1610.02391 for details"""
+    weights = np.mean(imggrad, axis=(1, 2))
+    cam = np.ones(conv_out.shape[1:], dtype=np.float32)
+    for i, w in enumerate(weights):
+        cam += w * conv_out[i, :, :]
+    cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2]))
+    cam = np.maximum(cam, 0)
+    cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam)) 
+    cam = np.uint8(cam * 255)
+    return cam
+
+def get_guided_grad_cam(cam, imggrad):
+    """Compute Guided Grad-CAM. Refer section 3 of https://arxiv.org/abs/1610.02391 for details"""
+    return np.multiply(cam, imggrad)
+
+def get_img_heatmap(orig_img, activation_map):
+    """Draw a heatmap on top of the original image using intensities from activation_map"""
+    heatmap = cv2.applyColorMap(activation_map, cv2.COLORMAP_COOL)
+    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
+    img_heatmap = np.float32(heatmap) + np.float32(orig_img)
+    img_heatmap = img_heatmap / np.max(img_heatmap)
+    img_heatmap *= 255
+    return img_heatmap.astype(int)
+
+def to_grayscale(cv2im):
+    """Convert gradients to grayscale. This gives a saliency map."""
+    # How strongly does each position activate the output
+    grayscale_im = np.sum(np.abs(cv2im), axis=0)
+
+    # Normalize between min and 99th percentile
+    im_max = np.percentile(grayscale_im, 99)
+    im_min = np.min(grayscale_im)
+    grayscale_im = np.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1)
+
+    grayscale_im = np.expand_dims(grayscale_im, axis=0)
+    return grayscale_im
+
+def visualize(net, preprocessed_img, orig_img, conv_layer_name):
+    # Returns grad-cam heatmap, guided grad-cam, guided grad-cam saliency
+    imggrad = get_image_grad(net, preprocessed_img)
+    conv_out, conv_out_grad = get_conv_out_grad(net, preprocessed_img, conv_layer_name=conv_layer_name)
+
+    cam = get_cam(imggrad, conv_out)
+    
+    ggcam = get_guided_grad_cam(cam, imggrad)
+    img_ggcam = grad_to_image(ggcam)
+    
+    img_heatmap = get_img_heatmap(orig_img, cam)
+    
+    ggcam_gray = to_grayscale(ggcam)
+    img_ggcam_gray = np.squeeze(grad_to_image(ggcam_gray))
+    
+    return img_heatmap, img_ggcam, img_ggcam_gray
+
diff --git a/example/cnn_visualization/gradcam_demo.py b/example/cnn_visualization/gradcam_demo.py
new file mode 100644
index 00000000000..d9ca5ddade8
--- /dev/null
+++ b/example/cnn_visualization/gradcam_demo.py
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet import gluon
+
+import argparse
+import os
+import numpy as np
+import cv2
+
+import vgg
+import gradcam
+
+# Receive image path from command line
+parser = argparse.ArgumentParser(description='Grad-CAM demo')
+parser.add_argument('img_path', metavar='image_path', type=str, help='path to the image file')
+
+args = parser.parse_args()
+
+# We'll use VGG-16 for visualization
+network = vgg.vgg16(pretrained=True, ctx=mx.cpu())
+# We'll resize images to 224x224 as part of preprocessing
+image_sz = (224, 224)
+
+def preprocess(data):
+    """Preprocess the image before running it through the network"""
+    data = mx.image.imresize(data, image_sz[0], image_sz[1])
+    data = data.astype(np.float32)
+    data = data/255
+    # These mean values were obtained from
+    # https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html
+    data = mx.image.color_normalize(data,
+                                    mean=mx.nd.array([0.485, 0.456, 0.406]),
+                                    std=mx.nd.array([0.229, 0.224, 0.225]))
+    data = mx.nd.transpose(data, (2,0,1)) # Channel first
+    return data
+
+def read_image_mxnet(path):
+    with open(path, 'rb') as fp:
+        img_bytes = fp.read()
+    return mx.img.imdecode(img_bytes)
+
+def read_image_cv(path):
+    return cv2.resize(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB), image_sz)
+
+# synset.txt contains the names of Imagenet categories
+# Load the file to memory and create a helper method to query category_index -> category name
+synset_url = "http://data.mxnet.io/models/imagenet/synset.txt"
+synset_file_name = "synset.txt"
+mx.test_utils.download(synset_url, fname=synset_file_name)
+
+synset = []
+with open('synset.txt', 'r') as f:
+    synset = [l.rstrip().split(' ', 1)[1].split(',')[0] for l in f]
+    
+def get_class_name(cls_id):
+    return "%s (%d)" % (synset[cls_id], cls_id)
+
+def run_inference(net, data):
+    """Run the input image through the network and return the predicted category as integer"""
+    out = net(data)
+    return out.argmax(axis=1).asnumpy()[0].astype(int)
+
+def visualize(net, img_path, conv_layer_name):
+    """Create Grad-CAM visualizations using the network 'net' and the image at 'img_path'
+    conv_layer_name is the name of the top most layer of the feature extractor"""
+    image = read_image_mxnet(img_path)
+    image = preprocess(image)
+    image = image.expand_dims(axis=0)
+    
+    pred_str = get_class_name(run_inference(net, image))
+    
+    orig_img = read_image_cv(img_path)
+    vizs = gradcam.visualize(net, image, orig_img, conv_layer_name)
+    return (pred_str, (orig_img, *vizs))
+
+# Create Grad-CAM visualization for the user provided image
+last_conv_layer_name = 'vgg0_conv2d12'
+cat, vizs = visualize(network, args.img_path, last_conv_layer_name)
+
+print("{0:20}: {1:80}".format("Predicted category", cat))
+
+# Write the visualizations to files
+img_name = os.path.split(args.img_path)[1].split('.')[0]
+suffixes = ['orig', 'gradcam', 'guided_gradcam', 'saliency']
+image_desc = ['Original Image', 'Grad-CAM', 'Guided Grad-CAM', 'Saliency Map']
+
+for i, img in enumerate(vizs):
+    img = img.astype(np.float32)
+    if len(img.shape) == 3:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    out_file_name = "%s_%s.jpg" % (img_name, suffixes[i])
+    cv2.imwrite(out_file_name, img)
+    print("{0:20}: {1:80}".format(image_desc[i], out_file_name))
+
diff --git a/example/cnn_visualization/vgg.py b/example/cnn_visualization/vgg.py
new file mode 100644
index 00000000000..b6215a334e3
--- /dev/null
+++ b/example/cnn_visualization/vgg.py
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet import gluon
+
+import os
+from mxnet.gluon.model_zoo import model_store
+
+from mxnet.initializer import Xavier
+from mxnet.gluon.nn import MaxPool2D, Flatten, Dense, Dropout, BatchNorm
+from gradcam import Activation, Conv2D
+
+class VGG(mx.gluon.HybridBlock):
+    def __init__(self, layers, filters, classes=1000, batch_norm=False, **kwargs):
+        super(VGG, self).__init__(**kwargs)
+        assert len(layers) == len(filters)
+        with self.name_scope():
+            self.features = self._make_features(layers, filters, batch_norm)
+            self.features.add(Dense(4096, activation='relu',
+                                       weight_initializer='normal',
+                                       bias_initializer='zeros'))
+            self.features.add(Dropout(rate=0.5))
+            self.features.add(Dense(4096, activation='relu',
+                                       weight_initializer='normal',
+                                       bias_initializer='zeros'))
+            self.features.add(Dropout(rate=0.5))
+            self.output = Dense(classes,
+                                   weight_initializer='normal',
+                                   bias_initializer='zeros')
+
+    def _make_features(self, layers, filters, batch_norm):
+        featurizer = mx.gluon.nn.HybridSequential(prefix='')
+        for i, num in enumerate(layers):
+            for _ in range(num):
+                featurizer.add(Conv2D(filters[i], kernel_size=3, padding=1,
+                                         weight_initializer=Xavier(rnd_type='gaussian',
+                                                                   factor_type='out',
+                                                                   magnitude=2),
+                                         bias_initializer='zeros'))
+                if batch_norm:
+                    featurizer.add(BatchNorm())
+                featurizer.add(Activation('relu'))
+            featurizer.add(MaxPool2D(strides=2))
+        return featurizer
+
+    def hybrid_forward(self, F, x):
+        x = self.features(x)
+        x = self.output(x)
+        return x
+
+vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+            13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+            16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+            19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+
+def get_vgg(num_layers, pretrained=False, ctx=mx.cpu(),
+            root=os.path.join('~', '.mxnet', 'models'), **kwargs):
+    layers, filters = vgg_spec[num_layers]
+    net = VGG(layers, filters, **kwargs)
+    if pretrained:
+        from mxnet.gluon.model_zoo.model_store import get_model_file
+        batch_norm_suffix = '_bn' if kwargs.get('batch_norm') else ''
+        net.load_params(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix),
+                                       root=root), ctx=ctx)
+    return net
+
+def vgg16(**kwargs):
+    return get_vgg(16, **kwargs)
+
diff --git a/example/ctc/hyperparams.py b/example/ctc/hyperparams.py
index 7289d19c03f..cdcb874b3a6 100644
--- a/example/ctc/hyperparams.py
+++ b/example/ctc/hyperparams.py
@@ -29,7 +29,7 @@ def __init__(self):
         self._eval_epoch_size = 3000
         self._batch_size = 128
         self._num_epoch = 100
-        self._learning_rate = 0.001
+        self._learning_rate = 0.01
         self._momentum = 0.9
         self._num_label = 4
         # Network hyper parameters
diff --git a/example/gluon/embedding_learning/model.py b/example/gluon/embedding_learning/model.py
index 0f041bc1fc4..f82240e2cd5 100644
--- a/example/gluon/embedding_learning/model.py
+++ b/example/gluon/embedding_learning/model.py
@@ -108,9 +108,11 @@ def hybrid_forward(self, F, x):
         mask = np.ones(weights.shape)
         for i in range(0, n, k):
             mask[i:i+k, i:i+k] = 0
+        # Fallback: a uniform distribution over the samples allowed by the mask
+        mask_uniform_probs = mask * (1.0/(n-k))
 
         weights = weights * F.array(mask) * (distance < self.nonzero_loss_cutoff)
-        weights = weights / F.sum(weights, axis=1, keepdims=True)
+        weights_sum = F.sum(weights, axis=1, keepdims=True)
+        weights = weights / weights_sum
 
         a_indices = []
         p_indices = []
@@ -120,10 +122,11 @@ def hybrid_forward(self, F, x):
         for i in range(n):
             block_idx = i // k
 
-            try:
+            if weights_sum[i] != 0:
                 n_indices += np.random.choice(n, k-1, p=np_weights[i]).tolist()
-            except:
-                n_indices += np.random.choice(n, k-1).tolist()
+            else:
+                # all samples are above the cutoff so we sample uniformly
+                n_indices += np.random.choice(n, k-1, p=mask_uniform_probs[i]).tolist()
             for j in range(block_idx * k, (block_idx + 1) * k):
                 if j != i:
                     a_indices.append(i)
@@ -217,8 +220,11 @@ def hybrid_forward(self, F, anchors, positives, negatives, beta_in, a_indices=No
         pos_loss = F.maximum(d_ap - beta + self._margin, 0.0)
         neg_loss = F.maximum(beta - d_an + self._margin, 0.0)
 
-        pair_cnt = float(F.sum((pos_loss > 0.0) + (neg_loss > 0.0)).asscalar())
-
-        # Normalize based on the number of pairs.
-        loss = (F.sum(pos_loss + neg_loss) + beta_reg_loss) / pair_cnt
+        pair_cnt = F.sum((pos_loss > 0.0) + (neg_loss > 0.0))
+        if pair_cnt == 0.0:
+            # When pos_loss and neg_loss are both zero, the total loss is zero as well
+            loss = F.sum(pos_loss + neg_loss)
+        else:
+            # Normalize based on the number of pairs.
+            loss = (F.sum(pos_loss + neg_loss) + beta_reg_loss) / pair_cnt
         return gluon.loss._apply_weighting(F, loss, self._weight, None)
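
The replaced try/except relied on np.random.choice raising ValueError when a weight row
sums to zero (every candidate is above the nonzero-loss cutoff), which also masked any
unrelated error. A small NumPy sketch of the new fallback, with hypothetical sizes:

    import numpy as np

    n, k = 8, 2                            # batch of 8, 2 images per class block
    weights = np.zeros((n, n))             # all candidates above the cutoff
    mask = np.ones((n, n))
    for i in range(0, n, k):
        mask[i:i+k, i:i+k] = 0             # never sample negatives from the same block
    mask_uniform_probs = mask * (1.0 / (n - k))   # each row sums to exactly 1

    row = 0
    if weights[row].sum() != 0:
        p = weights[row] / weights[row].sum()
    else:
        p = mask_uniform_probs[row]        # uniform over the n-k out-of-block candidates
    negatives = np.random.choice(n, k - 1, p=p)
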
diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py
index 9412b6f9371..f5427feae2f 100755
--- a/example/image-classification/common/fit.py
+++ b/example/image-classification/common/fit.py
@@ -67,11 +67,8 @@ def _load_model(args, rank=0):
 def _save_model(args, rank=0):
     if args.model_prefix is None:
         return None
-    dst_dir = os.path.dirname(args.model_prefix)
-    if not os.path.isdir(dst_dir):
-        os.mkdir(dst_dir)
     return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % (
-        args.model_prefix, rank))
+        args.model_prefix, rank), period=args.save_period)
 
 
 def add_fit_args(parser):
@@ -111,6 +108,7 @@ def add_fit_args(parser):
                        help='show progress for every n batches')
     train.add_argument('--model-prefix', type=str,
                        help='model prefix')
+    train.add_argument('--save-period', type=int, default=1, help='params saving period')
     parser.add_argument('--monitor', dest='monitor', type=int, default=0,
                         help='log network parameters every N iters if larger than 0')
     train.add_argument('--load-epoch', type=int,
@@ -238,7 +236,7 @@ def fit(args, network, data_loader, **kwargs):
             # AlexNet will not converge using Xavier
             initializer = mx.init.Normal()
             # VGG will not trend to converge using Xavier-Gaussian
-        elif 'vgg' in args.network:
+        elif args.network and 'vgg' in args.network:
             initializer = mx.init.Xavier()
         else:
             initializer = mx.init.Xavier(
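
The new --save-period flag is forwarded to mx.callback.do_checkpoint, whose period
argument controls how often parameters are written. A minimal sketch:

    import mxnet as mx

    # Save 'model/resnet-XXXX.params' every 5 epochs instead of every epoch,
    # i.e. the equivalent of passing --save-period 5 to fit.py.
    checkpoint = mx.callback.do_checkpoint('model/resnet', period=5)
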
diff --git a/example/onnx/super_resolution.py b/example/onnx/super_resolution.py
index a52f1a892a6..fcb8ccc88ed 100644
--- a/example/onnx/super_resolution.py
+++ b/example/onnx/super_resolution.py
@@ -55,10 +55,8 @@ def get_test_image():
 
 def perform_inference(sym, arg_params, aux_params, input_img, img_cb, img_cr):
     """Perform inference on image using mxnet"""
-    # To fetch the data names of the input to the model we list the inputs of the symbol graph
-    # and exclude the argument and auxiliary parameters from the list
-    data_names = [graph_input for graph_input in sym.list_inputs()
-                  if graph_input not in arg_params and graph_input not in aux_params]
+    metadata = onnx_mxnet.get_model_metadata('super_resolution.onnx')
+    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
     # create module
     mod = mx.mod.Module(symbol=sym, data_names=data_names, label_names=None)
     mod.bind(for_training=False, data_shapes=[(data_names[0], input_img.shape)])
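
get_model_metadata reads the input/output signature directly from the ONNX file, so the
data names no longer have to be reverse-engineered from the symbol's inputs. A sketch,
with illustrative tensor names and shapes:

    from mxnet.contrib import onnx as onnx_mxnet

    sym, arg_params, aux_params = onnx_mxnet.import_model('super_resolution.onnx')
    metadata = onnx_mxnet.get_model_metadata('super_resolution.onnx')
    # e.g. {'input_tensor_data': [('1', (1, 1, 224, 224))], 'output_tensor_data': [...]}
    data_names = [name for name, _ in metadata['input_tensor_data']]
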
diff --git a/example/reinforcement-learning/dqn/README.md b/example/reinforcement-learning/dqn/README.md
index 58f7b56146f..fd32667a1f8 100644
Binary files a/example/reinforcement-learning/dqn/README.md and b/example/reinforcement-learning/dqn/README.md differ
diff --git a/example/reinforcement-learning/dqn/base.py b/example/reinforcement-learning/dqn/base.py
index 982ae17f86c..f3cd962ef5b 100644
--- a/example/reinforcement-learning/dqn/base.py
+++ b/example/reinforcement-learning/dqn/base.py
@@ -162,7 +162,7 @@ def save_params(self, dir_path="", epoch=None):
                                         params=self.params,
                                         aux_states=self.aux_states)
         misc_saving_path = save_misc(dir_path=dir_path, epoch=epoch, name=self.name,
-                                     content={'data_shapes': {k: map(int, v) for k, v in self.data_shapes.items()}})
+                                     content={'data_shapes': {k: list(map(int, v)) for k, v in self.data_shapes.items()}})
         logging.info('Saving %s, params: \"%s\", misc: \"%s\"',
                      self.name, param_saving_path, misc_saving_path)
 
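
On Python 3, map() returns a lazy iterator rather than a list, which breaks serialization
of the shapes (save_misc presumably writes the content out as JSON or similar). A quick
illustration of why the wrapping list() is needed:

    import json

    shapes = {'data': (128, 4, 84, 84)}
    # Python 2: map() returns a list, so json.dumps works either way.
    # Python 3: json.dumps({k: map(int, v) for k, v in shapes.items()}) raises
    #   TypeError: Object of type map is not JSON serializable
    print(json.dumps({k: list(map(int, v)) for k, v in shapes.items()}))
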
diff --git a/example/reinforcement-learning/dqn/dqn_demo.py b/example/reinforcement-learning/dqn/dqn_demo.py
old mode 100644
new mode 100755
index 8655d5cb55b..aef44f87ebf
--- a/example/reinforcement-learning/dqn/dqn_demo.py
+++ b/example/reinforcement-learning/dqn/dqn_demo.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -47,7 +49,7 @@ def main():
     parser.add_argument('-r', '--rom', required=False, type=str,
                         default=os.path.join('roms', 'breakout.bin'),
                         help='Path of the ROM File.')
-    parser.add_argument('-v', '--visualization', required=False, type=int, default=0,
+    parser.add_argument('-v', '--visualization', action='store_true',
                         help='Visualize the runs.')
     parser.add_argument('--lr', required=False, type=float, default=0.01,
                         help='Learning rate of the AdaGrad optimizer')
diff --git a/ci/docker/install/amzn_linux_opencv.sh b/example/reinforcement-learning/dqn/setup.sh
similarity index 62%
rename from ci/docker/install/amzn_linux_opencv.sh
rename to example/reinforcement-learning/dqn/setup.sh
index 956407e8362..3fcfacbe0a7 100755
--- a/ci/docker/install/amzn_linux_opencv.sh
+++ b/example/reinforcement-learning/dqn/setup.sh
@@ -17,17 +17,27 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
+set -e
+set -x
 
-set -ex
+pip install opencv-python
+pip install scipy
+
+# Install arcade learning environment
+sudo apt-get install libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake
+git clone git@github.com:mgbellemare/Arcade-Learning-Environment.git || true
 pushd .
-yum install -y python27 python27-setuptools
-git clone https://github.com/opencv/opencv
-cd opencv
+cd Arcade-Learning-Environment
 mkdir -p build
 cd build
-cmake -DBUILD_opencv_gpu=OFF -DWITH_EIGEN=ON -DWITH_TBB=ON -DWITH_CUDA=OFF -DWITH_1394=OFF \
--DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=/usr/local -GNinja ..
-ninja install
-popd
\ No newline at end of file
+cmake -DUSE_SDL=ON -DUSE_RLGLUE=OFF -DBUILD_EXAMPLES=ON -GNinja ..
+ninja
+cd ..
+pip install -e .
+popd
+cp Arcade-Learning-Environment/ale.cfg .
+
+# Copy roms
+git clone git@github.com:npow/atari.git || true
+cp -R atari/roms .
+
diff --git a/example/rnn/large_word_lm/model.py b/example/rnn/large_word_lm/model.py
index 7ee010efb71..3d3c83b631f 100644
--- a/example/rnn/large_word_lm/model.py
+++ b/example/rnn/large_word_lm/model.py
@@ -46,12 +46,11 @@ def cross_entropy_loss(inputs, labels, rescale_loss=1):
 
 def rnn(bptt, vocab_size, num_embed, nhid, num_layers, dropout, num_proj, batch_size):
     """ word embedding + LSTM Projected """
-    embed = mx.sym.contrib.SparseEmbedding
     state_names = []
     data = S.var('data')
     weight = S.var("encoder_weight", stype='row_sparse')
-    embed = embed(data=data, weight=weight, input_dim=vocab_size,
-                  output_dim=num_embed, name='embed', deterministic=True)
+    embed = S.sparse.Embedding(data=data, weight=weight, input_dim=vocab_size,
+                               output_dim=num_embed, name='embed', sparse_grad=True)
     states = []
     outputs = S.Dropout(embed, p=dropout)
     for i in range(num_layers):
@@ -78,7 +77,6 @@ def sampled_softmax(num_classes, num_samples, in_dim, inputs, weight, bias,
             This under-estimates the full softmax and is only used for training.
         """
         # inputs = (n, in_dim)
-        embed = mx.sym.contrib.SparseEmbedding
         sample, prob_sample, prob_target = sampled_values
 
         # (num_samples, )
@@ -90,12 +88,13 @@ def sampled_softmax(num_classes, num_samples, in_dim, inputs, weight, bias,
         sample_label = S.concat(sample, label, dim=0)
         # lookup weights and biases
         # (num_samples+n, dim)
-        sample_target_w = embed(data=sample_label, weight=weight,
-                                     input_dim=num_classes, output_dim=in_dim,
-                                     deterministic=True)
+        sample_target_w = S.sparse.Embedding(data=sample_label, weight=weight,
+                                             input_dim=num_classes, output_dim=in_dim,
+                                             sparse_grad=True)
         # (num_samples+n, 1)
-        sample_target_b = embed(data=sample_label, weight=bias,
-                                input_dim=num_classes, output_dim=1, deterministic=True)
+        sample_target_b = S.sparse.Embedding(data=sample_label, weight=bias,
+                                             input_dim=num_classes, output_dim=1,
+                                             sparse_grad=True)
         # (num_samples, dim)
         sample_w = S.slice(sample_target_w, begin=(0, 0), end=(num_samples, None))
         target_w = S.slice(sample_target_w, begin=(num_samples, 0), end=(None, None))
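
mx.sym.contrib.SparseEmbedding(..., deterministic=True) is replaced throughout by the
stock Embedding operator exposed under the sparse namespace, with sparse_grad=True
requesting a row_sparse gradient. A minimal sketch:

    import mxnet as mx

    data = mx.sym.var('data')
    weight = mx.sym.var('weight', stype='row_sparse')
    embed = mx.sym.sparse.Embedding(data=data, weight=weight,
                                    input_dim=1000, output_dim=16,
                                    sparse_grad=True)
    # The gradient w.r.t. weight is row_sparse, touching only the looked-up rows.
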
diff --git a/example/sparse/factorization_machine/README.md b/example/sparse/factorization_machine/README.md
index 7609f31d5c5..32b956ed020 100644
--- a/example/sparse/factorization_machine/README.md
+++ b/example/sparse/factorization_machine/README.md
@@ -11,6 +11,6 @@ It takes more than 30 GB to download and extract the dataset.
 
 ## Train the Model
 
-- python train.py --train-data /path/to/criteo.kaggle2014.test.svm --test-data /path/to/criteo.kaggle2014.test.svm
+- python train.py --data-train /path/to/criteo.kaggle2014.test.svm --data-test /path/to/criteo.kaggle2014.test.svm
 
 [Rendle, Steffen. "Factorization machines." In Data Mining (ICDM), 2010 IEEE 10th International Conference on, pp. 995-1000. IEEE, 2010. ](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
diff --git a/example/sparse/factorization_machine/train.py b/example/sparse/factorization_machine/train.py
index 741cf958db6..af3d60b59a0 100644
--- a/example/sparse/factorization_machine/train.py
+++ b/example/sparse/factorization_machine/train.py
@@ -58,6 +58,7 @@
 parser.add_argument('--kvstore', type=str, default='local',
                     help='what kvstore to use', choices=["dist_async", "local"])
 
+
 if __name__ == '__main__':
     import logging
     head = '%(asctime)-15s %(message)s'
@@ -75,6 +76,16 @@
     assert(args.data_train is not None and args.data_test is not None), \
           "dataset for training or test is missing"
 
+    def batch_row_ids(data_batch):
+        """ Generate row ids based on the current mini-batch """
+        idx = data_batch.data[0].indices
+        return {'w': idx, 'v': idx}
+
+    def all_row_ids(data_batch):
+        """ Generate row ids for all rows """
+        all_rows = mx.nd.arange(0, num_features, dtype='int64')
+        return {'w': all_rows, 'v': all_rows}
+
     # create kvstore
     kv = mx.kvstore.create(kvstore)
     # data iterator
@@ -102,12 +113,6 @@
     metric = mx.metric.create(['log_loss'])
     speedometer = mx.callback.Speedometer(batch_size, log_interval)
 
-    # get the sparse weight parameter
-    w_index = mod._exec_group.param_names.index('w')
-    w_param = mod._exec_group.param_arrays[w_index]
-    v_index = mod._exec_group.param_names.index('v')
-    v_param = mod._exec_group.param_arrays[v_index]
-
     logging.info('Training started ...')
     train_iter = iter(train_data)
     eval_iter = iter(eval_data)
@@ -118,9 +123,7 @@
             nbatch += 1
             # manually pull sparse weights from kvstore so that _square_sum
             # only computes the rows necessary
-            row_ids = batch.data[0].indices
-            kv.row_sparse_pull('w', w_param, row_ids=[row_ids], priority=-w_index)
-            kv.row_sparse_pull('v', v_param, row_ids=[row_ids], priority=-v_index)
+            mod.prepare(batch, sparse_row_id_fn=batch_row_ids)
             mod.forward_backward(batch)
             # update all parameters (including the weight parameter)
             mod.update()
@@ -131,8 +134,7 @@
             speedometer(speedometer_param)
 
         # pull all updated rows before validation
-        kv.row_sparse_pull('w', w_param, row_ids=[row_ids], priority=-w_index)
-        kv.row_sparse_pull('v', v_param, row_ids=[row_ids], priority=-v_index)
+        mod.prepare(None, all_row_ids)
         # evaluate metric on validation dataset
         score = mod.score(eval_iter, ['log_loss'])
         logging.info("epoch %d, eval log loss = %s" % (epoch, score[0][1]))
diff --git a/example/sparse/linear_classification/train.py b/example/sparse/linear_classification/train.py
index cde40dd0ed3..4d60efbaf4f 100644
--- a/example/sparse/linear_classification/train.py
+++ b/example/sparse/linear_classification/train.py
@@ -46,7 +46,7 @@
 
 def batch_row_ids(data_batch):
     """ Generate row ids based on the current mini-batch """
-    return {'weight': batch.data[0].indices}
+    return {'weight': data_batch.data[0].indices}
 
 def all_row_ids(data_batch):
     """ Generate row ids for all rows """
diff --git a/example/sparse/matrix_factorization/README.md b/example/sparse/matrix_factorization/README.md
index 5c4beef45fb..ddbf662c858 100644
--- a/example/sparse/matrix_factorization/README.md
+++ b/example/sparse/matrix_factorization/README.md
@@ -1,6 +1,6 @@
 Matrix Factorization w/ Sparse Embedding
 ===========
-The example demonstrates the basic usage of the SparseEmbedding operator in MXNet, adapted based on @leopd's recommender examples.
+This example demonstrates the basic usage of the sparse.Embedding operator in MXNet, adapted from @leopd's recommender examples.
 This is for demonstration purpose only.
 
 ```
diff --git a/example/sparse/matrix_factorization/model.py b/example/sparse/matrix_factorization/model.py
index 672c39270c5..68dd4d61cf1 100644
--- a/example/sparse/matrix_factorization/model.py
+++ b/example/sparse/matrix_factorization/model.py
@@ -19,37 +19,30 @@
 
 def matrix_fact_net(factor_size, num_hidden, max_user, max_item, dense):
     # input
-    user = mx.symbol.Variable('user')
-    item = mx.symbol.Variable('item')
-    score = mx.symbol.Variable('score')
+    user = mx.sym.Variable('user')
+    item = mx.sym.Variable('item')
+    score = mx.sym.Variable('score')
     stype = 'default' if dense else 'row_sparse'
-    user_weight = mx.symbol.Variable('user_weight', stype=stype)
-    item_weight = mx.symbol.Variable('item_weight', stype=stype)
-    if not dense:
-        embed = mx.symbol.contrib.SparseEmbedding
-        # user feature lookup
-        user = embed(data=user, weight=user_weight,
-                     input_dim=max_user, output_dim=factor_size, deterministic=True)
-        # item feature lookup
-        item = embed(data=item, weight=item_weight,
-                     input_dim=max_item, output_dim=factor_size, deterministic=True)
-    else:
-        # user feature lookup
-        user = mx.symbol.Embedding(data=user, weight=user_weight,
-                                   input_dim=max_user, output_dim=factor_size)
-        # item feature lookup
-        item = mx.symbol.Embedding(data=item, weight=item_weight,
-                                   input_dim=max_item, output_dim=factor_size)
+    sparse_grad = not dense
+    user_weight = mx.sym.Variable('user_weight', stype=stype)
+    item_weight = mx.sym.Variable('item_weight', stype=stype)
+    # user feature lookup
+    user = mx.sym.Embedding(data=user, weight=user_weight, sparse_grad=sparse_grad,
+                            input_dim=max_user, output_dim=factor_size)
+    # item feature lookup
+    item = mx.sym.Embedding(data=item, weight=item_weight, sparse_grad=sparse_grad,
+                            input_dim=max_item, output_dim=factor_size)
+
     # non-linear transformation of user features
-    user = mx.symbol.Activation(data=user, act_type='relu')
-    user_act = mx.symbol.FullyConnected(data=user, num_hidden=num_hidden)
+    user = mx.sym.Activation(data=user, act_type='relu')
+    user_act = mx.sym.FullyConnected(data=user, num_hidden=num_hidden)
     # non-linear transformation of item features
-    item = mx.symbol.Activation(data=item, act_type='relu')
-    item_act = mx.symbol.FullyConnected(data=item, num_hidden=num_hidden)
+    item = mx.sym.Activation(data=item, act_type='relu')
+    item_act = mx.sym.FullyConnected(data=item, num_hidden=num_hidden)
     # predict by the inner product, which is elementwise product and then sum
     pred = user_act * item_act
-    pred = mx.symbol.sum(data=pred, axis=1)
-    pred = mx.symbol.Flatten(data=pred)
+    pred = mx.sym.sum(data=pred, axis=1)
+    pred = mx.sym.Flatten(data=pred)
     # loss layer
-    pred = mx.symbol.LinearRegressionOutput(data=pred, label=score)
+    pred = mx.sym.LinearRegressionOutput(data=pred, label=score)
     return pred
diff --git a/example/sparse/wide_deep/model.py b/example/sparse/wide_deep/model.py
index b90745599c1..37c83fb647d 100644
--- a/example/sparse/wide_deep/model.py
+++ b/example/sparse/wide_deep/model.py
@@ -44,8 +44,8 @@ def wide_deep_model(num_linear_features, num_embed_features, num_cont_features,
 
     for i, embed in enumerate(embeds):
         embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse')
-        features.append(mx.symbol.contrib.SparseEmbedding(data=embed, weight=embed_weight,
-                        input_dim=input_dims[i], output_dim=hidden_units[0]))
+        features.append(mx.symbol.sparse.Embedding(data=embed, weight=embed_weight,
+                        input_dim=input_dims[i], output_dim=hidden_units[0], sparse_grad=True))
 
     hidden = mx.symbol.concat(*features, dim=1)
     hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1])
diff --git a/example/ssd/README.md b/example/ssd/README.md
index 0b970923e44..55387c5fd2d 100644
--- a/example/ssd/README.md
+++ b/example/ssd/README.md
@@ -17,6 +17,7 @@ remarkable traits of MXNet.
 Due to the permission issue, this example is maintained in this [repository](https://github.com/zhreshold/mxnet-ssd) separately. You can use the link regarding specific per example [issues](https://github.com/zhreshold/mxnet-ssd/issues).
 
 ### What's new
+* Added live camera capture and detection display (run with --camera flag)
 * Added multiple trained models.
 * Added a much simpler way to compose network from mainstream classification networks (resnet, inception...) and [Guide](symbol/README.md).
 * Update to the latest version according to caffe version, with 5% mAP increase.
@@ -84,6 +85,13 @@ python demo.py --cpu --network resnet50 --data-shape 512
 ```
 * Check `python demo.py --help` for more options.
 
+### Live Camera detection
+
+Use `init.sh` to download the trained model.
+You can use `./demo.py --camera` to run detection on an OpenCV video capture device such as a
+webcam. This opens a window that displays the camera output together with the detections. You can
+adjust the detection threshold (`--thresh`) to get more or fewer detections.
+
 ### Train the model
 This example only covers training on Pascal VOC dataset. Other datasets should
 be easily supported by adding subclass derived from class `Imdb` in `dataset/imdb.py`.
diff --git a/example/ssd/benchmark_score.py b/example/ssd/benchmark_score.py
new file mode 100644
index 00000000000..6af1b217e21
--- /dev/null
+++ b/example/ssd/benchmark_score.py
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import os
+import sys
+import argparse
+import importlib
+import mxnet as mx
+import time
+import logging
+
+from symbol.symbol_factory import get_symbol
+from symbol.symbol_factory import get_symbol_train
+from symbol import symbol_builder
+
+
+parser = argparse.ArgumentParser(description='MXNet SSD benchmark')
+parser.add_argument('--network', '-n', type=str, default='vgg16_reduced')
+parser.add_argument('--batch_size', '-b', type=int, default=0)
+parser.add_argument('--shape', '-w', type=int, default=300)
+parser.add_argument('--class_num', '-class', type=int, default=20)
+
+
+def get_data_shapes(batch_size, image_shape=(3, 300, 300)):
+    return [('data', (batch_size,)+image_shape)]
+
+def get_data(batch_size, image_shape=(3, 300, 300)):
+    data_shapes = get_data_shapes(batch_size, image_shape)
+    data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in data_shapes]
+    batch = mx.io.DataBatch(data, [])
+    return batch
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    network = args.network
+    image_shape = args.shape
+    num_classes = args.class_num
+    b = args.batch_size
+    supported_image_shapes = [300, 512]
+    supported_networks = ['vgg16_reduced', 'inceptionv3', 'resnet50']
+
+    if network not in supported_networks:
+        raise Exception(network + " is not supported")
+
+    if image_shape not in supported_image_shapes:
+       raise Exception("Image shape should be either 300*300 or 512*512!")
+
+    if b == 0:
+        batch_sizes = [1, 2, 4, 8, 16, 32]
+    else:
+        batch_sizes = [b]
+
+    data_shape = (3, image_shape, image_shape)
+    net = get_symbol(network, data_shape[1], num_classes=num_classes,
+                     nms_thresh=0.4, force_suppress=True)
+    
+    num_batches = 100
+    dry_run = 5   # use 5 iterations to warm up
+    
+    for bs in batch_sizes:
+        batch = get_data(bs, data_shape)
+        mod = mx.mod.Module(net, label_names=None, context=mx.cpu())
+        mod.bind(for_training = False,
+                 inputs_need_grad = False,
+                 data_shapes = get_data_shapes(bs, data_shape))
+        mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+
+        for i in range(dry_run + num_batches):
+            if i == dry_run:
+                tic = time.time()
+            mod.forward(batch, is_train=False)
+            for output in mod.get_outputs():
+                output.wait_to_read()
+
+        avg_time = (time.time() - tic) / num_batches
+        fps = bs / avg_time
+        print("SSD-" + network + " with " + str(num_classes) + " classes and shape " + str(data_shape))
+        print("batchsize=" + str(bs) + " " + str(1000*avg_time) + " ms")
+        print("batchsize=" + str(bs) + " " + str(fps) + " imgs/s")
diff --git a/example/ssd/dataset/cv2Iterator.py b/example/ssd/dataset/cv2Iterator.py
new file mode 100644
index 00000000000..469faeac828
--- /dev/null
+++ b/example/ssd/dataset/cv2Iterator.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import numpy as np
+import cv2
+
+
+class CameraIterator():
+    """
+    An iterator that captures frames with opencv or the specified capture
+    """
+    def __init__(self, capture=None, frame_resize=None):
+        # Open the default camera lazily; a cv2.VideoCapture default argument
+        # would be evaluated once at import time and shared across instances.
+        self._capture = capture if capture is not None else cv2.VideoCapture(0)
+        self._frame_resize = frame_resize
+        if frame_resize:
+            assert isinstance(frame_resize, tuple) and (len(frame_resize) == 2), "frame_resize should be a tuple of (x,y)"
+            self._frame_shape = (1, 3, frame_resize[0], frame_resize[1])
+        else:
+            self._frame_shape = (1, 3,
+                int(self._capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
+                int(self._capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        ret, frame = self._capture.read()
+        if cv2.waitKey(1) & 0xFF == ord('q') or ret is not True:
+            raise StopIteration
+        if self._frame_resize:
+            frame = cv2.resize(frame, (self._frame_resize[0], self._frame_resize[1]))
+        return frame
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def close(self):
+        self._capture.release()
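
Since __enter__ returns self, the iterator can be used as a context manager so the
capture device is always released. A usage sketch:

    import cv2
    from dataset.cv2Iterator import CameraIterator

    with CameraIterator(frame_resize=(300, 300)) as camera:
        for frame in camera:            # stops on capture failure or the 'q' key
            cv2.imshow('frame', frame)
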
diff --git a/example/ssd/demo.py b/example/ssd/demo.py
index 0480bdd658b..4ae8b350742 100755
--- a/example/ssd/demo.py
+++ b/example/ssd/demo.py
@@ -25,6 +25,9 @@
 import sys
 from detect.detector import Detector
 from symbol.symbol_factory import get_symbol
+from dataset.cv2Iterator import CameraIterator
+import logging
+import cv2
 
 def get_detector(net, prefix, epoch, data_shape, mean_pixels, ctx, num_class,
                  nms_thresh=0.5, force_nms=True, nms_topk=400):
@@ -72,6 +75,8 @@ def parse_args():
                         type=str, nargs='?')
     parser.add_argument('--epoch', dest='epoch', help='epoch of trained model',
                         default=0, type=int)
+    parser.add_argument('--batch-size', dest='batch_size', help='batch size',
+                        default=1, type=int)
     parser.add_argument('--prefix', dest='prefix', help='trained model prefix',
                         default=os.path.join(os.getcwd(), 'model', 'ssd_'),
                         type=str)
@@ -102,6 +107,8 @@ def parse_args():
                         car, cat, chair, cow, diningtable, dog, horse, motorbike, \
                         person, pottedplant, sheep, sofa, train, tvmonitor',
                         help='string of comma separated names, or text filename')
+    parser.add_argument('--camera', action='store_true',
+                        help="use camera for image capturing")
     args = parser.parse_args()
     return args
 
@@ -131,13 +138,59 @@ def parse_data_shape(data_shape_str):
         raise ValueError("Unexpected data_shape: %s", data_shape_str)
     return data_shape
 
-if __name__ == '__main__':
-    args = parse_args()
-    if args.cpu:
-        ctx = mx.cpu()
-    else:
-        ctx = mx.gpu(args.gpu_id)
+def draw_detection(frame, det, class_names):
+    (klass, score, x0, y0, x1, y1) = det
+    klass_name = class_names[int(klass)]
+    h = frame.shape[0]
+    w = frame.shape[1]
+    # denormalize detections from [0,1] to the frame size
+    p0 = tuple(map(int, (x0*w,y0*h)))
+    p1 = tuple(map(int, (x1*w,y1*h)))
+    logging.info("detection: %s %s", klass_name, score)
+    cv2.rectangle(frame, p0, p1, (0,0,255), 2)
+    # Where to draw the text, a few pixels above the top y coordinate
+    tp0 = (p0[0], p0[1]-5)
+    draw_text = "{} {}".format(klass_name, score)
+    cv2.putText(frame, draw_text, tp0, cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0,0,255))
 
+
+def network_path(prefix, network, data_shape):
+    return "{}{}_{}".format(prefix, network, data_shape)
+
+def run_camera(args, ctx):
+    assert args.batch_size == 1, "only batch size of 1 is supported"
+    logging.info("Detection threshold is {}".format(args.thresh))
+    camera_iter = CameraIterator()
+    class_names = parse_class_names(args.class_names)
+    mean_pixels = (args.mean_r, args.mean_g, args.mean_b)
+    data_shape = int(args.data_shape)
+    batch_size = int(args.batch_size)
+    detector = Detector(
+        get_symbol(args.network, data_shape, num_classes=len(class_names)),
+        network_path(args.prefix, args.network, data_shape),
+        args.epoch,
+        data_shape,
+        mean_pixels,
+        batch_size,
+        ctx
+    )
+    for frame in camera_iter:
+        logging.info("Frame info: shape %s type %s", frame.shape, frame.dtype)
+        logging.info("Generating batch")
+        data_batch = detector.create_batch(frame)
+        logging.info("Detecting objects")
+        detections_batch = detector.detect_batch(data_batch)
+        detections = detections_batch[0]
+        logging.info("%d detections", len(detections))
+        for det in detections:
+            obj = det.asnumpy()
+            (klass, score, x0, y0, x1, y1) = obj
+            if score > args.thresh:
+                draw_detection(frame, obj, class_names)
+        cv2.imshow('frame', frame)
+
+def run_images(args, ctx):
     # parse image list
     image_list = [i.strip() for i in args.images.split(',')]
     assert len(image_list) > 0, "No valid image specified to detect"
@@ -156,3 +209,22 @@ def parse_data_shape(data_shape_str):
     # run detection
     detector.detect_and_visualize(image_list, args.dir, args.extension,
                                   class_names, args.thresh, args.show_timer)
+
+def main():
+    logging.getLogger().setLevel(logging.INFO)
+    logging.basicConfig(format='%(asctime)-15s %(message)s')
+    args = parse_args()
+    if args.cpu:
+        ctx = mx.cpu()
+    else:
+        ctx = mx.gpu(args.gpu_id)
+
+    if args.camera:
+        run_camera(args, ctx)
+    else:
+        run_images(args, ctx)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
+
diff --git a/example/ssd/detect/detector.py b/example/ssd/detect/detector.py
index 8d72eaa8669..1b5e8cb76ee 100644
--- a/example/ssd/detect/detector.py
+++ b/example/ssd/detect/detector.py
@@ -21,6 +21,9 @@
 from dataset.testdb import TestDB
 from dataset.iterator import DetIter
 import logging
+import cv2
+from mxnet.io import DataBatch, DataDesc
+
 
 class Detector(object):
     """
@@ -58,8 +61,25 @@ def __init__(self, symbol, model_prefix, epoch, data_shape, mean_pixels, \
         self.mod.bind(data_shapes=[('data', (batch_size, 3, data_shape[0], data_shape[1]))])
         self.mod.set_params(args, auxs)
         self.mean_pixels = mean_pixels
+        self.mean_pixels_nd = mx.nd.array(mean_pixels).reshape((3,1,1))
 
-    def detect(self, det_iter, show_timer=False):
+    def create_batch(self, frame):
+        """
+        :param frame: a (w,h,channels) numpy array (image)
+        :return: DataBatch of (1,channels,data_shape,data_shape)
+        """
+        frame_resize = mx.nd.array(cv2.resize(frame, (self.data_shape[0], self.data_shape[1])))
+        #frame_resize = mx.img.imresize(frame, self.data_shape[0], self.data_shape[1], cv2.INTER_LINEAR)
+        # Change dimensions from (w,h,channels) to (channels, w, h)
+        frame_t = mx.nd.transpose(frame_resize, axes=(2,0,1))
+        frame_norm = frame_t - self.mean_pixels_nd
+        # Add dimension for batch, results in (1,channels,w,h)
+        batch_frame = [mx.nd.expand_dims(frame_norm, axis=0)]
+        batch_shape = [DataDesc('data', batch_frame[0].shape)]
+        batch = DataBatch(data=batch_frame, provide_data=batch_shape)
+        return batch
+
+    def detect_iter(self, det_iter, show_timer=False):
         """
         detect all images in iterator
 
@@ -86,6 +106,17 @@ def detect(self, det_iter, show_timer=False):
         result = Detector.filter_positive_detections(detections)
         return result
 
+    def detect_batch(self, batch):
+        """
+        Return detections for batch
+        :param batch:
+        :return:
+        """
+        self.mod.forward(batch, is_train=False)
+        detections = self.mod.get_outputs()[0]
+        positive_detections = Detector.filter_positive_detections(detections)
+        return positive_detections
+
     def im_detect(self, im_list, root_dir=None, extension=None, show_timer=False):
         """
         wrapper for detecting multiple images
@@ -108,7 +139,7 @@ def im_detect(self, im_list, root_dir=None, extension=None, show_timer=False):
         test_db = TestDB(im_list, root_dir=root_dir, extension=extension)
         test_iter = DetIter(test_db, 1, self.data_shape, self.mean_pixels,
                             is_train=False)
-        return self.detect(test_iter, show_timer)
+        return self.detect_iter(test_iter, show_timer)
 
     def visualize_detection(self, img, dets, classes=[], thresh=0.6):
         """
@@ -197,7 +228,6 @@ def detect_and_visualize(self, im_list, root_dir=None, extension=None,
         ----------
 
         """
-        import cv2
         dets = self.im_detect(im_list, root_dir, extension, show_timer=show_timer)
         if not isinstance(im_list, list):
             im_list = [im_list]
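
create_batch is the whole camera-to-network preprocessing path. A standalone sketch of
the same steps, assuming a 300x300 detector and illustrative BGR mean pixels:

    import cv2
    import mxnet as mx
    import numpy as np

    frame = np.zeros((480, 640, 3), dtype=np.uint8)         # stand-in camera frame
    mean_pixels_nd = mx.nd.array((123, 117, 104)).reshape((3, 1, 1))

    resized = mx.nd.array(cv2.resize(frame, (300, 300)))    # (300, 300, 3)
    chw = mx.nd.transpose(resized, axes=(2, 0, 1))          # (3, 300, 300)
    normalized = chw - mean_pixels_nd                       # subtract per-channel mean
    batch_data = mx.nd.expand_dims(normalized, axis=0)      # (1, 3, 300, 300)
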
diff --git a/example/ssd/init.sh b/example/ssd/init.sh
index 53104bb7d78..863ba871da8 100755
--- a/example/ssd/init.sh
+++ b/example/ssd/init.sh
@@ -53,6 +53,7 @@ function download_demo_images() {
     popd
 }
 
-download_pascal_voc
+# Uncomment to download training dataset
+#download_pascal_voc
 download_model
 download_demo_images
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 38fd7edd1f3..a652fe5b707 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -102,7 +102,7 @@
 /*! \brief major version */
 #define MXNET_MAJOR 1
 /*! \brief minor version */
-#define MXNET_MINOR 2
+#define MXNET_MINOR 3
 /*! \brief patch version */
 #define MXNET_PATCH 0
 /*! \brief mxnet version */
@@ -217,6 +217,11 @@ struct Context {
    * \return GPU Context. -1 for current GPU.
    */
   inline static Context GPU(int32_t dev_id = -1);
+  /*!
+   * \brief Get the number of GPUs available.
+   * \return The number of GPUs that are available.
+   */
+  inline static int32_t GetGPUCount();
   /*!
    * Create a pinned CPU context.
    * \param dev_id the device id for corresponding GPU.
@@ -307,6 +312,20 @@ inline Context Context::GPU(int32_t dev_id) {
   return Create(kGPU, dev_id);
 }
 
+inline int32_t Context::GetGPUCount() {
+#if MXNET_USE_CUDA
+  int32_t count;
+  cudaError_t e = cudaGetDeviceCount(&count);
+  if (e == cudaErrorNoDevice) {
+    return 0;
+  }
+  CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
+  return count;
+#else
+  return 0;
+#endif
+}
+
 inline Context Context::FromString(const std::string& str) {
   Context ret;
   try {
@@ -365,7 +384,10 @@ constexpr size_t kMKLDNNAlign = 64;
 namespace std {
 template<> struct hash<mxnet::Context> {
   size_t operator()(const mxnet::Context& ctx) const {
-    return (static_cast<size_t>(ctx.dev_type) << 32) | ctx.dev_id;
+    size_t res = 0;
+    res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_type));
+    res = dmlc::HashCombine(res, static_cast<size_t>(ctx.dev_id));
+    return res;
   }
 };
 }
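
The old hash shifted dev_type left by 32 bits, which discards the device type entirely
wherever size_t is 32 bits wide, so contexts with equal dev_id would collide. An
illustrative Python analogue of boost-style hash mixing; the constant and shifts are an
assumption about what dmlc::HashCombine does, not copied from it:

    def hash_combine(seed, value):
        # mix both fields into the seed regardless of word width
        return (seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2))) & (2**64 - 1)

    dev_type, dev_id = 2, 0   # hypothetical kGPU context on device 0
    ctx_hash = hash_combine(hash_combine(0, dev_type), dev_id)
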
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 3f040515c2f..be47c3c14fa 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -383,6 +383,13 @@ MXNET_DLL int MXSetNumOMPThreads(int thread_num);
  */
 MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size);
 
+/*!
+ * \brief Get the number of GPUs.
+ * \param pointer to int that will hold the number of GPUs available.
+ * \return 0 when success, -1 when failure happens.
+ */
+MXNET_DLL int MXGetGPUCount(int* out);
+
 /*!
  * \brief get the MXNet library version as an integer
  * \param pointer to the integer holding the version number
@@ -663,6 +670,7 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle,
 MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle,
                                  int ndim,
                                  dim_t *dims,
+                                 bool reverse,
                                  NDArrayHandle *out);
 /*!
  * \brief get the shape of the array
@@ -1646,6 +1654,47 @@ MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle,
                                    NDArrayHandle** aux_states,
                                    ExecutorHandle shared_exec_handle,
                                    ExecutorHandle* out);
+
+/*!
+ * \brief Return a new executor with the same symbol and shared memory,
+ * but different input/output shapes.
+ *
+ * \param partial_shaping Whether to allow changing the shape of unspecified arguments.
+ * \param allow_up_sizing Whether to allow allocating new ndarrays that are larger than the originals.
+ * \param dev_type device type of default context
+ * \param dev_id device id of default context
+ * \param num_map_keys size of group2ctx map
+ * \param map_keys keys of group2ctx map
+ * \param map_dev_types device type of group2ctx map
+ * \param map_dev_ids device id of group2ctx map
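+ * \param num_provided_arg_shapes number of provided argument shapes
+ * \param provided_arg_shape_names names of the provided argument shapes
+ * \param provided_arg_shape_data flattened shape data for the provided arguments
+ * \param provided_arg_shape_idx index into the flattened shape data per argument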
+ * \param num_in_args length of in_args
+ * \param in_args in args array
+ * \param arg_grads arg grads handle array
+ * \param num_aux_states length of auxiliary states
+ * \param aux_states auxiliary states array
+ * \param shared_exec input executor handle for memory sharing
+ * \param out output executor handle
+ * \return a new executor
+ */
+MXNET_DLL int MXExecutorReshape(int partial_shaping,
+                                int allow_up_sizing,
+                                int dev_type,
+                                int dev_id,
+                                mx_uint num_map_keys,
+                                const char** map_keys,
+                                const int* map_dev_types,
+                                const int* map_dev_ids,
+                                const mx_uint num_provided_arg_shapes,
+                                const char** provided_arg_shape_names,
+                                const mx_uint* provided_arg_shape_data,
+                                const mx_uint* provided_arg_shape_idx,
+                                mx_uint* num_in_args,
+                                NDArrayHandle** in_args,
+                                NDArrayHandle** arg_grads,
+                                mx_uint* num_aux_states,
+                                NDArrayHandle** aux_states,
+                                ExecutorHandle shared_exec,
+                                ExecutorHandle *out);
 /*!
  * \brief set a call back to notify the completion of operation
  */
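
A hypothetical ctypes call to the new MXGetGPUCount entry point, using mxnet's own loaded
library handle; treat this as illustration only, a friendlier Python wrapper may exist:

    import ctypes
    from mxnet.base import _LIB, check_call

    count = ctypes.c_int()
    check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
    print('GPUs visible to MXNet:', count.value)
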
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index d749100f5de..842653f8653 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -103,6 +103,29 @@ class Executor {
    * \return aux state map in the executor.
    */
   virtual const std::unordered_map<std::string, NDArray>& aux_state_map() const = 0;
+  /*!
+   * \brief Return a new executor with the same symbol and shared memory,
+   *  but different input/output shapes.
+   *
+   * \param partial_shaping Whether to allow changing the shape of unspecified arguments.
+   * \param allow_up_sizing Whether to allow allocating new ndarrays that are larger than the originals.
+   * \param default_ctx the default context of binding.
+   * \param ctx_map Context mapping group to context.
+   * \param provided_arg_shapes New shape for arguments.
+   * \param in_args the NDArray that stores the input arguments.
+   * \param arg_grads NDArray that is used to store the gradient output of the input arguments.
+   * \param aux_states NDArray that is used as internal states.
+   * \return a new executor.
+   */
+  virtual Executor* Reshape(const bool partial_shaping,
+                            const bool allow_up_sizing,
+                            const Context& default_ctx,
+                            const std::map<std::string, Context>& ctx_map,
+                            const std::unordered_map<std::string, TShape>&
+                              provided_arg_shapes,
+                            std::vector<NDArray>* in_args,
+                            std::vector<NDArray>* arg_grads,
+                            std::vector<NDArray>* aux_states) = 0;
   /*!
    * \brief Create an operator by bind symbol with context and arguments.
    *  If user do not want to compute the gradients of i-th argument, grad_req_type[i] can be kNullOp.
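
Presumably this is what the Python-side Executor.reshape calls into: re-binding to new
shapes while sharing memory with the original executor. A sketch:

    import mxnet as mx

    data = mx.sym.var('data')
    net = mx.sym.FullyConnected(data, num_hidden=10)
    exe = net.simple_bind(mx.cpu(), data=(32, 100))
    # Rebind to a larger batch; allow_up_sizing permits arrays bigger than the originals.
    bigger = exe.reshape(allow_up_sizing=True, data=(64, 100))
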
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 6fda8c37b41..e243eb71c47 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -678,10 +678,7 @@ class NDArray {
    */
   NDArray Reorder2Default() const;
 
-  void InvalidateMKLDNNData() {
-    // Removing mkl_mem_ means the NDArray will store data in the default format.
-    ptr_->mkl_mem_ = nullptr;
-  }
+  void InvalidateMKLDNNData();
 
   /*
    * This function is used inside operators to reshape an array.
diff --git a/mkldnn.mk b/mkldnn.mk
new file mode 100644
index 00000000000..a89e3e635af
--- /dev/null
+++ b/mkldnn.mk
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+ifeq ($(USE_MKLDNN), 1)
+    MKLDNN_ROOTDIR = $(ROOTDIR)/3rdparty/mkldnn
+    MKLDNN_BUILDDIR = $(MKLDNN_ROOTDIR)/build
+    MKLDNN_INSTALLDIR = $(MKLDNN_ROOTDIR)/install
+    MKLDNN_LIBDIR = $(ROOTDIR)/lib
+ifeq ($(UNAME_S), Darwin)
+    OMP_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libiomp5.dylib
+    MKLML_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libmklml.dylib
+    MKLDNN_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libmkldnn.0.dylib
+else
+    OMP_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libiomp5.so
+    MKLML_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libmklml_intel.so
+    MKLDNN_LIBFILE = $(MKLDNN_INSTALLDIR)/lib/libmkldnn.so.0
+endif
+endif
+
+.PHONY: mkldnn mkldnn_build mkldnn_clean mkldnn_lib_sync
+
+mkldnn_build: $(MKLDNN_INSTALLDIR)/lib/libmkldnn.so
+
+$(MKLDNN_INSTALLDIR)/lib/libmkldnn.so:
+	mkdir -p $(MKLDNN_INSTALLDIR)
+	cd $(MKLDNN_ROOTDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNN_INSTALLDIR)/.
+	cmake $(MKLDNN_ROOTDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
+	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
+	$(MAKE) -C $(MKLDNN_BUILDDIR) install
+	mkdir -p $(MKLDNN_LIBDIR)
+	rsync -a $(OMP_LIBFILE) $(MKLDNN_LIBDIR)
+	rsync -a $(MKLML_LIBFILE) $(MKLDNN_LIBDIR)
+	rsync -a $(MKLDNN_LIBFILE) $(MKLDNN_LIBDIR)
+
+mkldnn_lib_sync:
+	mkdir -p $(MKLDNNROOT)
+	rsync -a $(MKLDNN_INSTALLDIR)/include $(MKLDNN_INSTALLDIR)/lib $(MKLDNNROOT)/.
+
+mkldnn_clean:
+	$(RM) -r 3rdparty/mkldnn/build
+	$(RM) -r 3rdparty/mkldnn/install/*
+
+ifeq ($(USE_MKLDNN), 1)
+ifeq ($(MKLDNNROOT), $(ROOTDIR)/3rdparty/mkldnn/install)
+mkldnn: mkldnn_build
+else
+mkldnn: mkldnn_lib_sync
+mkldnn_lib_sync: mkldnn_build
+endif
+else
+mkldnn:
+endif
diff --git a/prepare_mkl.sh b/prepare_mkl.sh
deleted file mode 100755
index 12e5df7ffe1..00000000000
--- a/prepare_mkl.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/bin/bash
-
-# set -ex
-#
-# All modification made by Intel Corporation: © 2016 Intel Corporation
-#
-# All contributions by the University of California:
-# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-# All rights reserved.
-#
-# All other contributions:
-# Copyright (c) 2014, 2015, the respective contributors
-# All rights reserved.
-# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-GetVersionName()
-{
-VERSION_LINE=0
-if [ $1 ]; then
-  VERSION_LINE=`grep __INTEL_MKL_BUILD_DATE $1/include/mkl_version.h 2>/dev/null | sed -e 's/.* //'`
-fi
-if [ -z $VERSION_LINE ]; then
-  VERSION_LINE=0
-fi
-echo $VERSION_LINE  # Return Version Line
-}
-
-# MKL
-HOME_MKL=$1
-if [ ! -d "$HOME_MKL" ]; then
-   mkdir $HOME_MKL
-fi
-MXNET_ROOT=`dirname $0`
-USE_MKLML=0
-# NOTE: if you update the following line, please also update the dockerfile at
-# tests/ci_build/Dockerfile.mkl
-VERSION_MATCH=20171227
-PLATFORM=$(uname)
-if [ $PLATFORM == "Darwin" ]; then
-    INFIX=mac
-elif [ $PLATFORM == "Linux" ]; then
-    INFIX=lnx
-fi
-ARCHIVE_BASENAME=mklml_${INFIX}_2018.0.1.${VERSION_MATCH}.tgz
-MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev`
-MKLURL="https://github.com/01org/mkl-dnn/releases/download/v0.12/$ARCHIVE_BASENAME"
-# there are diffrent MKL lib to be used for GCC and for ICC
-reg='^[0-9]+$'
-VERSION_LINE=`GetVersionName $MKLROOT`
-#echo $VERSION_LINE
-# Check if MKLROOT is set if positive then set one will be used..
-if [ -z $MKLROOT ]; then
-  # ..if MKLROOT is not set then check if we have MKL downloaded in proper version
-    VERSION_LINE=`GetVersionName $HOME_MKL`
-    #echo $VERSION_LINE
-    if [ $VERSION_LINE -lt $VERSION_MATCH ] ; then
-      #...If it is not then downloaded and unpacked
-      if [ $PLATFORM == "Darwin" ]; then
-        curl -L -o $MXNET_ROOT/$ARCHIVE_BASENAME $MKLURL
-      elif [ $PLATFORM == "Linux" ]; then
-        wget --quiet --no-check-certificate -P $MXNET_ROOT $MKLURL -O $MXNET_ROOT/$ARCHIVE_BASENAME
-      fi
-      tar -xzf $MXNET_ROOT/$ARCHIVE_BASENAME -C $MXNET_ROOT
-      #echo $HOME_MKL
-      yes | cp -rf $MXNET_ROOT/$MKL_CONTENT_DIR/* $HOME_MKL
-      rm -rf $MXNET_ROOT/$MKL_CONTENT_DIR
-    fi
-  if [ $PLATFORM == "Darwin" ]; then
-    MKLLIB=`find $HOME_MKL -name libmklml.dylib`
-  elif [ $PLATFORM == "Linux" ]; then
-    MKLLIB=`find $HOME_MKL -name libmklml_gnu.so`
-  fi
-  MKLROOT=`echo $MKLLIB | sed -e 's/lib.*$//'`
-fi
-
-# Check what MKL lib we have in MKLROOT
-if [ -z `find $MKLROOT \( -name libmklml_gnu.so -o -name libmklml.dylib \) -print -quit` ]; then
-  USE_MKLML=0
-elif [ -z `find $MKLROOT -name libmkl_core.so -print -quit` ]; then
-  USE_MKLML=1
-fi
-
-# return value to calling script (Makefile,cmake)
-echo $MKLROOT $USE_MKLML
diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
deleted file mode 100755
index 828cfe107ed..00000000000
--- a/prepare_mkldnn.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# set -ex
-#
-# All modification made by Intel Corporation: © 2016 Intel Corporation
-#
-# All contributions by the University of California:
-# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-# All rights reserved.
-#
-# All other contributions:
-# Copyright (c) 2014, 2015, the respective contributors
-# All rights reserved.
-# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-
-MXNET_ROOTDIR="$(pwd)"
-MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/"
-MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src"
-MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build"
-MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install"
-MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib"
-
-# MKLDNN install destination
-HOME_MKLDNN=$1
-if [ ! -z "$HOME_MKLDNN" ]; then
-  mkdir -p $HOME_MKLDNN
-  if [ ! -w $HOME_MKLDNN ]; then
-    echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2
-    exit 1
-  fi
-fi
-
-if [ $OSTYPE == "darwin16" ]; then
-  OMP_LIBFILE="$MKLDNN_INSTALLDIR/lib/libiomp5.dylib"
-  MKLML_LIBFILE="$MKLDNN_INSTALLDIR/lib/libmklml.dylib"
-  MKLDNN_LIBFILE="$MKLDNN_INSTALLDIR/lib/libmkldnn.0.dylib"
-else
-  OMP_LIBFILE="$MKLDNN_INSTALLDIR/lib/libiomp5.so"
-  MKLML_LIBFILE="$MKLDNN_INSTALLDIR/lib/libmklml_intel.so"
-  MKLDNN_LIBFILE="$MKLDNN_INSTALLDIR/lib/libmkldnn.so.0"
-fi
-
-if [ -z $MKLDNNROOT ]; then
-if [ ! -f $MKLDNN_LIBFILE ]; then
-    mkdir -p $MKLDNN_INSTALLDIR
-	cd $MKLDNN_ROOTDIR
-    if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then
-        rm -rf external && cd scripts && ./prepare_mkl.sh >&2 && cd ..
-        cp -a external/*/* $MKLDNN_INSTALLDIR/.
-    fi
-    echo "Building MKLDNN ..." >&2
-    cd $MXNET_ROOTDIR
-	g++ --version >&2
-    cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR -DARCH_OPT_FLAGS="-mtune=generic" >&2
-    NUM_PROC=1
-    if [[ ! -z $(command -v nproc) ]]; then
-      NUM_PROC=$(nproc)
-    elif [[ ! -z $(command -v sysctl) ]]; then
-      NUM_PROC=$(sysctl -n hw.ncpu)
-    else
-      >&2 echo "Can't discover number of cores."
-    fi
-    make -C $MKLDNN_BUILDDIR -j${NUM_PROC} VERBOSE=1 >&2
-
-    make -C $MKLDNN_BUILDDIR install >&2
-    rm -rf $MKLDNN_BUILDDIR
-    mkdir -p $MKLDNN_LIBDIR
-    cp $OMP_LIBFILE $MKLDNN_LIBDIR
-    cp $MKLML_LIBFILE $MKLDNN_LIBDIR
-    cp $MKLDNN_LIBFILE $MKLDNN_LIBDIR
-fi
-MKLDNNROOT=$MKLDNN_INSTALLDIR
-fi
-
-if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then
-  MKLROOT=$MKLDNNROOT;
-fi
-
-# user specified MKLDNN install folder
-if [ -d "$HOME_MKLDNN" ]; then
-  # skip if user specificed MKLDNNROOT
-  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
-  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
-  # update ldconfig if possible
-  if [ -w /etc/ld.so.conf.d ]; then
-    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
-  fi
-# return value to calling script (Makefile,cmake)
-  echo $HOME_MKLDNN $HOME_MKLDNN
-else
-  echo $MKLDNNROOT $MKLROOT
-fi
-
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index 58b8bd8d7fa..e960829e691 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -21,7 +21,7 @@
 """MXNet: a concise, fast and flexible framework for deep learning."""
 from __future__ import absolute_import
 
-from .context import Context, current_context, cpu, gpu
+from .context import Context, current_context, cpu, gpu, cpu_pinned
 from . import engine
 from .base import MXNetError
 from . import base
diff --git a/python/mxnet/attribute.py b/python/mxnet/attribute.py
index 15d38f81f2e..17044ddaef0 100644
--- a/python/mxnet/attribute.py
+++ b/python/mxnet/attribute.py
@@ -18,10 +18,12 @@
 # coding: utf-8
 """Attribute scoping support for symbolic API."""
 from __future__ import absolute_import
+import threading
+import warnings
 
-from .base import string_types
+from .base import string_types, classproperty, with_metaclass, _MXClassPropertyMetaClass
 
-class AttrScope(object):
+class AttrScope(with_metaclass(_MXClassPropertyMetaClass, object)):
     """Attribute manager for scoping.
 
     User can also inherit this object to change naming behavior.
@@ -31,7 +33,7 @@ class AttrScope(object):
     kwargs
         The attributes to set for all symbol creations in the scope.
     """
-    current = None
+    _current = threading.local()
 
     def __init__(self, **kwargs):
         self._old_scope = None
@@ -64,15 +66,35 @@ def get(self, attr):
 
     def __enter__(self):
         # pylint: disable=protected-access
-        self._old_scope = AttrScope.current
-        attr = AttrScope.current._attr.copy()
+        if not hasattr(AttrScope._current, "value"):
+            AttrScope._current.value = AttrScope()
+        self._old_scope = AttrScope._current.value
+        attr = AttrScope._current.value._attr.copy()
         attr.update(self._attr)
         self._attr = attr
-        AttrScope.current = self
+        AttrScope._current.value = self
         return self
 
     def __exit__(self, ptype, value, trace):
         assert self._old_scope
-        AttrScope.current = self._old_scope
+        AttrScope._current.value = self._old_scope
 
-AttrScope.current = AttrScope()
+    #pylint: disable=no-self-argument
+    @classproperty
+    def current(cls):
+        warnings.warn("AttrScope.current has been deprecated. "
+                      "It is advised to use the `with` statement with AttrScope.",
+                      DeprecationWarning)
+        if not hasattr(AttrScope._current, "value"):
+            cls._current.value = AttrScope()
+        return cls._current.value
+
+    @current.setter
+    def current(cls, val):
+        warnings.warn("AttrScope.current has been deprecated. "
+                      "It is advised to use the `with` statement with AttrScope.",
+                      DeprecationWarning)
+        cls._current.value = val
+    #pylint: enable=no-self-argument
+
+AttrScope._current.value = AttrScope()
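
For reference, the thread-local rewrite above keeps the documented `with` form working unchanged; only direct reads of `AttrScope.current` now go through the deprecated classproperty. A minimal sketch of the preserved behavior (the attribute name `group` is illustrative, matching the project's own tests):

    import mxnet as mx

    # Attributes set inside the scope are attached to symbols created there.
    # Each thread now gets its own scope, so a scope opened on one thread no
    # longer leaks into symbols built concurrently on another thread.
    with mx.AttrScope(group='stage1'):
        data = mx.symbol.Variable('data')
    assert data.attr('group') == 'stage1'
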
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 9790e090e38..0fb73b3c7dd 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-# pylint: disable=invalid-name, no-member, trailing-comma-tuple
+# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument
 """ctypes library of mxnet and helper functions."""
 from __future__ import absolute_import
 
@@ -98,6 +98,67 @@ class MXCallbackList(ctypes.Structure):
         ('contexts', ctypes.POINTER(ctypes.c_void_p))
         ]
 
+# Please see: https://stackoverflow.com/questions/5189699/how-to-make-a-class-property
+class _MXClassPropertyDescriptor(object):
+    def __init__(self, fget, fset=None):
+        self.fget = fget
+        self.fset = fset
+
+    def __get__(self, obj, clas=None):
+        if clas is None:
+            clas = type(obj)
+        return self.fget.__get__(obj, clas)()
+
+    def __set__(self, obj, value):
+        if not self.fset:
+            raise MXNetError("cannot set attribute: %s has no setter defined"
+                             % getattr(obj, '__name__', type(obj).__name__))
+        if inspect.isclass(obj):
+            type_ = obj
+            obj = None
+        else:
+            type_ = type(obj)
+        return self.fset.__get__(obj, type_)(value)
+
+    def setter(self, func):
+        if not isinstance(func, (classmethod, staticmethod)):
+            func = classmethod(func)
+        self.fset = func
+        return self
+
+class _MXClassPropertyMetaClass(type):
+    def __setattr__(cls, key, value):
+        obj = cls.__dict__.get(key)
+        if isinstance(obj, _MXClassPropertyDescriptor):
+            return obj.__set__(cls, value)
+
+        return super(_MXClassPropertyMetaClass, cls).__setattr__(key, value)
+
+# with_metaclass function obtained from: https://github.com/benjaminp/six/blob/master/six.py
+#pylint: disable=unused-argument
+def with_metaclass(meta, *bases):
+    """Create a base class with a metaclass."""
+    # This requires a bit of explanation: the basic idea is to make a dummy
+    # metaclass for one level of class instantiation that replaces itself with
+    # the actual metaclass.
+    class metaclass(type):
+
+        def __new__(cls, name, this_bases, d):
+            return meta(name, bases, d)
+
+        @classmethod
+        def __prepare__(cls, name, this_bases):
+            return meta.__prepare__(name, bases)
+    return type.__new__(metaclass, 'temporary_class', (), {})
+#pylint: enable=unused-argument
+
+def classproperty(func):
+    if not isinstance(func, (classmethod, staticmethod)):
+        func = classmethod(func)
+
+    return _MXClassPropertyDescriptor(func)
+
+
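The `classproperty`/`_MXClassPropertyMetaClass` pair above exists so that class-level assignments such as `AttrScope.current = ...` can be intercepted and routed through a deprecation warning. A minimal sketch of how the pieces compose, using a hypothetical `Registry` class:

    import threading
    from mxnet.base import classproperty, with_metaclass, _MXClassPropertyMetaClass

    class Registry(with_metaclass(_MXClassPropertyMetaClass, object)):
        _store = threading.local()

        # pylint: disable=no-self-argument
        @classproperty
        def current(cls):
            return getattr(cls._store, 'value', None)

        @current.setter
        def current(cls, val):
            cls._store.value = val
        # pylint: enable=no-self-argument

    Registry.current = 'x'   # intercepted by the metaclass, routed to the setter
    assert Registry.current == 'x'
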
 
 def _load_lib():
     """Load library by searching possible path."""
@@ -227,6 +288,7 @@ def c_str_array(strings):
         arr[:] = [s.encode('utf-8') for s in strings]
         return arr
 
+
 def c_array(ctype, values):
     """Create ctypes array from a Python array.
 
diff --git a/python/mxnet/context.py b/python/mxnet/context.py
index eb47614e333..61b70532dd7 100644
--- a/python/mxnet/context.py
+++ b/python/mxnet/context.py
@@ -18,8 +18,15 @@
 # coding: utf-8
 """Context management API of mxnet."""
 from __future__ import absolute_import
+import threading
+import warnings
+import ctypes
+from .base import classproperty, with_metaclass, _MXClassPropertyMetaClass
+from .base import _LIB
+from .base import check_call
 
-class Context(object):
+
+class Context(with_metaclass(_MXClassPropertyMetaClass, object)):
     """Constructs a context.
 
     MXNet can run operations on CPU and different GPUs.
@@ -61,7 +68,7 @@ class Context(object):
     gpu(1)
     """
     # static class variable
-    default_ctx = None
+    _default_ctx = threading.local()
     devtype2str = {1: 'cpu', 2: 'gpu', 3: 'cpu_pinned', 5: 'cpu_shared'}
     devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3, 'cpu_shared': 5}
     def __init__(self, device_type, device_id=0):
@@ -109,15 +116,37 @@ def __repr__(self):
         return self.__str__()
 
     def __enter__(self):
-        self._old_ctx = Context.default_ctx
-        Context.default_ctx = self
+        if not hasattr(Context._default_ctx, "value"):
+            Context._default_ctx.value = Context('cpu', 0)
+        self._old_ctx = Context._default_ctx.value
+        Context._default_ctx.value = self
         return self
 
     def __exit__(self, ptype, value, trace):
-        Context.default_ctx = self._old_ctx
+        Context._default_ctx.value = self._old_ctx
+
+    #pylint: disable=no-self-argument
+    @classproperty
+    def default_ctx(cls):
+        warnings.warn("Context.default_ctx has been deprecated. "
+                      "Please use Context.current_context() instead. "
+                      "Please use test_utils.set_default_context to set a default context",
+                      DeprecationWarning)
+        if not hasattr(Context._default_ctx, "value"):
+            cls._default_ctx.value = Context('cpu', 0)
+        return cls._default_ctx.value
+
+    @default_ctx.setter
+    def default_ctx(cls, val):
+        warnings.warn("Context.default_ctx has been deprecated. "
+                      "Please use Context.current_context() instead. "
+                      "Please use test_utils.set_default_context to set a default context",
+                      DeprecationWarning)
+        cls._default_ctx.value = val
+    #pylint: enable=no-self-argument
 
 # initialize the default context in Context
-Context.default_ctx = Context('cpu', 0)
+Context._default_ctx.value = Context('cpu', 0)
 
 
 def cpu(device_id=0):
@@ -212,6 +241,23 @@ def gpu(device_id=0):
     return Context('gpu', device_id)
 
 
+def num_gpus():
+    """Query CUDA for the number of GPUs present.
+
+    Returns
+    -------
+    count : int
+        The number of GPUs.
+
+    Raises
+    ------
+    MXNetError
+        If any CUDA error occurs while querying the device count.
+
+    """
+    count = ctypes.c_int()
+    check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
+    return count.value
+
 def current_context():
     """Returns the current context.
 
@@ -234,4 +280,6 @@ def current_context():
     -------
     default_ctx : Context
     """
-    return Context.default_ctx
+    if not hasattr(Context._default_ctx, "value"):
+        Context._default_ctx.value = Context('cpu', 0)
+    return Context._default_ctx.value
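
With this change the default context is stored per thread, and the new `num_gpus()` helper queries the CUDA runtime through `MXGetGPUCount`. A small usage sketch:

    import mxnet as mx
    from mxnet.context import num_gpus

    ctx = mx.gpu(0) if num_gpus() > 0 else mx.cpu()
    with ctx:
        # current_context() consults the thread-local default, so worker
        # threads can each run under their own default context.
        print(mx.current_context())
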
diff --git a/python/mxnet/contrib/onnx/__init__.py b/python/mxnet/contrib/onnx/__init__.py
index 169ac673455..fb8488ca4f2 100644
--- a/python/mxnet/contrib/onnx/__init__.py
+++ b/python/mxnet/contrib/onnx/__init__.py
@@ -14,7 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
 """Module for ONNX model format support for Apache MXNet."""
 
-from ._import.import_model import import_model
+from ._import.import_model import import_model, get_model_metadata
diff --git a/python/mxnet/contrib/onnx/_import/import_model.py b/python/mxnet/contrib/onnx/_import/import_model.py
index 1bd4b418bc3..4e4d7863755 100644
--- a/python/mxnet/contrib/onnx/_import/import_model.py
+++ b/python/mxnet/contrib/onnx/_import/import_model.py
@@ -52,3 +52,33 @@ def import_model(model_file):
     model_proto = onnx.load(model_file)
     sym, arg_params, aux_params = graph.from_onnx(model_proto.graph)
     return sym, arg_params, aux_params
+
+def get_model_metadata(model_file):
+    """
+    Returns the name and shape information of input and output tensors of the given ONNX model file.
+
+    Parameters
+    ----------
+    model_file : str
+        ONNX model file name
+
+    Returns
+    -------
+    model_metadata : dict
+        A dictionary object mapping various metadata to its corresponding value.
+        The dictionary will have the following template.
+        {
+            'input_tensor_data' : <list of tuples representing the shape of the input parameters>,
+            'output_tensor_data' : <list of tuples representing the shape of the output
+                                    of the model>
+        }
+    """
+    graph = GraphProto()
+    try:
+        import onnx
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          + "Instructions to install - https://github.com/onnx/onnx")
+    model_proto = onnx.load(model_file)
+    metadata = graph.get_graph_metadata(model_proto.graph)
+    return metadata
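
A usage sketch for the new metadata API; 'model.onnx' is a placeholder path and the printed shapes are illustrative:

    import mxnet.contrib.onnx as onnx_mxnet

    metadata = onnx_mxnet.get_model_metadata('model.onnx')
    print(metadata['input_tensor_data'])    # e.g. [('data', (1, 3, 224, 224))]
    print(metadata['output_tensor_data'])
    sym, arg_params, aux_params = onnx_mxnet.import_model('model.onnx')
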
diff --git a/python/mxnet/contrib/onnx/_import/import_onnx.py b/python/mxnet/contrib/onnx/_import/import_onnx.py
index 5192c6f8a85..db233578ff9 100644
--- a/python/mxnet/contrib/onnx/_import/import_onnx.py
+++ b/python/mxnet/contrib/onnx/_import/import_onnx.py
@@ -132,6 +132,29 @@ def from_onnx(self, graph):
             out = out[0]
         return out, argDict, auxDict
 
+    def get_graph_metadata(self, graph):
+        """
+        Get the model metadata from a given onnx graph.
+        """
+        _params = set()
+        for tensor_vals in graph.initializer:
+            _params.add(tensor_vals.name)
+
+        input_data = []
+        for graph_input in graph.input:
+            if graph_input.name not in _params:
+                shape = [val.dim_value for val in graph_input.type.tensor_type.shape.dim]
+                input_data.append((graph_input.name, tuple(shape)))
+
+        output_data = []
+        for graph_out in graph.output:
+            shape = [val.dim_value for val in graph_out.type.tensor_type.shape.dim]
+            output_data.append((graph_out.name, tuple(shape)))
+        metadata = {'input_tensor_data' : input_data,
+                    'output_tensor_data' : output_data
+                   }
+        return metadata
+
     def _parse_array(self, tensor_proto):
         """Grab data in TensorProto and convert to numpy array."""
         try:
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 579e6d3e35b..c0272c5bb43 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -20,15 +20,15 @@
 """Symbolic Executor component of MXNet."""
 from __future__ import absolute_import
 
+from array import array as py_array
 import ctypes
 import copy
 import numpy as np
 from .base import _LIB
-from .base import mx_uint, NDArrayHandle, ExecutorHandle
-from .base import check_call, c_handle_array, py_str
+from .base import mx_uint, NDArrayHandle, ExecutorHandle, py_str
+from .base import check_call, c_handle_array, c_array_buf, c_str_array
 from .ndarray import NDArray
 from .ndarray import _ndarray_cls
-from . import ndarray as nd
 
 # those functions are not used here, we just import them to keep backward compatibility
 # in case the end user calls them, as they originally lives here
@@ -399,62 +399,73 @@ def reshape(self, partial_shaping=False, allow_up_sizing=False, **kwargs):
         >>> texec.reshape(allow_up_sizing=True, **new_shape)
         """
         # pylint: disable=too-many-branches
-        arg_shapes, _, aux_shapes = self._symbol.infer_shape(**kwargs)
-        if arg_shapes is None:
-            raise ValueError("Insufficient argument shapes provided.")
-
-        new_arg_dict = {}
-        new_grad_dict = {}
-        for i, name in enumerate(self._symbol.list_arguments()):
-            new_shape = arg_shapes[i]
-            arr = self.arg_arrays[i]
-            darr = None if self.grad_arrays is None else self.grad_arrays[i]
-            if partial_shaping or name in kwargs or new_shape == arr.shape:
-                if np.prod(new_shape) > np.prod(arr.shape):
-                    assert allow_up_sizing, "New shape of arg:%s larger than original. "%name + \
-                        "First making a big executor and then down sizing it " + \
-                        "is more efficient than the reverse." + \
-                        "If you really want to up size, set allow_up_sizing=True " + \
-                        "to enable allocation of new arrays."
-                    new_arg_dict[name] = nd.empty(new_shape, ctx=arr.context, dtype=arr.dtype)
-                    if darr is not None:
-                        new_grad_dict[name] = nd.empty(new_shape, ctx=darr.context, dtype=arr.dtype)
-                else:
-                    new_arg_dict[name] = arr.reshape(new_shape)
-                    if darr is not None:
-                        new_grad_dict[name] = darr.reshape(new_shape)
-            else:
-                raise AssertionError("Shape of unspecified array arg:%s changed. "%name + \
-                    "This can cause the new executor to not share parameters " + \
-                    "with the old one. Please check for error in network." +\
-                    "If this is intended, set partial_shaping=True to suppress this warning.")
-
-        new_aux_dict = {}
-        for name, new_shape, arr in zip(self._symbol.list_auxiliary_states(),
-                                        aux_shapes, self.aux_arrays):
-            if partial_shaping or new_shape == arr.shape:
-                if np.prod(new_shape) > np.prod(arr.shape):
-                    assert allow_up_sizing, "New shape of arg:%s larger than original. "%name + \
-                        "First making a big executor and then down sizing it " + \
-                        "is more efficient than the reverse." + \
-                        "If you really want to up size, set allow_up_sizing=True " + \
-                        "to enable allocation of new arrays."
-                    new_aux_dict[name] = nd.empty(new_shape, ctx=arr.context, dtype=arr.dtype)
-                else:
-                    new_aux_dict[name] = arr.reshape(new_shape)
-            else:
-                raise AssertionError("Shape of unspecified array aux:%s changed. "%name + \
-                    "This can cause the new executor to not share parameters " + \
-                    "with the old one. Please check for error in network." +\
-                    "If this is intended, set partial_shaping=True to suppress this warning.")
-
-        return self._symbol.bind(self._ctx,
-                                 args=new_arg_dict,
-                                 args_grad=new_grad_dict,
-                                 grad_req=self._grad_req,
-                                 aux_states=new_aux_dict,
-                                 group2ctx=self._group2ctx,
-                                 shared_exec=self)
+        provided_arg_shape_data = []  # flattened shape data
+        # provided_arg_shape_idx[i] indexes into provided_arg_shape_data:
+        # the shape of the i-th provided arg spans [idx[i], idx[i+1])
+        provided_arg_shape_idx = [0]
+        provided_arg_shape_names = []  # provided argument names
+        for k, v in kwargs.items():
+            if isinstance(v, tuple):
+                provided_arg_shape_names.append(k)
+                provided_arg_shape_data.extend(v)
+                provided_arg_shape_idx.append(len(provided_arg_shape_data))
+
+        ctx_map_keys = []
+        ctx_map_dev_types = []
+        ctx_map_dev_ids = []
+
+        if self._group2ctx:
+            for key, val in self._group2ctx.items():
+                ctx_map_keys.append(key)
+                ctx_map_dev_types.append(val.device_typeid)
+                ctx_map_dev_ids.append(val.device_id)
+
+        handle = ExecutorHandle()
+        shared_handle = self.handle
+
+        num_in_args = ctypes.c_uint()
+        in_arg_handles = ctypes.POINTER(NDArrayHandle)()
+        arg_grad_handles = ctypes.POINTER(NDArrayHandle)()
+        num_aux_states = ctypes.c_uint()
+        aux_state_handles = ctypes.POINTER(NDArrayHandle)()
+
+        check_call(_LIB.MXExecutorReshape(ctypes.c_int(int(partial_shaping)),
+                                          ctypes.c_int(int(allow_up_sizing)),
+                                          ctypes.c_int(self._ctx.device_typeid),
+                                          ctypes.c_int(self._ctx.device_id),
+                                          mx_uint(len(ctx_map_keys)),
+                                          c_str_array(ctx_map_keys),
+                                          c_array_buf(ctypes.c_int,
+                                                      py_array('i', ctx_map_dev_types)),
+                                          c_array_buf(ctypes.c_int,
+                                                      py_array('i', ctx_map_dev_ids)),
+                                          mx_uint(len(provided_arg_shape_names)),
+                                          c_str_array(provided_arg_shape_names),
+                                          c_array_buf(mx_uint,
+                                                      py_array('I', provided_arg_shape_data)),
+                                          c_array_buf(mx_uint,
+                                                      py_array('I', provided_arg_shape_idx)),
+                                          ctypes.byref(num_in_args),
+                                          ctypes.byref(in_arg_handles),
+                                          ctypes.byref(arg_grad_handles),
+                                          ctypes.byref(num_aux_states),
+                                          ctypes.byref(aux_state_handles),
+                                          shared_handle,
+                                          ctypes.byref(handle)))
+
+        arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i]))
+                      for i in range(num_in_args.value)]
+        grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i]))
+                       if arg_grad_handles[i] is not None
+                       else None for i in range(num_in_args.value)]
+        aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i]))
+                      for i in range(num_aux_states.value)]
+
+        executor = Executor(handle, self._symbol, self._ctx, self._grad_req, self._group2ctx)
+        executor.arg_arrays = arg_arrays
+        executor.grad_arrays = grad_arrays
+        executor.aux_arrays = aux_arrays
+        return executor
 
     def debug_str(self):
         """Get a debug string about internal execution plan.
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index abc474850f2..dbe3c5e032b 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -20,6 +20,7 @@
 """Base container class for all neural network models."""
 __all__ = ['Block', 'HybridBlock', 'SymbolBlock']
 
+import threading
 import copy
 import warnings
 import re
@@ -30,12 +31,12 @@
 from ..ndarray import NDArray
 from .. import name as _name
 from .parameter import Parameter, ParameterDict, DeferredInitializationError
-from .utils import _indent, _brief_print_list
+from .utils import _indent, _brief_print_list, HookHandle
 
 
 class _BlockScope(object):
     """Scope for collecting child `Block` s."""
-    _current = None
+    _current = threading.local()
 
     def __init__(self, block):
         self._block = block
@@ -46,10 +47,10 @@ def __init__(self, block):
     @staticmethod
     def create(prefix, params, hint):
         """Creates prefix and params for new `Block`."""
-        current = _BlockScope._current
+        current = getattr(_BlockScope._current, "value", None)
         if current is None:
             if prefix is None:
-                prefix = _name.NameManager.current.get(None, hint) + '_'
+                prefix = _name.NameManager._current.value.get(None, hint) + '_'
             if params is None:
                 params = ParameterDict(prefix)
             else:
@@ -70,8 +71,8 @@ def create(prefix, params, hint):
     def __enter__(self):
         if self._block._empty_prefix:
             return self
-        self._old_scope = _BlockScope._current
-        _BlockScope._current = self
+        self._old_scope = getattr(_BlockScope._current, "value", None)
+        _BlockScope._current.value = self
         self._name_scope = _name.Prefix(self._block.prefix)
         self._name_scope.__enter__()
         return self
@@ -81,7 +82,7 @@ def __exit__(self, ptype, value, trace):
             return
         self._name_scope.__exit__(ptype, value, trace)
         self._name_scope = None
-        _BlockScope._current = self._old_scope
+        _BlockScope._current.value = self._old_scope
 
 
 def _flatten(args, inout_str):
@@ -172,6 +173,8 @@ def __init__(self, prefix=None, params=None):
         self._scope = _BlockScope(self)
         self._children = OrderedDict()
         self._reg_params = {}
+        self._forward_hooks = OrderedDict()
+        self._forward_pre_hooks = OrderedDict()
 
     def __repr__(self):
         s = '{name}(\n{modstr}\n)'
@@ -354,7 +357,6 @@ def load_params(self, filename, ctx=None, allow_missing=False,
                         name, filename, _brief_print_list(self._params.keys())))
             params[name]._load_init(loaded[name], ctx)
 
-
     def register_child(self, block, name=None):
         """Registers block as a child of self. :py:class:`Block` s assigned to self as
         attributes will be registered automatically."""
@@ -362,6 +364,61 @@ def register_child(self, block, name=None):
             name = str(len(self._children))
         self._children[name] = block
 
+    def register_forward_pre_hook(self, hook):
+        r"""Registers a forward pre-hook on the block.
+
+        The hook function is called immediately before :func:`forward`.
+        It should not modify the input.
+
+        Parameters
+        ----------
+        hook : callable
+            The forward hook function of form `hook(block, input) -> None`.
+
+        Returns
+        -------
+        :class:`mxnet.gluon.utils.HookHandle`
+        """
+        handle = HookHandle()
+        handle.attach(self._forward_pre_hooks, hook)
+        return handle
+
+    def register_forward_hook(self, hook):
+        r"""Registers a forward hook on the block.
+
+        The hook function is called immediately after :func:`forward`.
+        It should not modify the input or output.
+
+        Parameters
+        ----------
+        hook : callable
+            The forward hook function of form `hook(block, input, output) -> None`.
+
+        Returns
+        -------
+        :class:`mxnet.gluon.utils.HookHandle`
+        """
+        handle = HookHandle()
+        handle.attach(self._forward_hooks, hook)
+        return handle
+
+    def apply(self, fn):
+        r"""Applies ``fn`` recursively to every child block as well as self.
+
+        Parameters
+        ----------
+        fn : callable
+            Function to be applied to each submodule, of form `fn(block)`.
+
+        Returns
+        -------
+        this block
+        """
+        for cld in self._children.values():
+            cld.apply(fn)
+        fn(self)
+        return self
+
     def initialize(self, init=initializer.Uniform(), ctx=None, verbose=False,
                    force_reinit=False):
         """Initializes :py:class:`Parameter` s of this :py:class:`Block` and its children.
@@ -410,7 +467,15 @@ def cast(self, dtype):
 
     def __call__(self, *args):
         """Calls forward. Only accepts positional arguments."""
-        return self.forward(*args)
+        for hook in self._forward_pre_hooks.values():
+            hook(self, args)
+
+        out = self.forward(*args)
+
+        for hook in self._forward_hooks.values():
+            hook(self, args, out)
+
+        return out
 
     def forward(self, *args):
         """Overrides to implement forward computation using :py:class:`NDArray`. Only
@@ -424,6 +489,105 @@ def forward(self, *args):
         # pylint: disable= invalid-name
         raise NotImplementedError
 
+    def summary(self, *inputs):
+        """Print the summary of the model's output and parameters.
+
+        The network must have been initialized, and must not have been hybridized.
+
+        Parameters
+        ----------
+        inputs : object
+            Any input that the model supports. For any tensor in the input, only
+            :class:`mxnet.ndarray.NDArray` is supported.
+        """
+        summary = OrderedDict()
+        hooks = []
+
+        def _get_shape_str(args):
+            def flatten(args):
+                if not isinstance(args, (list, tuple)):
+                    return [args], 0
+                flat = []
+                fmts = []
+                for i in args:
+                    arg, fmt = flatten(i)
+                    flat.extend(arg)
+                    fmts.append(fmt)
+                return flat, fmts
+
+            def regroup(args, fmt):
+                if isinstance(fmt, int):
+                    if fmt == 0:
+                        return args[0], args[1:]
+                    return args[:fmt], args[fmt:]
+                ret = []
+                for i in fmt:
+                    res, args = regroup(args, i)
+                    ret.append(res)
+                return ret, args
+
+            flat_args, fmts = flatten(args)
+            flat_arg_shapes = [x.shape if isinstance(x, ndarray.NDArray) else x
+                               for x in flat_args]
+            shapes = regroup(flat_arg_shapes, fmts)[0]
+            if isinstance(shapes, list):
+                shape_str = str(shapes)[1:-1]
+            else:
+                shape_str = str(shapes)
+            return shape_str.replace('L', '')
+
+        def _register_summary_hook(block):
+            assert not isinstance(block, HybridBlock) or not block._active, \
+                    '"{}" must not be hybridized to print summary.'.format(block.name)
+            def _summary_hook(block, _, outputs):
+                class_name = block.__class__.__name__
+                block_idx = len(summary) - 1
+
+                m_key = '%s-%i' % (class_name, block_idx+1)
+                summary[m_key] = OrderedDict()
+                summary[m_key]['output_shape'] = _get_shape_str(outputs)
+
+                params = 0
+                summary[m_key]['trainable'] = 0
+                for p in block._reg_params.values():
+                    params += p.data().size
+                    summary[m_key]['trainable'] += 0 if p.grad_req == 'null' else p.data().size
+                summary[m_key]['n_params'] = params
+
+            from .nn.basic_layers import Sequential, HybridSequential
+            if not isinstance(block, (Sequential, HybridSequential)):
+                hooks.append(block.register_forward_hook(_summary_hook))
+
+        summary['Input'] = OrderedDict()
+        summary['Input']['output_shape'] = _get_shape_str(inputs)
+        summary['Input']['n_params'] = 0
+        summary['Input']['trainable'] = 0
+
+        try:
+            self.apply(_register_summary_hook)
+            self(*inputs)
+
+            line_format = '{:>20}  {:>42} {:>15}'
+            print('-'*80)
+            print(line_format.format('Layer (type)', 'Output Shape', 'Param #'))
+            print('='*80)
+            total_params = 0
+            trainable_params = 0
+            for layer in summary:
+                print(line_format.format(layer,
+                                         str(summary[layer]['output_shape']),
+                                         summary[layer]['n_params']))
+                total_params += summary[layer]['n_params']
+                trainable_params += summary[layer]['trainable']
+            print('='*80)
+            print('Total params: ' + str(total_params))
+            print('Trainable params: ' + str(trainable_params))
+            print('Non-trainable params: ' + str(total_params - trainable_params))
+            print('-'*80)
+        finally:
+            for h in hooks:
+                h.detach()
+
 
 class HybridBlock(Block):
     """`HybridBlock` supports forwarding with both Symbol and NDArray.
@@ -548,6 +712,9 @@ def hybridize(self, active=True, **kwargs):
         self._active = active
         self._flags = kwargs.items()
         self._clear_cached_op()
+        if active and (self._forward_hooks or self._forward_pre_hooks):
+            warnings.warn('"{}" is being hybridized while still having forward hook/pre-hook. '
+                          'If "{}" is a child of HybridBlock, the hooks will not take effect.'
+                          .format(self.name, self.name))
         super(HybridBlock, self).hybridize(active, **kwargs)
 
     def cast(self, dtype):
@@ -648,6 +815,18 @@ def hybrid_forward(self, F, x, *args, **kwargs):
         # pylint: disable= invalid-name
         raise NotImplementedError
 
+def _common_prefix(names):
+    """Get the common prefix for all names"""
+    if not names:
+        return ''
+    prefix = names[0]
+    for name in names:
+        i = 0
+        while i < len(prefix) and i < len(name) and prefix[i] == name[i]:
+            i += 1
+        prefix = prefix[:i]
+    return prefix
+
 
 class SymbolBlock(HybridBlock):
     """Construct block from symbol. This is useful for using pre-trained models
@@ -709,6 +888,8 @@ def __init__(self, outputs, inputs, params=None):
                 self.params.get(i, grad_req='null', allow_deferred_init=True)
 
         self._cached_graph = syms, out
+        len_prefix = len(_common_prefix(list(self._params.keys())))
+        self._reg_params = {key[len_prefix:]: val for key, val in self._params.items()}
 
     def forward(self, x, *args):
         if isinstance(x, NDArray):
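
The new hook registry also powers `summary()`. A short sketch, assuming an initialized, non-hybridized network:

    import mxnet as mx
    from mxnet.gluon import nn

    net = nn.HybridSequential()
    with net.name_scope():
        net.add(nn.Dense(64, activation='relu'))
        net.add(nn.Dense(10))
    net.initialize()

    # Forward hooks fire after each __call__ and must not mutate their arguments.
    handle = net.register_forward_hook(
        lambda block, inputs, output: print(block.name, output.shape))
    net(mx.nd.zeros((2, 32)))
    handle.detach()

    net.summary(mx.nd.zeros((2, 32)))
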
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index 7f09e286742..151b49d457a 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -20,27 +20,49 @@
 """Dataset generator."""
 __all__ = ['DataLoader']
 
-import multiprocessing
-import multiprocessing.queues
-from multiprocessing.reduction import ForkingPickler
 import pickle
 import io
 import sys
+import multiprocessing
+import multiprocessing.queues
+from multiprocessing.reduction import ForkingPickler
 import numpy as np
 
+try:
+    import multiprocessing.resource_sharer
+except ImportError:
+    pass
+
 from . import sampler as _sampler
 from ... import nd, context
 
-
-def rebuild_ndarray(*args):
-    """Rebuild ndarray from pickled shared memory"""
-    # pylint: disable=no-value-for-parameter
-    return nd.NDArray(nd.ndarray._new_from_shared_mem(*args))
-
-
-def reduce_ndarray(data):
-    """Reduce ndarray to shared memory handle"""
-    return rebuild_ndarray, data._to_shared_mem()
+if sys.platform == 'darwin' or sys.platform == 'win32':
+    def rebuild_ndarray(*args):
+        """Rebuild ndarray from pickled shared memory"""
+        # pylint: disable=no-value-for-parameter
+        return nd.NDArray(nd.ndarray._new_from_shared_mem(*args))
+
+    def reduce_ndarray(data):
+        """Reduce ndarray to shared memory handle"""
+        return rebuild_ndarray, data._to_shared_mem()
+else:
+    def rebuild_ndarray(pid, fd, shape, dtype):
+        """Rebuild ndarray from pickled shared memory"""
+        # pylint: disable=no-value-for-parameter
+        if sys.version_info[0] == 2:
+            fd = multiprocessing.reduction.rebuild_handle(fd)
+        else:
+            fd = fd.detach()
+        return nd.NDArray(nd.ndarray._new_from_shared_mem(pid, fd, shape, dtype))
+
+    def reduce_ndarray(data):
+        """Reduce ndarray to shared memory handle"""
+        pid, fd, shape, dtype = data._to_shared_mem()
+        if sys.version_info[0] == 2:
+            fd = multiprocessing.reduction.reduce_handle(fd)
+        else:
+            fd = multiprocessing.reduction.DupFd(fd)
+        return rebuild_ndarray, (pid, fd, shape, dtype)
 
 ForkingPickler.register(nd.NDArray, reduce_ndarray)
 
@@ -83,6 +105,21 @@ def __init__(self, *args, **kwargs):
         self._recv = self._reader.recv
 
 
+class SimpleQueue(multiprocessing.queues.SimpleQueue):
+    """Wrapper for multiprocessing SimpleQueue that dumps NDArray with shared memory.
+       SimpleQueue doesn't use threading internally.
+    """
+    def __init__(self, *args, **kwargs):
+        if sys.version_info[0] <= 2:
+            super(SimpleQueue, self).__init__(*args, **kwargs)
+        else:
+            super(SimpleQueue, self).__init__(*args, ctx=multiprocessing.get_context(),
+                                              **kwargs)
+        self._reader = ConnectionWrapper(self._reader)
+        self._writer = ConnectionWrapper(self._writer)
+        self._send = self._writer.send
+        self._recv = self._reader.recv
+
 def default_batchify_fn(data):
     """Collate data into batch."""
     if isinstance(data[0], nd.NDArray):
@@ -128,7 +165,7 @@ def __init__(self, num_workers, dataset, batchify_fn, batch_sampler):
         self._batchify_fn = batchify_fn
         self._batch_sampler = batch_sampler
         self._key_queue = Queue()
-        self._data_queue = Queue(2*self._num_workers)
+        self._data_queue = Queue() if sys.version_info[0] <= 2 else SimpleQueue()
         self._data_buffer = {}
         self._rcvd_idx = 0
         self._sent_idx = 0
@@ -170,10 +207,10 @@ def __next__(self):
             raise StopIteration
 
         while True:
+            self._push_next()
             if self._rcvd_idx in self._data_buffer:
                 batch = self._data_buffer.pop(self._rcvd_idx)
                 self._rcvd_idx += 1
-                self._push_next()
                 return batch
             idx, batch = self._data_queue.get()
             self._data_buffer[idx] = batch
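
From the user's perspective the loader API is unchanged; on Python 3 worker results now flow through a `SimpleQueue`, and NDArrays are reduced to shared-memory handles (duplicated file descriptors on Linux) instead of raw pickles. A small sketch:

    import mxnet as mx
    from mxnet.gluon.data import ArrayDataset, DataLoader

    dataset = ArrayDataset(mx.nd.arange(100).reshape((50, 2)), mx.nd.zeros(50))
    loader = DataLoader(dataset, batch_size=10, num_workers=2)
    for data, label in loader:
        print(data.shape, label.shape)   # (10, 2) (10,)
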
diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py
index 5e65715e1b2..7ec1c32d5e3 100644
--- a/python/mxnet/gluon/data/vision/transforms.py
+++ b/python/mxnet/gluon/data/vision/transforms.py
@@ -200,7 +200,7 @@ def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0),
         super(RandomResizedCrop, self).__init__()
         if isinstance(size, numeric_types):
             size = (size, size)
-        self._args = (size, scale[0], ratio, interpolation)
+        self._args = (size, scale, ratio, interpolation)
 
     def forward(self, x):
         return image.random_size_crop(x, *self._args)[0]
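
Before this fix only `scale[0]` was forwarded, silently dropping the upper bound of the requested area range. A sketch of the intended usage (the input image is a dummy tensor):

    from mxnet import nd
    from mxnet.gluon.data.vision import transforms

    crop = transforms.RandomResizedCrop(224, scale=(0.08, 1.0))
    out = crop(nd.zeros((480, 640, 3), dtype='uint8'))
    print(out.shape)   # (224, 224, 3)
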
diff --git a/python/mxnet/gluon/model_zoo/model_store.py b/python/mxnet/gluon/model_zoo/model_store.py
index 14ec8d0a236..cbd95cfcc8b 100644
--- a/python/mxnet/gluon/model_zoo/model_store.py
+++ b/python/mxnet/gluon/model_zoo/model_store.py
@@ -35,7 +35,10 @@
     ('8e9d539cc66aa5efa71c4b6af983b936ab8701c3', 'mobilenet0.5'),
     ('529b2c7f4934e6cb851155b22c96c9ab0a7c4dc2', 'mobilenet0.75'),
     ('6b8c5106c730e8750bcd82ceb75220a3351157cd', 'mobilenet1.0'),
-    ('3ab4967b7a12a9246a144c9dfff74506cb78a526', 'mobilenetv2_1.0'),
+    ('36da4ff1867abccd32b29592d79fc753bca5a215', 'mobilenetv2_1.0'),
+    ('e2be7b72a79fe4a750d1dd415afedf01c3ea818d', 'mobilenetv2_0.75'),
+    ('aabd26cd335379fcb72ae6c8fac45a70eab11785', 'mobilenetv2_0.5'),
+    ('ae8f9392789b04822cbb1d98c27283fc5f8aa0a7', 'mobilenetv2_0.25'),
     ('e54b379f50fa4b10bbd2506237e3bd74e6164778', 'resnet18_v1'),
     ('c1dc0967a3d25ee9127e03bc1046a5d44d92e2ba', 'resnet34_v1'),
     ('c940b1a062b32e3a5762f397c9d1e178b5abd007', 'resnet50_v1'),
diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py
index d86c3e6ce4f..abde51b433a 100644
--- a/python/mxnet/gluon/nn/basic_layers.py
+++ b/python/mxnet/gluon/nn/basic_layers.py
@@ -381,7 +381,8 @@ class Embedding(HybridBlock):
         Data type of output embeddings.
     weight_initializer : Initializer
         Initializer for the `embeddings` matrix.
-
+    sparse_grad : bool, default False
+        If True, gradient w.r.t. weight will be a 'row_sparse' NDArray.
 
     Inputs:
         - **data**: (N-1)-D tensor with shape: `(x1, x2, ..., xN-1)`.
@@ -390,13 +391,14 @@ class Embedding(HybridBlock):
         - **out**: N-D tensor with shape: `(x1, x2, ..., xN-1, output_dim)`.
     """
     def __init__(self, input_dim, output_dim, dtype='float32',
-                 weight_initializer=None, **kwargs):
+                 weight_initializer=None, sparse_grad=False, **kwargs):
         super(Embedding, self).__init__(**kwargs)
+        grad_stype = 'row_sparse' if sparse_grad else 'default'
         self._kwargs = {'input_dim': input_dim, 'output_dim': output_dim,
-                        'dtype': dtype}
+                        'dtype': dtype, 'sparse_grad': sparse_grad}
         self.weight = self.params.get('weight', shape=(input_dim, output_dim),
                                       init=weight_initializer, dtype=dtype,
-                                      allow_deferred_init=True)
+                                      allow_deferred_init=True, grad_stype=grad_stype)
 
     def hybrid_forward(self, F, x, weight):
         return F.Embedding(x, weight, name='fwd', **self._kwargs)
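
A sketch of the new option; with `sparse_grad=True` the weight gradient is a `row_sparse` NDArray that only materializes the rows actually looked up:

    import mxnet as mx
    from mxnet.gluon import nn

    embed = nn.Embedding(input_dim=1000, output_dim=16, sparse_grad=True)
    embed.initialize()
    with mx.autograd.record():
        out = embed(mx.nd.array([4, 7, 4]))
    out.backward()
    print(embed.weight.grad().stype)   # 'row_sparse'
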
diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py
index 87a62bc8c70..7b4a6be9096 100644
--- a/python/mxnet/gluon/nn/conv_layers.py
+++ b/python/mxnet/gluon/nn/conv_layers.py
@@ -196,7 +196,7 @@ class Conv1D(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout: str, default 'NCW'
-        Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCW' layout for now.
         'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
         respectively. Convolution is applied on the 'W' dimension.
     in_channels : int, default 0
@@ -229,6 +229,7 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, dilation=1,
                  groups=1, layout='NCW', activation=None, use_bias=True,
                  weight_initializer=None, bias_initializer='zeros',
                  in_channels=0, **kwargs):
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)
         assert len(kernel_size) == 1, "kernel_size must be a number or a list of 1 ints"
@@ -271,9 +272,9 @@ class Conv2D(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout : str, default 'NCHW'
-        Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc.
-        'N', 'C', 'H', 'W' stands for batch, channel, height, and width
-        dimensions respectively. Convolution is applied on the 'H' and
+        Dimension ordering of data and weight. Only supports 'NCHW' and 'NHWC'
+        layout for now. 'N', 'C', 'H', 'W' stands for batch, channel, height,
+        and width dimensions respectively. Convolution is applied on the 'H' and
         'W' dimensions.
     in_channels : int, default 0
         The number of input channels to this layer. If not specified,
@@ -293,12 +294,12 @@ class Conv2D(_Conv):
 
     Inputs:
         - **data**: 4D input tensor with shape
-          `(batch_size, in_channels, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, height, width)` when `layout` is `NCHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 4D output tensor with shape
-          `(batch_size, channels, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`.
           out_height and out_width are calculated as::
 
               out_height = floor((height+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1
@@ -308,6 +309,8 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
                  dilation=(1, 1), groups=1, layout='NCHW',
                  activation=None, use_bias=True, weight_initializer=None,
                  bias_initializer='zeros', in_channels=0, **kwargs):
+        assert layout == 'NCHW' or layout == 'NHWC', \
+            "Only supports 'NCHW' and 'NHWC' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)*2
         assert len(kernel_size) == 2, "kernel_size must be a number or a list of 2 ints"
@@ -350,9 +353,9 @@ class Conv3D(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout : str, default 'NCDHW'
-        Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc.
-        'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and
-        depth dimensions respectively. Convolution is applied on the 'D',
+        Dimension ordering of data and weight. Only supports 'NCDHW' and 'NDHWC'
+        layout for now. 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height,
+        width and depth dimensions respectively. Convolution is applied on the 'D',
         'H' and 'W' dimensions.
     in_channels : int, default 0
         The number of input channels to this layer. If not specified,
@@ -372,12 +375,12 @@ class Conv3D(_Conv):
 
     Inputs:
         - **data**: 5D input tensor with shape
-          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCDHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 5D output tensor with shape
-          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCDHW`.
           out_depth, out_height and out_width are calculated as::
 
               out_depth = floor((depth+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1
@@ -388,6 +391,8 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0),
                  dilation=(1, 1, 1), groups=1, layout='NCDHW', activation=None,
                  use_bias=True, weight_initializer=None, bias_initializer='zeros',
                  in_channels=0, **kwargs):
+        assert layout == 'NCDHW' or layout == 'NDHWC', \
+            "Only supports 'NCDHW' and 'NDHWC' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)*3
         assert len(kernel_size) == 3, "kernel_size must be a number or a list of 3 ints"
@@ -431,7 +436,7 @@ class Conv1DTranspose(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout : str, default 'NCW'
-        Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCW' layout for now.
         'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
         respectively. Convolution is applied on the 'W' dimension.
     in_channels : int, default 0
@@ -464,6 +469,7 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, output_padding=0
                  dilation=1, groups=1, layout='NCW', activation=None, use_bias=True,
                  weight_initializer=None, bias_initializer='zeros',
                  in_channels=0, **kwargs):
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)
         if isinstance(output_padding, numeric_types):
@@ -513,9 +519,9 @@ class Conv2DTranspose(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout : str, default 'NCHW'
-        Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc.
-        'N', 'C', 'H', 'W' stands for batch, channel, height, and width
-        dimensions respectively. Convolution is applied on the 'H' and
+        Dimension ordering of data and weight. Only supports 'NCHW' and 'NHWC'
+        layout for now. 'N', 'C', 'H', 'W' stands for batch, channel, height,
+        and width dimensions respectively. Convolution is applied on the 'H' and
         'W' dimensions.
     in_channels : int, default 0
         The number of input channels to this layer. If not specified,
@@ -535,12 +541,12 @@ class Conv2DTranspose(_Conv):
 
     Inputs:
         - **data**: 4D input tensor with shape
-          `(batch_size, in_channels, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, height, width)` when `layout` is `NCHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 4D output tensor with shape
-          `(batch_size, channels, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`.
           out_height and out_width are calculated as::
 
               out_height = (height-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0]
@@ -550,6 +556,8 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
                  output_padding=(0, 0), dilation=(1, 1), groups=1, layout='NCHW',
                  activation=None, use_bias=True, weight_initializer=None,
                  bias_initializer='zeros', in_channels=0, **kwargs):
+        assert layout == 'NCHW' or layout == 'NHWC', \
+            "Only supports 'NCHW' and 'NHWC' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)*2
         if isinstance(output_padding, numeric_types):
@@ -599,10 +607,10 @@ class Conv3DTranspose(_Conv):
         layers side by side, each seeing half the input channels, and producing
         half the output channels, and both subsequently concatenated.
     layout : str, default 'NCDHW'
-        Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc.
-        'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and
-        depth dimensions respectively. Convolution is applied on the 'D',
-        'H', and 'W' dimensions.
+        Dimension ordering of data and weight. Only supports 'NCDHW' and 'NDHWC'
+        layout for now. 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height,
+        width and depth dimensions respectively. Convolution is applied on the 'D',
+        'H' and 'W' dimensions.
     in_channels : int, default 0
         The number of input channels to this layer. If not specified,
         initialization will be deferred to the first time `forward` is called
@@ -621,12 +629,12 @@ class Conv3DTranspose(_Conv):
 
     Inputs:
         - **data**: 5D input tensor with shape
-          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCDHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 5D output tensor with shape
-          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCDHW`.
           out_depth, out_height and out_width are calculated as::
 
             out_depth = (depth-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0]
@@ -637,6 +645,8 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0),
                  output_padding=(0, 0, 0), dilation=(1, 1, 1), groups=1, layout='NCDHW',
                  activation=None, use_bias=True, weight_initializer=None,
                  bias_initializer='zeros', in_channels=0, **kwargs):
+        assert layout == 'NCDHW' or layout == 'NDHWC', \
+            "Only supports 'NCDHW' and 'NDHWC' layout for now"
         if isinstance(kernel_size, numeric_types):
             kernel_size = (kernel_size,)*3
         if isinstance(output_padding, numeric_types):
@@ -694,7 +704,7 @@ class MaxPool1D(_Pooling):
         If padding is non-zero, then the input is implicitly
         zero-padded on both sides for padding number of points.
     layout : str, default 'NCW'
-        Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCW' layout for now.
         'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
         respectively. Pooling is applied on the W dimension.
     ceil_mode : bool, default False
@@ -716,7 +726,7 @@ class MaxPool1D(_Pooling):
     """
     def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
                  ceil_mode=False, **kwargs):
-        assert layout == 'NCW', "Only supports NCW layout for now"
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)
         assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints"
@@ -739,7 +749,7 @@ class MaxPool2D(_Pooling):
         If padding is non-zero, then the input is implicitly
         zero-padded on both sides for padding number of points.
     layout : str, default 'NCHW'
-        Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCHW' layout for now.
         'N', 'C', 'H', 'W' stands for batch, channel, height, and width
         dimensions respectively. padding is applied on 'H' and 'W' dimension.
     ceil_mode : bool, default False
@@ -748,12 +758,12 @@ class MaxPool2D(_Pooling):
 
     Inputs:
         - **data**: 4D input tensor with shape
-          `(batch_size, in_channels, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, height, width)` when `layout` is `NCHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 4D output tensor with shape
-          `(batch_size, channels, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`.
           out_height and out_width are calculated as::
 
               out_height = floor((height+2*padding[0]-pool_size[0])/strides[0])+1
@@ -764,7 +774,7 @@ class MaxPool2D(_Pooling):
     """
     def __init__(self, pool_size=(2, 2), strides=None, padding=0, layout='NCHW',
                  ceil_mode=False, **kwargs):
-        assert layout == 'NCHW', "Only supports NCHW layout for now"
+        assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*2
         assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints"
@@ -787,7 +797,7 @@ class MaxPool3D(_Pooling):
         If padding is non-zero, then the input is implicitly
         zero-padded on both sides for padding number of points.
     layout : str, default 'NCDHW'
-        Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCDHW' layout for now.
         'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and
         depth dimensions respectively. padding is applied on 'D', 'H' and 'W'
         dimension.
@@ -802,7 +812,7 @@ class MaxPool3D(_Pooling):
 
     Outputs:
         - **out**: 5D output tensor with shape
-          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCDHW`.
           out_depth, out_height and out_width are calculated as::
 
               out_depth = floor((depth+2*padding[0]-pool_size[0])/strides[0])+1
@@ -814,7 +824,7 @@ class MaxPool3D(_Pooling):
     """
     def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
                  ceil_mode=False, layout='NCDHW', **kwargs):
-        assert layout == 'NCDHW', "Only supports NCDHW layout for now"
+        assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*3
         assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints"
@@ -836,7 +846,7 @@ class AvgPool1D(_Pooling):
         If padding is non-zero, then the input is implicitly
         zero-padded on both sides for padding number of points.
     layout : str, default 'NCW'
-        Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCW' layout for now.
         'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
         respectively. padding is applied on 'W' dimension.
     ceil_mode : bool, default False
@@ -858,7 +868,7 @@ class AvgPool1D(_Pooling):
     """
     def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
                  ceil_mode=False, **kwargs):
-        assert layout == 'NCW', "Only supports NCW layout for now"
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)
         assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints"
@@ -880,7 +890,7 @@ class AvgPool2D(_Pooling):
         If padding is non-zero, then the input is implicitly
         zero-padded on both sides for padding number of points.
     layout : str, default 'NCHW'
-        Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc.
+        Dimension ordering of data and weight. Only supports 'NCHW' layout for now.
         'N', 'C', 'H', 'W' stands for batch, channel, height, and width
         dimensions respectively. padding is applied on 'H' and 'W' dimension.
     ceil_mode : bool, default False
@@ -889,12 +899,12 @@ class AvgPool2D(_Pooling):
 
     Inputs:
         - **data**: 4D input tensor with shape
-          `(batch_size, in_channels, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, height, width)` when `layout` is `NCHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 4D output tensor with shape
-          `(batch_size, channels, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_height, out_width)` when `layout` is `NCHW`.
           out_height and out_width are calculated as::
 
               out_height = floor((height+2*padding[0]-pool_size[0])/strides[0])+1
@@ -905,7 +915,7 @@ class AvgPool2D(_Pooling):
     """
     def __init__(self, pool_size=(2, 2), strides=None, padding=0,
                  ceil_mode=False, layout='NCHW', **kwargs):
-        assert layout == 'NCHW', "Only supports NCHW layout for now"
+        assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*2
         assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints"
@@ -937,12 +947,12 @@ class AvgPool3D(_Pooling):
 
     Inputs:
         - **data**: 5D input tensor with shape
-          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCW`.
+          `(batch_size, in_channels, depth, height, width)` when `layout` is `NCDHW`.
           For other layouts shape is permuted accordingly.
 
     Outputs:
         - **out**: 5D output tensor with shape
-          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCW`.
+          `(batch_size, channels, out_depth, out_height, out_width)` when `layout` is `NCDHW`.
           out_depth, out_height and out_width are calculated as::
 
               out_depth = floor((depth+2*padding[0]-pool_size[0])/strides[0])+1
@@ -954,7 +964,7 @@ class AvgPool3D(_Pooling):
     """
     def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
                  ceil_mode=False, layout='NCDHW', **kwargs):
-        assert layout == 'NCDHW', "Only supports NCDHW layout for now"
+        assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         if isinstance(pool_size, numeric_types):
             pool_size = (pool_size,)*3
         assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints"
@@ -965,7 +975,7 @@ def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
 class GlobalMaxPool1D(_Pooling):
     """Global max pooling operation for temporal data."""
     def __init__(self, layout='NCW', **kwargs):
-        assert layout == 'NCW', "Only supports NCW layout for now"
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         super(GlobalMaxPool1D, self).__init__(
             (1,), None, 0, True, True, 'max', **kwargs)
 
@@ -973,7 +983,7 @@ def __init__(self, layout='NCW', **kwargs):
 class GlobalMaxPool2D(_Pooling):
     """Global max pooling operation for spatial data."""
     def __init__(self, layout='NCHW', **kwargs):
-        assert layout == 'NCHW', "Only supports NCHW layout for now"
+        assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         super(GlobalMaxPool2D, self).__init__(
             (1, 1), None, 0, True, True, 'max', **kwargs)
 
@@ -981,7 +991,7 @@ def __init__(self, layout='NCHW', **kwargs):
 class GlobalMaxPool3D(_Pooling):
     """Global max pooling operation for 3D data."""
     def __init__(self, layout='NCDHW', **kwargs):
-        assert layout == 'NCDHW', "Only supports NCDHW layout for now"
+        assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         super(GlobalMaxPool3D, self).__init__(
             (1, 1, 1), None, 0, True, True, 'max', **kwargs)
 
@@ -989,7 +999,7 @@ def __init__(self, layout='NCDHW', **kwargs):
 class GlobalAvgPool1D(_Pooling):
     """Global average pooling operation for temporal data."""
     def __init__(self, layout='NCW', **kwargs):
-        assert layout == 'NCW', "Only supports NCW layout for now"
+        assert layout == 'NCW', "Only supports 'NCW' layout for now"
         super(GlobalAvgPool1D, self).__init__(
             (1,), None, 0, True, True, 'avg', **kwargs)
 
@@ -997,7 +1007,7 @@ def __init__(self, layout='NCW', **kwargs):
 class GlobalAvgPool2D(_Pooling):
     """Global average pooling operation for spatial data."""
     def __init__(self, layout='NCHW', **kwargs):
-        assert layout == 'NCHW', "Only supports NCHW layout for now"
+        assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
         super(GlobalAvgPool2D, self).__init__(
             (1, 1), None, 0, True, True, 'avg', **kwargs)
 
@@ -1005,7 +1015,7 @@ def __init__(self, layout='NCHW', **kwargs):
 class GlobalAvgPool3D(_Pooling):
     """Global max pooling operation for 3D data."""
     def __init__(self, layout='NCDHW', **kwargs):
-        assert layout == 'NCDHW', "Only supports NCDHW layout for now"
+        assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
         super(GlobalAvgPool3D, self).__init__(
             (1, 1, 1), None, 0, True, True, 'avg', **kwargs)
 
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 04694dfa545..c7cbcccc95e 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -81,6 +81,8 @@ class Parameter(object):
         Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult.
     init : Initializer, default None
         Initializer of this parameter. Will use the global initializer by default.
+    grad_stype : {'default', 'row_sparse', 'csr'}, default 'default'
+        The storage type of the parameter's gradient.
 
     Attributes
     ----------
@@ -97,7 +99,7 @@ class Parameter(object):
     """
     def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t,
                  lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False,
-                 differentiable=True):
+                 differentiable=True, grad_stype='default'):
         self._var = None
         self._data = None
         self._grad = None
@@ -114,6 +116,11 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t,
         self.wd_mult = wd_mult
         self.grad_req = grad_req
         self.init = init
+        assert grad_stype in ['default', 'row_sparse', 'csr'], \
+            "grad_stype for Parameter '%s' must be one of 'default', 'row_sparse', or 'csr'," \
+            " but got '%s'" % (name, grad_stype)
+        self._grad_stype = grad_stype
 
     def __repr__(self):
         s = 'Parameter {name} (shape={shape}, dtype={dtype})'
@@ -261,7 +268,9 @@ def _init_grad(self):
             self._grad = None
             return
 
-        self._grad = [ndarray.zeros_like(i) for i in self._data]
+        self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context,
+                                    stype=self._grad_stype) for i in self._data]
+
         autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req)
 
     def _reduce(self):
@@ -366,7 +375,7 @@ def set_data(self, data):
         self.shape = data.shape
 
         if self._data is None:
-            assert self._deferred_init is not None, \
+            assert self._deferred_init, \
                 "Parameter '%s' has not been initialized"%self.name
             self._deferred_init = self._deferred_init[:3] + (data,)
             return
@@ -431,7 +440,7 @@ def zero_grad(self):
         if self._grad is None:
             return
         for i in self._grad:
-            i[:] = 0
+            ndarray.zeros_like(i, out=i)
 
     def var(self):
         """Returns a symbol representing this parameter."""
@@ -503,6 +512,17 @@ def __repr__(self):
         s = 'Constant {name} (shape={shape}, dtype={dtype})'
         return s.format(name=self.name, shape=self.shape, dtype=self.dtype)
 
+    @property
+    def grad_req(self):
+        return 'null'
+
+    @grad_req.setter
+    def grad_req(self, req):
+        if req != 'null':
+            warnings.warn('Constant parameter "{}" does not support '
+                          'grad_req other than "null", and new value "{}" '
+                          'is ignored.'.format(self.name, req))
+
 
 class ParameterDict(object):
     """A dictionary managing a set of parameters.
diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py
index 59dd74754ed..056c1d517c0 100644
--- a/python/mxnet/gluon/rnn/rnn_layer.py
+++ b/python/mxnet/gluon/rnn/rnn_layer.py
@@ -23,7 +23,6 @@
 from __future__ import print_function
 __all__ = ['RNN', 'LSTM', 'GRU']
 
-from ...autograd import is_training
 from ... import ndarray
 from .. import Block
 from . import rnn_cell
@@ -79,6 +78,10 @@ def __init__(self, hidden_size, num_layers, layout,
                                     allow_deferred_init=True))
             ni = nh * self._dir
 
+        for param_list in [self.i2h_weight, self.h2h_weight, self.i2h_bias, self.h2h_bias]:
+            for p in param_list:
+                self._reg_params[p.name] = p
+
         self._unfused = self._unfuse()
 
     def __repr__(self):
@@ -187,7 +190,7 @@ def forward(self, inputs, states=None):
                 self.i2h_weight[i].shape = (self._gates*self._hidden_size, inputs.shape[2])
                 self.i2h_weight[i]._finish_deferred_init()
         if inputs.context.device_type == 'gpu' or \
-            (not is_training() and self._mode == 'lstm'):
+           self._mode == 'lstm' and not self._dropout:
             out = self._forward_kernel(inputs, states)
         else:
             out = self._forward(inputs, states)
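
With this change, a CPU LSTM without dropout takes the fused-kernel path even while recording gradients. A small sketch (shapes are arbitrary, TNC layout assumed):

    import mxnet as mx
    from mxnet.gluon import rnn

    layer = rnn.LSTM(hidden_size=16, num_layers=1)
    layer.initialize()
    x = mx.nd.random.uniform(shape=(5, 3, 8))  # (seq_len, batch, input_size)
    with mx.autograd.record():
        out = layer(x)                         # fused kernel on CPU for LSTM
    print(out.shape)                           # (5, 3, 16)
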
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 5ae0e46b7dc..f285b9187e8 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -49,6 +49,9 @@ class Trainer(object):
         on the type of compression being used. For example, 2bit compression requires a threshold.
         Arguments would then be {'type':'2bit', 'threshold':0.5}
         See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
+    update_on_kvstore : bool, default None
+        Whether to perform parameter updates on kvstore. If None, the trainer will choose the
+        more suitable option depending on the type of kvstore.
 
     Properties
     ----------
@@ -57,7 +60,7 @@ class Trainer(object):
         optimizer, its learning rate can be accessed as optimizer.learning_rate.
     """
     def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
-                 compression_params=None):
+                 compression_params=None, update_on_kvstore=None):
         if isinstance(params, (dict, ParameterDict)):
             params = list(params.values())
         if not isinstance(params, (list, tuple)):
@@ -73,11 +76,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
             self._params.append(param)
         self._compression_params = compression_params
         optimizer_params = optimizer_params if optimizer_params else {}
-        self._scale = optimizer_params.get('rescale_grad', 1.0)
+        self._scale = float(optimizer_params.get('rescale_grad', 1.0))
         self._contexts = self._check_contexts()
         self._init_optimizer(optimizer, optimizer_params)
         self._kv_initialized = False
         self._kvstore = kvstore
+        self._update_on_kvstore = update_on_kvstore
 
     def _check_contexts(self):
         contexts = None
@@ -106,14 +110,30 @@ def _init_optimizer(self, optimizer, optimizer_params):
                             for _ in self._contexts]
 
     def _init_kvstore(self):
-        arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params}
+        arg_arrays = {}
+        contains_sparse = False
+        for param in self._params:
+            arg_arrays[param.name] = param.data(self._contexts[0])
+            if param._grad_stype != 'default':
+                contains_sparse = True
+                # update_on_kvstore is set to False by the user
+                if self._update_on_kvstore is False:
+                    raise RuntimeError("Cannot set update_on_kvstore to False when sparse "
+                                       "gradients and/or sparse weights are present for "
+                                       "Parameter %s." % param.name)
         kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts),
                                                      arg_arrays)
+        update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \
+                            else update_on_kvstore
         if kvstore:
             if self._compression_params:
                 kvstore.set_gradient_compression(self._compression_params)
-            if 'dist' in kvstore.type:
-                update_on_kvstore = False
+            # kv.pull(row_sparse_grad) is not supported
+            if contains_sparse:
+                update_on_kvstore = True
+            else:
+                if 'dist' in kvstore.type:
+                    update_on_kvstore = False
             if update_on_kvstore:
                 kvstore.set_optimizer(self._optimizer)
             # optimizer preferably needs to be set before init for multiprecision
@@ -129,7 +149,6 @@ def _init_kvstore(self):
 
         self._kv_initialized = True
 
-
     @property
     def learning_rate(self):
         if not isinstance(self._optimizer, opt.Optimizer):
@@ -138,7 +157,6 @@ def learning_rate(self):
         else:
             return self._optimizer.learning_rate
 
-
     def set_learning_rate(self, lr):
         """Sets a new learning rate of the optimizer.
 
@@ -153,10 +171,73 @@ def set_learning_rate(self, lr):
         else:
             self._optimizer.set_learning_rate(lr)
 
-
     def step(self, batch_size, ignore_stale_grad=False):
         """Makes one step of parameter update. Should be called after
-        `autograd.compute_gradient` and outside of `record()` scope.
+        `autograd.backward()` and outside of `record()` scope.
+
+        For normal parameter updates, `step()` should be used, which internally calls
+        `allreduce_grads()` and then `update()`. However, if you need to get the reduced
+        gradients to perform certain transformations, such as gradient clipping, then
+        you may want to manually call `allreduce_grads()` and `update()` separately.
+
+        Parameters
+        ----------
+        batch_size : int
+            Batch size of data processed. Gradient will be normalized by `1/batch_size`.
+            Set this to 1 if you normalized loss manually with `loss = mean(loss)`.
+        ignore_stale_grad : bool, optional, default=False
+            If true, ignores Parameters with stale gradients (gradients that have not
+            been updated by `backward` after the last step) and skips their update.
+        """
+        if not self._kv_initialized:
+            self._init_kvstore()
+
+        self._optimizer.rescale_grad = self._scale / batch_size
+
+        self._allreduce_grads()
+        self._update(ignore_stale_grad)
+
+    def allreduce_grads(self):
+        """For each parameter, reduce the gradients from different contexts.
+
+        Should be called after `autograd.backward()`, outside of `record()` scope,
+        and before `trainer.update()`.
+
+        For normal parameter updates, `step()` should be used, which internally calls
+        `allreduce_grads()` and then `update()`. However, if you need to get the reduced
+        gradients to perform certain transformations, such as gradient clipping, then
+        you may want to manually call `allreduce_grads()` and `update()` separately.
+        """
+        if not self._kv_initialized:
+            self._init_kvstore()
+        assert not (self._kvstore and self._update_on_kvstore), \
+                'allreduce_grads() when parameters are updated on kvstore ' \
+                'is not supported. Try setting `update_on_kvstore` ' \
+                'to False when creating trainer.'
+
+        self._allreduce_grads()
+
+    def _allreduce_grads(self):
+        if self._kvstore:
+            for i, param in enumerate(self._params):
+                if param.grad_req != 'null':
+                    self._kvstore.push(i, param.list_grad(), priority=-i)
+                    if not self._update_on_kvstore:
+                        self._kvstore.pull(i, param.list_grad(), priority=-i)
+
+    def update(self, batch_size, ignore_stale_grad=False):
+        """Makes one step of parameter update.
+
+        Should be called after `autograd.backward()` and outside of `record()` scope,
+        and after `trainer.allreduce_grads()`.
+
+        For normal parameter updates, `step()` should be used, which internally calls
+        `allreduce_grads()` and then `update()`. However, if you need to get the reduced
+        gradients to perform certain transformations, such as gradient clipping, then
+        you may want to manually call `allreduce_grads()` and `update()` separately.
 
         Parameters
         ----------
@@ -169,12 +250,19 @@ def step(self, batch_size, ignore_stale_grad=False):
         """
         if not self._kv_initialized:
             self._init_kvstore()
+        assert not (self._kvstore and self._update_on_kvstore), \
+                'update() when parameters are updated on kvstore ' \
+                'is not supported. Try setting `update_on_kvstore` ' \
+                'to False when creating trainer.'
 
         self._optimizer.rescale_grad = self._scale / batch_size
+        self._update(ignore_stale_grad)
 
+    def _update(self, ignore_stale_grad=False):
         for i, param in enumerate(self._params):
             if param.grad_req == 'null':
                 continue
+
             if not ignore_stale_grad:
                 for data in param.list_data():
                     if not data._fresh_grad:
@@ -187,13 +275,9 @@ def step(self, batch_size, ignore_stale_grad=False):
                             "warning and skip updating of Parameters with stale gradient" \
                             %(param.name, str(data.context)))
 
-            if self._kvstore:
-                self._kvstore.push(i, param.list_grad(), priority=-i)
-                if self._update_on_kvstore:
-                    self._kvstore.pull(i, param.list_data(), priority=-i)
-                    continue
-                else:
-                    self._kvstore.pull(i, param.list_grad(), priority=-i)
+            if self._kvstore and self._update_on_kvstore:
+                self._kvstore.pull(i, param.list_data(), priority=-i)
+                continue
 
             for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()):
                 if not ignore_stale_grad or arr._fresh_grad:
@@ -233,6 +317,8 @@ def load_states(self, fname):
         if self._update_on_kvstore:
             self._kvstore.load_optimizer_states(fname)
             self._optimizer = self._kvstore._updater.optimizer
+            param_dict = {i: param for i, param in enumerate(self._params)}
+            self._optimizer.param_dict = param_dict
         else:
             with open(fname, 'rb') as f:
                 states = f.read()
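
A minimal sketch of the split `allreduce_grads()`/`update()` flow that this enables, e.g. for gradient clipping (the toy network, shapes and hyper-parameters are illustrative only):

    import mxnet as mx
    from mxnet import autograd, gluon

    net = gluon.nn.Dense(2)
    net.initialize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.1},
                            update_on_kvstore=False)  # required for manual update()

    data, label = mx.nd.random.uniform(shape=(4, 8)), mx.nd.zeros((4, 2))
    with autograd.record():
        loss = gluon.loss.L2Loss()(net(data), label)
    loss.backward()

    trainer.allreduce_grads()                    # reduce gradients across contexts
    for param in net.collect_params().values():  # e.g. clip before updating
        if param.grad_req != 'null':
            param.grad()[:] = mx.nd.clip(param.grad(), -1.0, 1.0)
    trainer.update(batch_size=4)                 # apply the optimizer step
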
diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py
index fbf42f46a04..818aa3d2a3b 100644
--- a/python/mxnet/gluon/utils.py
+++ b/python/mxnet/gluon/utils.py
@@ -24,6 +24,8 @@
 import os
 import hashlib
 import warnings
+import collections
+import weakref
 try:
     import requests
 except ImportError:
@@ -250,3 +252,38 @@ def _brief_print_list(lst, limit=7):
         return _brief_print_list(lst[:limit//2], limit) + ', ..., ' + \
             _brief_print_list(lst[-limit//2:], limit)
     return ', '.join(["'%s'"%str(i) for i in lst])
+
+
+class HookHandle(object):
+    """A handle that can attach/detach a hook."""
+
+    def __init__(self):
+        self._hooks_dict_ref = None
+        self._id = None
+
+    def attach(self, hooks_dict, hook):
+        assert not self._hooks_dict_ref, 'The same handle cannot be attached twice.'
+        self._id = id(hook)
+        hooks_dict[self._id] = hook
+        self._hooks_dict_ref = weakref.ref(hooks_dict)
+
+    def detach(self):
+        hooks_dict = self._hooks_dict_ref()
+        if hooks_dict is not None and self._id in hooks_dict:
+            del hooks_dict[self._id]
+
+    def __getstate__(self):
+        return (self._hooks_dict_ref(), self._id)
+
+    def __setstate__(self, state):
+        if state[0] is None:
+            self._hooks_dict_ref = weakref.ref(collections.OrderedDict())
+        else:
+            self._hooks_dict_ref = weakref.ref(state[0])
+        self._id = state[1]
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        self.detach()
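
`HookHandle` is meant to manage entries in a hooks dictionary owned by a `Block`; a standalone sketch (the hook body and its arguments are invented for the example):

    import collections
    from mxnet.gluon.utils import HookHandle

    hooks = collections.OrderedDict()

    def my_hook(block, inputs):
        print('hook called with', len(inputs), 'inputs')

    handle = HookHandle()
    handle.attach(hooks, my_hook)   # registers my_hook under id(my_hook)
    for hook in hooks.values():
        hook(None, (1, 2))          # fires while attached

    with handle:                    # __exit__ detaches the hook
        pass
    assert len(hooks) == 0
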
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index ace9cb18fff..b4b9cc2f1c0 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -25,6 +25,7 @@
 import random
 import logging
 import json
+import warnings
 import numpy as np
 
 try:
@@ -432,7 +433,7 @@ def color_normalize(src, mean, std=None):
     return src
 
 
-def random_size_crop(src, size, min_area, ratio, interp=2):
+def random_size_crop(src, size, area, ratio, interp=2, **kwargs):
     """Randomly crop src with size. Randomize area and aspect ratio.
 
     Parameters
@@ -441,8 +442,9 @@ def random_size_crop(src, size, min_area, ratio, interp=2):
         Input image
     size : tuple of (int, int)
         Size of the crop formatted as (width, height).
-    min_area : int
-        Minimum area to be maintained after cropping
+    area : float in (0, 1] or tuple of (float, float)
+        If a tuple, the minimum and maximum area to be maintained after cropping.
+        If a float, the minimum area to be maintained after cropping; the maximum area is set to 1.0.
     ratio : tuple of (float, float)
         Aspect ratio range as (min_aspect_ratio, max_aspect_ratio)
     interp: int, optional, default=2
@@ -457,9 +459,18 @@ def random_size_crop(src, size, min_area, ratio, interp=2):
 
     """
     h, w, _ = src.shape
-    area = h * w
+    src_area = h * w
+
+    if 'min_area' in kwargs:
+        warnings.warn('`min_area` is deprecated. Please use `area` instead.',
+                      DeprecationWarning)
+        area = kwargs.pop('min_area')
+    assert not kwargs, "unexpected keyword arguments for `random_size_crop`."
+
+    if isinstance(area, numeric_types):
+        area = (area, 1.0)
     for _ in range(10):
-        target_area = random.uniform(min_area, 1.0) * area
+        target_area = random.uniform(area[0], area[1]) * src_area
         new_ratio = random.uniform(*ratio)
 
         new_w = int(round(np.sqrt(target_area * new_ratio)))
@@ -596,24 +607,31 @@ class RandomSizedCropAug(Augmenter):
     ----------
     size : tuple of (int, int)
         Size of the crop formatted as (width, height).
-    min_area : int
-        Minimum area to be maintained after cropping
+    area : float in (0, 1] or tuple of (float, float)
+        If a tuple, the minimum and maximum area to be maintained after cropping.
+        If a float, the minimum area to be maintained after cropping; the maximum area is set to 1.0.
     ratio : tuple of (float, float)
         Aspect ratio range as (min_aspect_ratio, max_aspect_ratio)
     interp: int, optional, default=2
         Interpolation method. See resize_short for details.
     """
-    def __init__(self, size, min_area, ratio, interp=2):
-        super(RandomSizedCropAug, self).__init__(size=size, min_area=min_area,
+    def __init__(self, size, area, ratio, interp=2, **kwargs):
+        super(RandomSizedCropAug, self).__init__(size=size, area=area,
                                                  ratio=ratio, interp=interp)
         self.size = size
-        self.min_area = min_area
+        if 'min_area' in kwargs:
+            warnings.warn('`min_area` is deprecated. Please use `area` instead.',
+                          DeprecationWarning)
+            self.area = kwargs.pop('min_area')
+        else:
+            self.area = area
         self.ratio = ratio
         self.interp = interp
+        assert not kwargs, "unexpected keyword arguments for `RandomSizedCropAug`."
 
     def __call__(self, src):
         """Augmenter body"""
-        return random_size_crop(src, self.size, self.min_area, self.ratio, self.interp)[0]
+        return random_size_crop(src, self.size, self.area, self.ratio, self.interp)[0]
 
 
 class CenterCropAug(Augmenter):
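
A short sketch of the updated `random_size_crop` signature (the input image is random data purely for illustration):

    import mxnet as mx
    from mxnet.image import random_size_crop

    src = mx.nd.random.uniform(0, 255, shape=(100, 100, 3))

    # New style: area given as a (min, max) tuple.
    out, (x, y, w, h) = random_size_crop(src, size=(64, 64), area=(0.08, 1.0),
                                         ratio=(3.0/4.0, 4.0/3.0))

    # A plain float still means (min_area, 1.0), matching the old behaviour.
    out, _ = random_size_crop(src, size=(64, 64), area=0.08,
                              ratio=(3.0/4.0, 4.0/3.0))
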
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 3220b5a3352..74617f73f94 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -22,6 +22,7 @@
 import platform
 import logging
 
+
 def find_lib_path():
     """Find MXNet dynamic library files.
 
@@ -37,6 +38,8 @@ def find_lib_path():
                 logging.warning("MXNET_LIBRARY_PATH should be an absolute path, instead of: %s",
                                 lib_from_env)
             else:
+                if os.name == 'nt':
+                    os.environ['PATH'] = os.environ['PATH'] + ';' + os.path.dirname(lib_from_env)
                 return [lib_from_env]
         else:
             logging.warning("MXNET_LIBRARY_PATH '%s' doesn't exist", lib_from_env)
@@ -60,7 +63,7 @@ def find_lib_path():
         os.environ['PATH'] = os.path.dirname(__file__) + ';' + os.environ['PATH']
         dll_path = [os.path.join(p, 'libmxnet.dll') for p in dll_path]
     elif platform.system() == 'Darwin':
-        dll_path = [os.path.join(p, 'libmxnet.dylib') for p in dll_path]+ \
+        dll_path = [os.path.join(p, 'libmxnet.dylib') for p in dll_path] + \
                    [os.path.join(p, 'libmxnet.so') for p in dll_path]
     else:
         dll_path.append('../../../')
@@ -69,8 +72,10 @@ def find_lib_path():
     if len(lib_path) == 0:
         raise RuntimeError('Cannot find the MXNet library.\n' +
                            'List of candidates:\n' + str('\n'.join(dll_path)))
+    if os.name == 'nt':
+        os.environ['PATH'] = os.environ['PATH'] + ';' + os.path.dirname(lib_path[0])
     return lib_path
 
 
 # current version
-__version__ = "1.2.0"
+__version__ = "1.3.0"
diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index 76118ccfd2c..aa3ab44c48a 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -421,7 +421,7 @@ def update(self, labels, preds):
             label = label.flat
             pred_label = pred_label.flat
 
-            labels, preds = check_label_shapes(label, pred_label)
+            check_label_shapes(label, pred_label)
 
             self.sum_metric += (pred_label == label).sum()
             self.num_inst += len(pred_label)
@@ -1159,6 +1159,10 @@ def __init__(self, name='loss',
             name, output_names=output_names, label_names=label_names)
 
     def update(self, _, preds):
+        if isinstance(preds, ndarray.ndarray.NDArray):
+            preds = [preds]
+
         for pred in preds:
             self.sum_metric += ndarray.sum(pred).asscalar()
             self.num_inst += pred.size
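
With this change, `Loss.update` also accepts a bare `NDArray`; a quick sketch:

    import mxnet as mx

    loss_metric = mx.metric.Loss()
    loss_metric.update(None, mx.nd.array([0.5, 1.5]))  # no longer needs a list
    print(loss_metric.get())                           # ('loss', 1.0)
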
diff --git a/python/mxnet/name.py b/python/mxnet/name.py
index 966d38280ef..4149d1db273 100644
--- a/python/mxnet/name.py
+++ b/python/mxnet/name.py
@@ -18,13 +18,16 @@
 # coding: utf-8
 """Automatic naming support for symbolic API."""
 from __future__ import absolute_import
+import threading
+import warnings
+from .base import classproperty, with_metaclass, _MXClassPropertyMetaClass
 
-class NameManager(object):
+class NameManager(with_metaclass(_MXClassPropertyMetaClass, object)):
     """NameManager to do automatic naming.
 
     Developers can also inherit from this class to change naming behavior.
     """
-    current = None
+    _current = threading.local()
 
     def __init__(self):
         self._counter = {}
@@ -62,14 +65,30 @@ def get(self, name, hint):
         return name
 
     def __enter__(self):
-        self._old_manager = NameManager.current
-        NameManager.current = self
+        if not hasattr(NameManager._current, "value"):
+            NameManager._current.value = NameManager()
+        self._old_manager = NameManager._current.value
+        NameManager._current.value = self
         return self
 
     def __exit__(self, ptype, value, trace):
         assert self._old_manager
-        NameManager.current = self._old_manager
-
+        NameManager._current.value = self._old_manager
+
+    #pylint: disable=no-self-argument
+    @classproperty
+    def current(cls):
+        warnings.warn("NameManager.current has been deprecated. "
+                      "It is advised to use the `with` statement with NameManager.",
+                      DeprecationWarning)
+        if not hasattr(NameManager._current, "value"):
+            cls._current.value = NameManager()
+        return cls._current.value
+
+    @current.setter
+    def current(cls, val):
+        cls._current.value = val
+    #pylint: enable=no-self-argument
 
 class Prefix(NameManager):
     """A name manager that attaches a prefix to all names.
@@ -92,4 +111,4 @@ def get(self, name, hint):
         return self._prefix + name
 
 # initialize the default name manager
-NameManager.current = NameManager()
+NameManager._current.value = NameManager()
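
`NameManager.current` is now deprecated in favour of the `with` statement, and the manager is thread-local. A short sketch of the recommended style:

    import mxnet as mx

    with mx.name.Prefix('block0_'):
        a = mx.sym.Variable('data')
        b = mx.sym.FullyConnected(data=a, num_hidden=10)
    print(b.name)   # e.g. 'block0_fullyconnected0'
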
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index 6b2ff23fba2..f017d7e65e7 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -37,7 +37,7 @@
 from ..base import c_array, c_array_buf, c_handle_array, mx_real_t
 from ..base import mx_uint, NDArrayHandle, check_call
 from ..base import ctypes2buffer
-from ..context import Context
+from ..context import Context, current_context
 from . import _internal
 from . import op
 from ._internal import NDArrayBase
@@ -682,17 +682,20 @@ def _set_nd_basic_indexing(self, key, value):
         on the values of slices' steps."""
         shape = self.shape
         if isinstance(key, integer_types):
-            sliced_arr = self._at(key)
-            sliced_arr[:] = value
-            return
-        elif isinstance(key, py_slice):
-            if key.step is None or key.step == 1:  # trivial step
-                if key.start is not None or key.stop is not None:
-                    sliced_arr = self._slice(key.start, key.stop)
-                    sliced_arr[:] = value
-                    return
-                # assign value to the whole NDArray
-                # may need to broadcast first
+            if key < 0:
+                key += shape[0]
+            if key < 0 or key >= shape[0]:
+                if key < 0:
+                    key -= shape[0]
+                raise IndexError('index %d is out of bounds for axis 0 with size %d'
+                                 % (key, shape[0]))
+            key = py_slice(key, key+1)  # key must be >= 0 here
+
+        if isinstance(key, py_slice):
+            assign_to_self = key.step is None or key.step == 1
+            assign_to_self &= key.start is None or key.start == 0
+            assign_to_self &= key.stop is None or key.stop == shape[0]
+            if assign_to_self:  # trivial case, assign value to self
                 if isinstance(value, NDArray):
                     if value.handle is not self.handle:
                         if value.shape != shape:
@@ -709,7 +712,7 @@ def _set_nd_basic_indexing(self, key, value):
                     value_nd = self._prepare_value_nd(value, shape)
                     value_nd.copyto(self)
                 return
-            else:  # non-trivial step, use _slice_assign or _slice_assign_scalar
+            else:  # non-trivial case, use _slice_assign or _slice_assign_scalar
                 key = (key,)
 
         assert isinstance(key, tuple), "key=%s must be a tuple of slices and integers" % str(key)
@@ -762,7 +765,8 @@ def _set_nd_advanced_indexing(self, key, value):
         indices = self._get_index_nd(key)
         vshape = _get_oshape_of_gather_nd_op(self.shape, indices.shape)
         value_nd = self._prepare_value_nd(value, vshape)
-        _internal._scatter_set_nd(data=value_nd, indices=indices, shape=self.shape, out=self)
+        _internal._scatter_set_nd(lhs=self, rhs=value_nd, indices=indices,
+                                  shape=self.shape, out=self)
 
     def _get_nd_basic_indexing(self, key):
         """This function is called when key is a slice, or an integer,
@@ -989,6 +993,19 @@ def reshape(self, *shape, **kwargs):
               - input shape = (2,3,4), shape = (-4,1,2,-2), output shape =(1,2,3,4)
               - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
 
+            - If the argument `reverse` is set to 1, then the special values are inferred from right
+              to left.
+
+              Example::
+
+                - without reverse=1, for input shape = (10,5,4), shape = (-1,0), output
+                  shape would be (40,5).
+                - with reverse=1, output shape will be (50,4).
+
+        reverse : bool, default False
+            If true then the special values are inferred from right to left. Only supported as
+            keyword argument.
+
 
         Returns
         -------
@@ -1029,18 +1046,19 @@ def reshape(self, *shape, **kwargs):
         elif not shape:
             shape = kwargs.get('shape')
             assert shape, "Shape must be provided."
-            if len(kwargs) != 1:
-                raise TypeError("Only 'shape' is supported as keyword argument. Got: {}."
-                                .format(', '.join(kwargs.keys())))
-        else:
-            assert not kwargs,\
-                "Specifying both positional and keyword arguments is not allowed in reshape."
+        if not all(k in ['shape', 'reverse'] for k in kwargs):
+            raise TypeError(
+                "Got unknown keywords in reshape: {}. " \
+                "Accepted keyword arguments are 'shape' and 'reverse'.".format(
+                    ', '.join([k for k in kwargs if k not in ['shape', 'reverse']])))
+        reverse = kwargs.get('reverse', False)
         handle = NDArrayHandle()
 
         # Actual reshape
         check_call(_LIB.MXNDArrayReshape64(self.handle,
                                            len(shape),
                                            c_array(ctypes.c_int64, shape),
+                                           reverse,
                                            ctypes.byref(handle)))
         return NDArray(handle=handle, writable=self.writable)
 
@@ -2243,7 +2261,7 @@ def ones(shape, ctx=None, dtype=None, **kwargs):
         The shape of the empty array.
     ctx : Context, optional
         An optional device context.
-        Defaults to the current default context (``mxnet.Context.default_ctx``).
+        Defaults to the current default context (``mxnet.context.current_context()``).
     dtype : str or numpy.dtype, optional
         An optional value type (default is `float32`).
     out : NDArray, optional
@@ -2265,7 +2283,7 @@ def ones(shape, ctx=None, dtype=None, **kwargs):
     """
     # pylint: disable= unused-argument
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     dtype = mx_real_t if dtype is None else dtype
     # pylint: disable= no-member, protected-access
     return _internal._ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs)
@@ -2421,7 +2439,7 @@ def arange(start, stop=None, step=1.0, repeat=1, ctx=None, dtype=mx_real_t):
     array([2, 2, 2, 4, 4, 4], dtype=int32)
     """
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     return _internal._arange(start=start, stop=stop, step=step, repeat=repeat,
                              dtype=dtype, ctx=str(ctx))
 # pylint: enable= no-member, protected-access, too-many-arguments
@@ -3648,7 +3666,7 @@ def zeros(shape, ctx=None, dtype=None, **kwargs):
     """
     # pylint: disable= unused-argument
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     dtype = mx_real_t if dtype is None else dtype
     # pylint: disable= no-member, protected-access
     return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs)
@@ -3656,6 +3674,7 @@ def zeros(shape, ctx=None, dtype=None, **kwargs):
 
 def eye(N, M=0, k=0, ctx=None, dtype=None, **kwargs):
     """Return a 2-D array with ones on the diagonal and zeros elsewhere.
+
     Parameters
     ----------
     N: int
@@ -3670,10 +3689,12 @@ def eye(N, M=0, k=0, ctx=None, dtype=None, **kwargs):
         An optional device context (default is the current default context)
     dtype: str or numpy.dtype, optional
         An optional value type (default is `float32`)
+
     Returns
     -------
     NDArray
         A created array
+
     Examples
     --------
     >>> mx.nd.eye(2)
@@ -3687,7 +3708,7 @@ def eye(N, M=0, k=0, ctx=None, dtype=None, **kwargs):
     """
     # pylint: disable= unused-argument
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     dtype = mx_real_t if dtype is None else dtype
     # pylint: disable= no-member, protected-access
     return _internal._eye(N=N, M=M, k=k, ctx=ctx, dtype=dtype, **kwargs)
@@ -3715,7 +3736,7 @@ def empty(shape, ctx=None, dtype=None):
     if isinstance(shape, int):
         shape = (shape, )
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     if dtype is None:
         dtype = mx_real_t
     return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))
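
Two of the NDArray changes above in a quick sketch: bounds-checked negative integer assignment, and the new `reverse` keyword for `reshape`:

    import mxnet as mx

    x = mx.nd.zeros((10, 5, 4))
    x[-1] = 1.0                    # same as x[9] = 1.0; out of range now raises IndexError

    # Special values inferred left to right by default...
    print(x.reshape(-1, 0).shape)                         # (40, 5)
    # ...or right to left with reverse=True.
    print(x.reshape(shape=(-1, 0), reverse=True).shape)   # (50, 4)
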
diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index c7355c2e46d..9c02b8e2cf2 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -42,7 +42,7 @@
 from ..base import _LIB, numeric_types
 from ..base import c_array_buf, mx_real_t, integer_types
 from ..base import mx_uint, NDArrayHandle, check_call
-from ..context import Context
+from ..context import Context, current_context
 from . import _internal
 from . import op
 try:
@@ -977,7 +977,7 @@ def _csr_matrix_from_definition(data, indices, indptr, shape=None, ctx=None,
     # pylint: disable= no-member, protected-access
     storage_type = 'csr'
     # context
-    ctx = Context.default_ctx if ctx is None else ctx
+    ctx = current_context() if ctx is None else ctx
     # types
     dtype = _prepare_default_dtype(data, dtype)
     indptr_type = _STORAGE_AUX_TYPES[storage_type][0] if indptr_type is None else indptr_type
@@ -1140,7 +1140,7 @@ def _row_sparse_ndarray_from_definition(data, indices, shape=None, ctx=None,
     """Create a `RowSparseNDArray` based on data and indices"""
     storage_type = 'row_sparse'
     # context
-    ctx = Context.default_ctx if ctx is None else ctx
+    ctx = current_context() if ctx is None else ctx
     # types
     dtype = _prepare_default_dtype(data, dtype)
     indices_type = _STORAGE_AUX_TYPES[storage_type][0] if indices_type is None else indices_type
@@ -1529,7 +1529,7 @@ def zeros(stype, shape, ctx=None, dtype=None, **kwargs):
     if stype == 'default':
         return _zeros_ndarray(shape, ctx=ctx, dtype=dtype, **kwargs)
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     dtype = mx_real_t if dtype is None else dtype
     if stype == 'row_sparse' or stype == 'csr':
         aux_types = _STORAGE_AUX_TYPES[stype]
@@ -1562,7 +1562,7 @@ def empty(stype, shape, ctx=None, dtype=None):
     if isinstance(shape, int):
         shape = (shape, )
     if ctx is None:
-        ctx = Context.default_ctx
+        ctx = current_context()
     if dtype is None:
         dtype = mx_real_t
     assert(stype is not None)
@@ -1603,7 +1603,7 @@ def array(source_array, ctx=None, dtype=None):
     >>> mx.nd.sparse.array(mx.nd.sparse.zeros('row_sparse', (3, 2)))
     <RowSparseNDArray 3x2 @cpu(0)>
     """
-    ctx = Context.default_ctx if ctx is None else ctx
+    ctx = current_context() if ctx is None else ctx
     if isinstance(source_array, NDArray):
         assert(source_array.stype != 'default'), \
                "Please use `tostype` to create RowSparseNDArray or CSRNDArray from an NDArray"
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 1d2fd2e73df..0c3fc904fb1 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -426,6 +426,17 @@ def _get_wd(self, index):
             wd *= self.wd_mult.get(self.idx2name[index], 1.0)
         return wd
 
+    def __getstate__(self):
+        ret = self.__dict__.copy()
+        # do not include param_dict in the state
+        del ret['param_dict']
+        return ret
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        # param_dict needs to be explicitly set by the trainer
+        self.param_dict = {}
+
 # convenience wrapper for Optimizer.Register
 register = Optimizer.register   # pylint: disable=invalid-name
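
A sketch of the new pickling behaviour (the placeholder entry in `param_dict` is only for illustration):

    import pickle
    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1)
    opt.param_dict = {0: 'placeholder'}   # normally populated by the Trainer

    restored = pickle.loads(pickle.dumps(opt))
    print(restored.param_dict)            # {} -- the trainer must re-set it
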
 
diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py
index 6f9e868e232..3e81dcf3a6c 100644
--- a/python/mxnet/symbol/register.py
+++ b/python/mxnet/symbol/register.py
@@ -113,9 +113,9 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name))
             dtype_name, dtype_name, dtype_name))
             code.append("""
     attr = kwargs.pop('attr', None)
-    kwargs.update(AttrScope.current.get(attr))
+    kwargs.update(AttrScope._current.value.get(attr))
     name = kwargs.pop('name', None)
-    name = NameManager.current.get(name, '%s')
+    name = NameManager._current.value.get(name, '%s')
     _ = kwargs.pop('out', None)
     keys = []
     vals = []
@@ -141,7 +141,7 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name))
 def %s(%s):"""%(func_name, ', '.join(signature)))
         if not signature_only:
             code.append("""
-    kwargs.update(AttrScope.current.get(attr))
+    kwargs.update(AttrScope._current.value.get(attr))
     sym_kwargs = dict()
     _keys = []
     _vals = []
@@ -172,7 +172,7 @@ def %s(%s):"""%(func_name, ', '.join(signature)))
         _vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name))
 
             code.append("""
-    name = NameManager.current.get(name, '%s')
+    name = NameManager._current.value.get(name, '%s')
     return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name)"""%(
         func_name.lower(), handle.value))
 
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 1ab7cf87bf5..fc1a71c203b 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -37,7 +37,7 @@
 from ..base import mx_uint, py_str, string_types
 from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle
 from ..base import check_call, MXNetError, NotImplementedForSymbol
-from ..context import Context
+from ..context import Context, current_context
 from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP
 from ..ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
 from ..ndarray import _ndarray_cls
@@ -1259,9 +1259,12 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing):
             if len(args) != len(arg_names):
                 raise ValueError('Length of %s does not match the number of arguments' % arg_key)
             for narr in args:
-                if not isinstance(narr, NDArray):
+                if narr is None and allow_missing:
+                    arg_handles.append(None)
+                elif not isinstance(narr, NDArray):
                     raise TypeError('Only accept list of NDArrays or dict of str to NDArray')
-                arg_handles.append(narr.handle)
+                else:
+                    arg_handles.append(narr.handle)
             arg_arrays = args
         elif isinstance(args, dict):
             for name in arg_names:
@@ -1767,7 +1770,7 @@ def eval(self, ctx=None, **kwargs):
         the result will be a list with one element.
         """
         if ctx is None:
-            ctx = Context.default_ctx
+            ctx = current_context()
         return self.bind(ctx, kwargs).forward()
 
     def reshape(self, *args, **kwargs):
@@ -2448,7 +2451,7 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
     handle = SymbolHandle()
     check_call(_LIB.MXSymbolCreateVariable(c_str(name), ctypes.byref(handle)))
     ret = Symbol(handle)
-    attr = AttrScope.current.get(attr)
+    attr = AttrScope._current.value.get(attr)
     attr = {} if attr is None else attr
     if shape is not None:
         attr['__shape__'] = str(shape)
@@ -2746,8 +2749,8 @@ def hypot(left, right):
         raise TypeError('types (%s, %s) not supported' % (str(type(left)), str(type(right))))
 
 def eye(N, M=0, k=0, dtype=None, **kwargs):
-    """Returns a new symbol of 2-D shpae, filled with ones on the diagonal
-       and zeros elsewhere.
+    """Returns a new symbol of 2-D shpae, filled with ones on the diagonal and zeros elsewhere.
+
     Parameters
     ----------
     N: int
@@ -2760,6 +2763,7 @@ def eye(N, M=0, k=0, dtype=None, **kwargs):
         and a negative value to a lower diagonal.
     dtype : str or numpy.dtype, optional
         The value type of the inner value, default to ``np.float32``.
+
     Returns
     -------
     out : Symbol
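
`Symbol.eval` now falls back to the scoped current context; a quick sketch:

    import mxnet as mx

    x = mx.sym.Variable('x')
    y = x * 2
    with mx.cpu(0):
        out = y.eval(x=mx.nd.ones((2, 2)))
    print(out[0])
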
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index aa388c14ea1..bcdcc9c6408 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -44,7 +44,7 @@
     # in rare cases requests may be not installed
     pass
 import mxnet as mx
-from .context import Context
+from .context import Context, current_context
 from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
 from .ndarray import array
 from .symbol import Symbol
@@ -54,12 +54,12 @@ def default_context():
     """Get default context for regression test."""
     # _TODO: get context from environment variable to support
     # testing with GPUs
-    return Context.default_ctx
+    return current_context()
 
 
 def set_default_context(ctx):
     """Set default context."""
-    Context.default_ctx = ctx
+    Context._default_ctx.value = ctx
 
 
 def default_dtype():
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
index aeabd4feec8..45e69ee706e 100644
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
index a9bb6e514c1..aca4db0a548 100644
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
index b06a7c6cacf..c8c21eba7b3 100644
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jnilib</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index bc8a5c03141..83d6c732a4b 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 63cebb721f5..361bfab5d61 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -71,13 +71,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
   </dependencies>
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala b/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
index 7a9c1a76e6f..d9c767cb1fa 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
@@ -19,9 +19,10 @@ package org.apache.mxnet
 
 import org.apache.mxnet.Base._
 import org.apache.mxnet.DType.DType
-import org.apache.mxnet.io.{MXDataPack, MXDataIter}
+import org.apache.mxnet.io.{MXDataIter, MXDataPack}
 import org.slf4j.LoggerFactory
 
+import scala.annotation.varargs
 import scala.collection.immutable.ListMap
 import scala.collection.mutable.ListBuffer
 
@@ -160,6 +161,108 @@ class DataBatch(val data: IndexedSeq[NDArray],
   def provideLabel: ListMap[String, Shape] = providedLabel
 }
 
+object DataBatch {
+  /**
+   * Builder class for DataBatch.
+   */
+  class Builder() {
+    private var data: IndexedSeq[NDArray] = null
+    private var label: IndexedSeq[NDArray] = null
+    private var index: IndexedSeq[Long] = null
+    private var pad: Int = 0
+    private var bucketKey: AnyRef = null
+    private var dataShapes: ListMap[String, Shape] = null
+    private var labelShapes: ListMap[String, Shape] = null
+
+    /**
+     * Set the input data.
+     * @param data a list of data.
+     * @return this.
+     */
+    @varargs def setData(data: NDArray*): Builder = {
+      this.data = data.toIndexedSeq
+      this
+    }
+
+    /**
+     * Set the labels in the same order of data.
+     * @param label a list of labels.
+     * @return this.
+     */
+    @varargs def setLabel(label: NDArray*): Builder = {
+      this.label = label.toIndexedSeq
+      this
+    }
+
+    /**
+     * Set the example indices in this batch.
+     * @param index indices in the same order of data.
+     * @return this.
+     */
+    @varargs def setIndex(index: Long*): Builder = {
+      this.index = index.toIndexedSeq
+      this
+    }
+
+    /**
+     * Set the pad.
+     * @param pad The number of examples padded at the end of a batch. It is used when the
+     *            total number of examples read is not divisible by the `batch_size`.
+     *            These extra padded examples are ignored in prediction.
+     * @return this
+     */
+    def setPad(pad: Int): Builder = {
+      this.pad = pad
+      this
+    }
+
+    /**
+     * Set the bucket key, used for bucketing module.
+     * @param bucketKey the bucket key related to this batch.
+     * @return this.
+     */
+    def setBucketKey(bucketKey: AnyRef): Builder = {
+      this.bucketKey = bucketKey
+      this
+    }
+
+    /**
+     * Provide the shape of a data.
+     * @param name data name.
+     * @param shape data shape.
+     * @return this.
+     */
+    def provideDataShape(name: String, shape: Shape): Builder = {
+      if (dataShapes == null) {
+        dataShapes = ListMap((name, shape))
+      } else {
+        dataShapes = dataShapes.updated(name, shape)
+      }
+      this
+    }
+
+    /**
+     * Provide the shape of a label.
+     * @param name label name.
+     * @param shape label shape.
+     * @return this.
+     */
+    def provideLabelShape(name: String, shape: Shape): Builder = {
+      if (labelShapes == null) {
+        labelShapes = ListMap((name, shape))
+      } else {
+        labelShapes = labelShapes.updated(name, shape)
+      }
+      this
+    }
+
+    def build(): DataBatch = {
+      require(data != null, "data is required.")
+      new DataBatch(data, label, index, pad, bucketKey, dataShapes, labelShapes)
+    }
+  }
+}
+
 /**
  * DataIter object in mxnet.
  */
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
index 416f2d74e82..469107aa58c 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
@@ -37,6 +37,8 @@ object NDArray {
 
   private val functions: Map[String, NDArrayFunction] = initNDArrayModule()
 
+  val api = NDArrayAPI
+
   private def addDependency(froms: Array[NDArray], tos: Array[NDArray]): Unit = {
     froms.foreach { from =>
       val weakRef = new WeakReference(from)
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
new file mode 100644
index 00000000000..d234ac66bdd
--- /dev/null
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mxnet
+@AddNDArrayAPIs(false)
+/**
+  * Typesafe NDArray API: NDArray.api._
+  * The main code is generated at compile time through macros.
+  */
+object NDArrayAPI {
+}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Shape.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Shape.scala
index e632ade808e..68917621772 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Shape.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Shape.scala
@@ -17,6 +17,8 @@
 
 package org.apache.mxnet
 
+import scala.annotation.varargs
+
 /**
  * Shape of [[NDArray]] or other data
  */
@@ -28,6 +30,7 @@ class Shape(dims: Traversable[Int]) extends Serializable {
   }
 
   def apply(dim: Int): Int = shape(dim)
+  def get(dim: Int): Int = apply(dim)
   def size: Int = shape.size
   def length: Int = shape.length
   def drop(dim: Int): Shape = new Shape(shape.drop(dim))
@@ -56,4 +59,5 @@ class Shape(dims: Traversable[Int]) extends Serializable {
 object Shape {
   def apply(dims: Int *): Shape = new Shape(dims: _*)
   def apply(dims: Traversable[Int]): Shape = new Shape(dims)
+  @varargs def create(dims: Int*): Shape = new Shape(dims)
 }
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
index 13f85a731dc..a17fe57dde6 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
@@ -101,7 +101,6 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends WarnIfNotD
     var index: Int = -1
     for ((output, i) <- listOutputs().view.zipWithIndex) {
       if (output == name) {
-        require(index == -1, s"There are multiple outputs with name $name")
         index = i
       }
     }
@@ -830,6 +829,8 @@ object Symbol {
   private val functions: Map[String, SymbolFunction] = initSymbolModule()
   private val bindReqMap = Map("null" -> 0, "write" -> 1, "add" -> 3)
 
+  val api = SymbolAPI
+
   def pow(sym1: Symbol, sym2: Symbol): Symbol = {
     Symbol.createFromListedSymbols("_Power")(Array(sym1, sym2))
   }
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
new file mode 100644
index 00000000000..49de9ae7321
--- /dev/null
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mxnet
+
+
+@AddSymbolAPIs(false)
+/**
+  * typesafe Symbol API: Symbol.api._
+  * The main code is generated at compile time through macros
+  */
+object SymbolAPI {
+}
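
Likewise, a rough sketch of what the SymbolMacro.typeSafeAPIDefs macro (later
in this diff) expands one operator into here. Activation and its parameters
are illustrative; every generated method also takes trailing name/attr
parameters and routes all inputs through the kwargs map:

    // inside package org.apache.mxnet; an illustrative expansion, not real
    // generated output
    def Activation(data: Option[Symbol] = None, act_type: String,
                   name: String = null, attr: Map[String, String] = null): Symbol = {
      val map = scala.collection.mutable.Map[String, Any]()
      if (!data.isEmpty) map("data") = data.get
      map("act_type") = act_type
      Symbol.createSymbolGeneral("Activation", name, attr, Seq(), map.toMap)
    }
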
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
index 108cff44965..60b80f25285 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
@@ -23,6 +23,8 @@ import org.apache.mxnet.optimizer.SGD
 import org.apache.mxnet._
 import org.slf4j.LoggerFactory
 import org.slf4j.Logger
+
+import scala.annotation.varargs
 import scala.collection.mutable.ArrayBuffer
 
 object BaseModule {
@@ -468,6 +470,15 @@ abstract class BaseModule {
    */
   def forward(dataBatch: DataBatch, isTrain: Option[Boolean] = None): Unit
 
+  /**
+   * Forward computation.
+   * @param dataBatch a batch of data.
+   * @param isTrain Whether it is for training or not.
+   */
+  def forward(dataBatch: DataBatch, isTrain: Boolean): Unit = {
+    forward(dataBatch, Option(isTrain))
+  }
+
   /**
    * Backward computation.
    * @param outGrads Gradient on the outputs to be propagated back.
@@ -549,6 +560,25 @@ abstract class BaseModule {
            forceRebind: Boolean = false, sharedModule: Option[BaseModule] = None,
            gradReq: String = "write"): Unit
 
+
+  /**
+   * Bind the symbols to construct executors.
+   * This is necessary before one can perform computation with the module.
+   * @param forTraining Default is `True`. Whether the executors should be bound for training.
+   * @param inputsNeedGrad Default is `False`.
+   *                       Whether the gradients to the input data need to be computed.
+   *                       Typically this is not needed,
+   *                       but it might be when implementing composition of modules.
+   * @param forceRebind Default is `False`. This function does nothing
+   *                    if the executors are already bound, but with this set to `True`
+   *                    the executors will be forced to rebind.
+   * @param dataShape Typically `DataIter.provideData`.
+   */
+  @varargs def bind(forTraining: Boolean, inputsNeedGrad: Boolean,
+                    forceRebind: Boolean, dataShape: DataDesc*): Unit = {
+    bind(dataShape.toVector, None, forTraining, inputsNeedGrad, forceRebind, None)
+  }
+
   // Install and initialize optimizers.
   def initOptimizer(kvstore: String = "local", optimizer: Optimizer = new SGD(),
                     resetOptimizer: Boolean = true, forceInit: Boolean = false): Unit
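
A hedged usage sketch of the two Java-friendly overloads added above; mod and
batch stand for any bound module and DataBatch, and the DataDesc values are
illustrative:

    mod.bind(true, false, false,                  // forTraining, inputsNeedGrad, forceRebind
      DataDesc("data", Shape(32, 3, 224, 224)))   // varargs DataDesc*
    mod.forward(batch, true)                      // plain Boolean instead of Option[Boolean]
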
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala
index ac3d645b333..d55a42653ce 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala
@@ -17,13 +17,16 @@
 
 package org.apache.mxnet.module
 
-import java.io.{FileInputStream, BufferedInputStream, BufferedOutputStream, FileOutputStream}
+import java.io.{BufferedInputStream, BufferedOutputStream, FileInputStream, FileOutputStream}
+
 import org.apache.mxnet.DType.DType
 import org.apache.mxnet._
 import org.apache.mxnet.module.DataParallelExecutorGroup.Builder
 import org.apache.mxnet.optimizer.SGD
 import org.slf4j.LoggerFactory
 
+import scala.annotation.varargs
+
 /**
  * Module is a basic module that wrap a `Symbol`. It is functionally the same
  * as the `FeedForward` model, except under the module API.
@@ -642,4 +645,72 @@ object Module {
     }
     mod
   }
+
+  /**
+   * Builder class for Module.
+   * @param modelDef model definition in Symbol.
+   */
+  class Builder(private val modelDef: Symbol) {
+    private var dataNames: IndexedSeq[String] = IndexedSeq("data")
+    private var labelNames: IndexedSeq[String] = IndexedSeq("softmax_label")
+    private var contexts: Array[Context] = Array(Context.cpu())
+    private var workLoadList: IndexedSeq[Float] = _
+    private var fixedParamNames: Set[String] = _
+
+    /**
+     * Set the context for execution.
+     * @param ctx a list of contexts.
+     * @return this.
+     */
+    @varargs def setContext(ctx: Context*): Builder = {
+      contexts = ctx.toArray
+      this
+    }
+
+    /**
+     * Set the input data names.
+     * @param name a list of data names. Cannot be null.
+     * @return this.
+     */
+    @varargs def setDataNames(name: String*): Builder = {
+      dataNames = name.toVector
+      this
+    }
+
+    /**
+     * Set the label names.
+     * @param name a list of label names.
+     *             Set to null if no label is required.
+     * @return this.
+     */
+    @varargs def setLabelNames(name: String*): Builder = {
+      labelNames = if (name == null) IndexedSeq.empty[String] else name.toVector
+      this
+    }
+
+    /**
+     * Set the workloads.
+     * @param workloads a list of workloads
+     * @return this.
+     */
+    @varargs def setWorkLoadList(workloads: Float*): Builder = {
+      workLoadList = workloads.toVector
+      this
+    }
+
+    /**
+     * Specify the parameters that need to be fixed.
+     * @param name a list of parameter names.
+     * @return this.
+     */
+    @varargs def setFixedParamNames(name: String*): Builder = {
+      fixedParamNames = name.toSet
+      this
+    }
+
+    def build(): Module = {
+      new Module(modelDef, dataNames, labelNames, contexts,
+        Option(workLoadList), Option(fixedParamNames))
+    }
+  }
 }
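
A usage sketch of the new builder, mirroring the updated ModuleSuite test
later in this diff (net stands for any Symbol):

    val mod = new Module.Builder(net)
      .setDataNames("data")
      .setLabelNames("softmax_label")
      .setContext(Context.cpu(0))
      .build()
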
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
index a9cac131dd2..22b9c3bdaf3 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
@@ -18,7 +18,6 @@
 package org.apache.mxnet
 
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.apache.mxnet.CheckUtils._
 import org.apache.mxnet.module._
 import org.apache.mxnet.optimizer._
 import org.apache.mxnet.io._
@@ -52,8 +51,11 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll {
     import SymbolConversions._
     c = a + 2 * b + 3 * c
 
-    val mod = new Module(c, IndexedSeq("b", "c", "a"), null,
-      contexts = Array(Context.cpu(0), Context.cpu(1)))
+    val mod = new Module.Builder(c)
+      .setDataNames("b", "c", "a")
+      .setLabelNames(null)
+      .setContext(Context.cpu(0), Context.cpu(1))
+      .build()
     mod.bind(dataShapes = IndexedSeq(
       DataDesc("b", Shape(5, 5), layout = "NT"),
       DataDesc("c", Shape(5, 5), layout = "NT"),
@@ -342,11 +344,13 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll {
     dShape1 = Shape(20, 3, 120, 120)
     dShape2 = Shape(20, 3, 32, 64)
     lShape = Shape(20)
-    dataBatch = new DataBatch(
-      data = IndexedSeq(
+    dataBatch = new DataBatch.Builder()
+      .setData(
         NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()),
-      label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0)
+        NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))())
+      .setLabel(NDArray.ones(lShape))
+      .setPad(0)
+      .build()
     mod.forward(dataBatch)
     assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
     mod.backward()
@@ -355,11 +359,13 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll {
     dShape1 = Shape(5, 3, 28, 40)
     dShape2 = Shape(5, 3, 24, 16)
     lShape = Shape(5)
-    dataBatch = new DataBatch(
-      data = IndexedSeq(
+    dataBatch = new DataBatch.Builder()
+      .setData(
         NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 15, "high" -> 25, "shape" -> dShape2.toString()))()),
-      label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0)
+        NDArray.random_uniform(Map("low" -> 15, "high" -> 25, "shape" -> dShape2.toString()))())
+      .setLabel(NDArray.ones(lShape))
+      .setPad(0)
+      .build()
     mod.forward(dataBatch)
     assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
     mod.backward()
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 0b8aaa36cc0..4c7a0ed2e12 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -145,13 +145,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala
index d1ec88d67c6..e9171bd47c2 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala
@@ -30,40 +30,40 @@ object TrainMnist {
   // multi-layer perceptron
   def getMlp: Symbol = {
     val data = Symbol.Variable("data")
-    val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128))
-    val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-    val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64))
-    val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-    val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10))
-    val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc3))
+
+    val fc1 = Symbol.api.FullyConnected(data = Some(data), num_hidden = 128, name = "fc1")
+    val act1 = Symbol.api.Activation(data = Some(fc1), act_type = "relu", name = "relu1")
+    val fc2 = Symbol.api.FullyConnected(Some(act1), None, None, 64, name = "fc2")
+    val act2 = Symbol.api.Activation(data = Some(fc2), act_type = "relu", name = "relu2")
+    val fc3 = Symbol.api.FullyConnected(Some(act2), None, None, 10, name = "fc3")
+    val mlp = Symbol.api.SoftmaxOutput(name = "softmax", data = Some(fc3))
     mlp
   }
 
   // LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
   // Haffner. "Gradient-based learning applied to document recognition."
   // Proceedings of the IEEE (1998)
+
   def getLenet: Symbol = {
     val data = Symbol.Variable("data")
     // first conv
-    val conv1 = Symbol.Convolution()()(
-      Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20))
-    val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh"))
-    val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max",
-                                       "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
+    val conv1 = Symbol.api.Convolution(data = Some(data), kernel = Shape(5, 5), num_filter = 20)
+    val tanh1 = Symbol.api.tanh(data = Some(conv1))
+    val pool1 = Symbol.api.Pooling(data = Some(tanh1), pool_type = Some("max"),
+      kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)))
     // second conv
-    val conv2 = Symbol.Convolution()()(
-      Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50))
-    val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh"))
-    val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max",
-                                       "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
+    val conv2 = Symbol.api.Convolution(data = Some(pool1), kernel = Shape(5, 5), num_filter = 50)
+    val tanh2 = Symbol.api.tanh(data = Some(conv2))
+    val pool2 = Symbol.api.Pooling(data = Some(tanh2), pool_type = Some("max"),
+      kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)))
     // first fullc
-    val flatten = Symbol.Flatten()()(Map("data" -> pool2))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500))
-    val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh"))
+    val flatten = Symbol.api.Flatten(data = Some(pool2))
+    val fc1 = Symbol.api.FullyConnected(data = Some(flatten), num_hidden = 500)
+    val tanh3 = Symbol.api.tanh(data = Some(fc1))
     // second fullc
-    val fc2 = Symbol.FullyConnected()()(Map("data" -> tanh3, "num_hidden" -> 10))
+    val fc2 = Symbol.api.FullyConnected(data = Some(tanh3), num_hidden = 10)
     // loss
-    val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2))
+    val lenet = Symbol.api.SoftmaxOutput(name = "softmax", data = Some(fc2))
     lenet
   }
 
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala
index 0ee0c119e43..47b4c100ea5 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala
@@ -31,9 +31,9 @@ import scala.collection.mutable.ListBuffer
 /**
   * <p>
   * Example inference showing usage of the Infer package on a resnet-152 model.
-  * @see <a href="https://github.com/apache/incubator-mxnet\
-  * blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/inferexample\
-  * imageclassifier/" target="_blank">Instructions to run this example</a>
+  * @see <a href="https://github.com/apache/incubator-mxnet/tree/master/\
+  * scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/\
+  * imageclassifier" target="_blank">Instructions to run this example</a>
   */
 object ImageClassifierExample {
 
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector/SSDClassifierExample.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector/SSDClassifierExample.scala
index f4f7f589789..f55a60f0144 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector/SSDClassifierExample.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector/SSDClassifierExample.scala
@@ -33,9 +33,9 @@ import scala.collection.mutable.ListBuffer
   * <p>
   * Example single shot detector (SSD) using the Infer package
   * on a ssd_resnet50_512 model.
-  * @see <a href="https://github.com/apache/incubator-mxnet\
-  * blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/inferexample\
-  * objectdetector/" target="_blank">Instructions to run this example</a>
+  * @see <a href="https://github.com/apache/incubator-mxnet/tree/master/\
+  * scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/\
+  * objectdetector" target="_blank">Instructions to run this example</a>
   */
 class SSDClassifierExample {
   @Option(name = "--model-path-prefix", usage = "the input model directory and prefix of the model")
diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml
index f047f89006e..13d3cc1387e 100644
--- a/scala-package/infer/pom.xml
+++ b/scala-package/infer/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <artifactId>mxnet-parent_2.11</artifactId>
         <groupId>org.apache.mxnet</groupId>
-        <version>1.2.0-SNAPSHOT</version>
+        <version>1.3.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
@@ -71,7 +71,7 @@
         <dependency>
             <groupId>org.apache.mxnet</groupId>
             <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-            <version>1.2.0-SNAPSHOT</version>
+            <version>1.3.0-SNAPSHOT</version>
             <scope>provided</scope>
         </dependency>
         <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
index af1544d1d7d..2ddeaba7acb 100644
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ b/scala-package/init-native/linux-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index e3146c1094f..120854986af 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index 4b566260252..8ac369d1d71 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index 8238929a429..ef1d67b5dda 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
 <!--  <relativePath>../pom.xml</relativePath>-->
   </parent>
 
diff --git a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
index 7af2e052255..7402dbd3bc1 100644
--- a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
+++ b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
@@ -37,7 +37,12 @@ object Base {
 
   @throws(classOf[UnsatisfiedLinkError])
   private def tryLoadInitLibrary(): Unit = {
-    val baseDir = System.getProperty("user.dir") + "/init-native"
+    var baseDir = System.getProperty("user.dir") + "/init-native"
+    // TODO(lanking520): Update this to use a relative path to the MXNet directory.
+    // TODO(lanking520): baseDir = sys.env("MXNET_BASEDIR") + "/scala-package/init-native"
+    if (System.getenv().containsKey("MXNET_BASEDIR")) {
+      baseDir = sys.env("MXNET_BASEDIR")
+    }
     val os = System.getProperty("os.name")
     // ref: http://lopica.sourceforge.net/os.html
     if (os.startsWith("Linux")) {
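
Once the TODOs above are resolved, the lookup could collapse to a single
getOrElse; an equivalent one-liner, shown only as an illustrative alternative:

    val baseDir = sys.env.getOrElse("MXNET_BASEDIR",
      System.getProperty("user.dir") + "/init-native")
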
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index 0aa3030e7ce..73d90541ba1 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -41,15 +41,53 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
       <type>${libtype}</type>
     </dependency>
   </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <exclude>META-INF/*.SF</exclude>
+            <exclude>META-INF/*.DSA</exclude>
+            <exclude>META-INF/*.RSA</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <configuration>
+          <environmentVariables>
+            <MXNET_BASEDIR>${project.parent.basedir}/init-native</MXNET_BASEDIR>
+          </environmentVariables>
+          <argLine>
+            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
+            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
+          </argLine>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.scalastyle</groupId>
+        <artifactId>scalastyle-maven-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
index 036b9ec4753..c1c3a429b40 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
@@ -29,18 +29,26 @@ private[mxnet] class AddNDArrayFunctions(isContrib: Boolean) extends StaticAnnot
   private[mxnet] def macroTransform(annottees: Any*) = macro NDArrayMacro.addDefs
 }
 
+private[mxnet] class AddNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) = macro NDArrayMacro.typeSafeAPIDefs
+}
+
 private[mxnet] object NDArrayMacro {
-  case class NDArrayFunction(handle: NDArrayHandle)
+  case class NDArrayArg(argName: String, argType: String, isOptional : Boolean)
+  case class NDArrayFunction(name: String, listOfArgs: List[NDArrayArg])
 
   // scalastyle:off havetype
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = {
-    impl(c)(false, annottees: _*)
+    impl(c)(annottees: _*)
+  }
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = {
+    typeSafeAPIImpl(c)(annottees: _*)
   }
   // scalastyle:off havetype
 
-  private val ndarrayFunctions: Map[String, NDArrayFunction] = initNDArrayModule()
+  private val ndarrayFunctions: List[NDArrayFunction] = initNDArrayModule()
 
-  private def impl(c: blackbox.Context)(addSuper: Boolean, annottees: c.Expr[Any]*): c.Expr[Any] = {
+  private def impl(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
     import c.universe._
 
     val isContrib: Boolean = c.prefix.tree match {
@@ -48,40 +56,104 @@ private[mxnet] object NDArrayMacro {
     }
 
     val newNDArrayFunctions = {
-      if (isContrib) ndarrayFunctions.filter(_._1.startsWith("_contrib_"))
-      else ndarrayFunctions.filter(!_._1.startsWith("_contrib_"))
+      if (isContrib) ndarrayFunctions.filter(_.name.startsWith("_contrib_"))
+      else ndarrayFunctions.filter(!_.name.startsWith("_contrib_"))
     }
 
-    val functionDefs = newNDArrayFunctions flatMap { case (funcName, funcProp) =>
-      val functionScope = {
-        if (isContrib) Modifiers()
-        else {
-          if (funcName.startsWith("_")) Modifiers(Flag.PRIVATE) else Modifiers()
+    val functionDefs = newNDArrayFunctions flatMap { NDArrayfunction =>
+        val funcName = NDArrayfunction.name
+        val termName = TermName(funcName)
+        if (!NDArrayfunction.name.startsWith("_") || NDArrayfunction.name.startsWith("_contrib_")) {
+          Seq(
+            // scalastyle:off
+            // (yizhi) We are investigating a way to make these functions type-safe
+            // and waiting to see whether the new approach is stable enough.
+            // Thus these functions may be deprecated in the future.
+            // e.g. def transpose(kwargs: Map[String, Any] = null)(args: Any*)
+            q"def $termName(kwargs: Map[String, Any] = null)(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, kwargs)}".asInstanceOf[DefDef],
+            // e.g. def transpose(args: Any*)
+            q"def $termName(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, null)}".asInstanceOf[DefDef]
+            // scalastyle:on
+          )
+        } else {
+          // Default private
+          Seq(
+            // scalastyle:off
+            q"private def $termName(kwargs: Map[String, Any] = null)(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, kwargs)}".asInstanceOf[DefDef],
+            q"private def $termName(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, null)}".asInstanceOf[DefDef]
+            // scalastyle:on
+          )
         }
       }
-      val newName = {
-        if (isContrib) funcName.substring(funcName.indexOf("_contrib_") + "_contrib_".length())
-        else funcName
-      }
-      val termName = TermName(funcName)
-      // It will generate definition something like,
-      Seq(
-        // scalastyle:off
-        // def transpose(kwargs: Map[String, Any] = null)(args: Any*)
-        q"def $termName(kwargs: Map[String, Any] = null)(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, kwargs)}",
-        // def transpose(args: Any*)
-        q"def $termName(args: Any*) = {genericNDArrayFunctionInvoke($funcName, args, null)}"
-        // scalastyle:on
-      )
+
+    structGeneration(c)(functionDefs, annottees : _*)
+  }
+
+  private def typeSafeAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = {
+    import c.universe._
+
+    val isContrib: Boolean = c.prefix.tree match {
+      case q"new AddNDArrayAPIs($b)" => c.eval[Boolean](c.Expr(b))
     }
 
+    val newNDArrayFunctions = {
+      if (isContrib) ndarrayFunctions.filter(
+        func => func.name.startsWith("_contrib_") || !func.name.startsWith("_"))
+      else ndarrayFunctions.filterNot(_.name.startsWith("_"))
+    }
+
+    val functionDefs = newNDArrayFunctions map { ndarrayfunction =>
+
+      // Construct argument field
+      var argDef = ListBuffer[String]()
+      // Construct Implementation field
+      var impl = ListBuffer[String]()
+      impl += "val map = scala.collection.mutable.Map[String, Any]()"
+      ndarrayfunction.listOfArgs.foreach({ ndarrayarg =>
+        // "var" is a reserved word for defining variables in Scala,
+        // so it needs to be renamed for the generated code to compile
+        val currArgName = ndarrayarg.argName match {
+          case "var" => "vari"
+          case "type" => "typeOf"
+          case default => ndarrayarg.argName
+        }
+        if (ndarrayarg.isOptional) {
+          argDef += s"${currArgName} : Option[${ndarrayarg.argType}] = None"
+        }
+        else {
+          argDef += s"${currArgName} : ${ndarrayarg.argType}"
+        }
+        var base = "map(\"" + ndarrayarg.argName + "\") = " + currArgName
+        if (ndarrayarg.isOptional) {
+          base = "if (!" + currArgName + ".isEmpty)" + base + ".get"
+        }
+        impl += base
+      })
+      // scalastyle:off
+      impl += "org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(\"" + ndarrayfunction.name + "\", null, map.toMap)"
+      // scalastyle:on
+      // Combine and build the function string
+      val returnType = "org.apache.mxnet.NDArray"
+      var finalStr = s"def ${ndarrayfunction.name}New"
+      finalStr += s" (${argDef.mkString(",")}) : $returnType"
+      finalStr += s" = {${impl.mkString("\n")}}"
+      c.parse(finalStr).asInstanceOf[DefDef]
+    }
+
+    structGeneration(c)(functionDefs, annottees : _*)
+  }
+
+  private def structGeneration(c: blackbox.Context)
+                              (funcDef : List[c.universe.DefDef], annottees: c.Expr[Any]*)
+  : c.Expr[Any] = {
+    import c.universe._
     val inputs = annottees.map(_.tree).toList
     // pattern match on the inputs
     val modDefs = inputs map {
       case ClassDef(mods, name, something, template) =>
         val q = template match {
           case Template(superMaybe, emptyValDef, defs) =>
-            Template(superMaybe, emptyValDef, defs ++ functionDefs)
+            Template(superMaybe, emptyValDef, defs ++ funcDef)
           case ex =>
             throw new IllegalArgumentException(s"Invalid template: $ex")
         }
@@ -89,7 +161,7 @@ private[mxnet] object NDArrayMacro {
       case ModuleDef(mods, name, template) =>
         val q = template match {
           case Template(superMaybe, emptyValDef, defs) =>
-            Template(superMaybe, emptyValDef, defs ++ functionDefs)
+            Template(superMaybe, emptyValDef, defs ++ funcDef)
           case ex =>
             throw new IllegalArgumentException(s"Invalid template: $ex")
         }
@@ -102,20 +174,80 @@ private[mxnet] object NDArrayMacro {
     result
   }
 
+
+  // Convert C++ Types to Scala Types
+  private def typeConversion(in : String, argType : String = "") : String = {
+    in match {
+      case "Shape(tuple)" | "ShapeorNone" => "org.apache.mxnet.Shape"
+      case "Symbol" | "NDArray" | "NDArray-or-Symbol" => "org.apache.mxnet.NDArray"
+      case "Symbol[]" | "NDArray[]" | "NDArray-or-Symbol[]" | "SymbolorSymbol[]"
+      => "Array[org.apache.mxnet.NDArray]"
+      case "float" | "real_t" | "floatorNone" => "org.apache.mxnet.Base.MXFloat"
+      case "int" | "intorNone" | "int(non-negative)" => "Int"
+      case "long" | "long(non-negative)" => "Long"
+      case "double" | "doubleorNone" => "Double"
+      case "string" => "String"
+      case "boolean" | "booleanorNone" => "Boolean"
+      case "tupleof<float>" | "tupleof<double>" | "ptr" | "" => "Any"
+      case default => throw new IllegalArgumentException(
+        s"Invalid type for args: $default, $argType")
+    }
+  }
+
+
+  /**
+    * The argType that comes from the C++ API is usually a description
+    * rather than a single word. For example:
+    *   <C++ Type>, <Required/Optional>, <Default=>
+    * These three fields do not always appear together.
+    * This function uses that format to determine whether an argument is
+    * optional, what its Scala type is, and possibly a default value.
+    * @param argType raw argument type description
+    * @return (Scala_Type, isOptional)
+    */
+  private def argumentCleaner(argType : String) : (String, Boolean) = {
+    val spaceRemoved = argType.replaceAll("\\s+", "")
+    var commaRemoved : Array[String] = new Array[String](0)
+    // Deal with the case e.g: stype : {'csr', 'default', 'row_sparse'}
+    if (spaceRemoved.charAt(0) == '{') {
+      val endIdx = spaceRemoved.indexOf('}')
+      commaRemoved = spaceRemoved.substring(endIdx + 1).split(",")
+      commaRemoved(0) = "string"
+    } else {
+      commaRemoved = spaceRemoved.split(",")
+    }
+    // Optional Field
+    if (commaRemoved.length >= 3) {
+      // arg: Type, optional, default = Null
+      require(commaRemoved(1).equals("optional"))
+      require(commaRemoved(2).startsWith("default="))
+      (typeConversion(commaRemoved(0), argType), true)
+    } else if (commaRemoved.length == 2 || commaRemoved.length == 1) {
+      val tempType = typeConversion(commaRemoved(0), argType)
+      val tempOptional = tempType.equals("org.apache.mxnet.NDArray")
+      (tempType, tempOptional)
+    } else {
+      throw new IllegalArgumentException(
+        s"Unrecognized arg field: $argType, ${commaRemoved.length}")
+    }
+
+  }
+
+
   // List and add all the atomic symbol functions to current module.
-  private def initNDArrayModule(): Map[String, NDArrayFunction] = {
+  private def initNDArrayModule(): List[NDArrayFunction] = {
     val opNames = ListBuffer.empty[String]
     _LIB.mxListAllOpNames(opNames)
     opNames.map(opName => {
       val opHandle = new RefLong
       _LIB.nnGetOpHandle(opName, opHandle)
       makeNDArrayFunction(opHandle.value, opName)
-    }).toMap
+    }).toList
   }
 
   // Create an atomic symbol function by handle and function name.
   private def makeNDArrayFunction(handle: NDArrayHandle, aliasName: String)
-    : (String, NDArrayFunction) = {
+  : NDArrayFunction = {
     val name = new RefString
     val desc = new RefString
     val keyVarNumArgs = new RefString
@@ -136,10 +268,14 @@ private[mxnet] object NDArrayMacro {
     val docStr = s"$aliasName $realName\n${desc.value}\n\n$paramStr\n$extraDoc\n"
     // scalastyle:off println
     if (System.getenv("MXNET4J_PRINT_OP_DEF") != null
-          && System.getenv("MXNET4J_PRINT_OP_DEF").toLowerCase == "true") {
+      && System.getenv("MXNET4J_PRINT_OP_DEF").toLowerCase == "true") {
       println("NDArray function definition:\n" + docStr)
     }
     // scalastyle:on println
-    (aliasName, new NDArrayFunction(handle))
+    val argList = argNames zip argTypes map { case (argName, argType) =>
+      val typeAndOption = argumentCleaner(argType)
+      new NDArrayArg(argName, typeAndOption._1, typeAndOption._2)
+    }
+    new NDArrayFunction(aliasName, argList.toList)
   }
 }
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
index b6ddaafc7ad..234a8604cb9 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
@@ -21,7 +21,6 @@ import scala.annotation.StaticAnnotation
 import scala.collection.mutable.ListBuffer
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
-
 import org.apache.mxnet.init.Base._
 import org.apache.mxnet.utils.OperatorBuildUtils
 
@@ -29,18 +28,29 @@ private[mxnet] class AddSymbolFunctions(isContrib: Boolean) extends StaticAnnota
   private[mxnet] def macroTransform(annottees: Any*) = macro SymbolImplMacros.addDefs
 }
 
+private[mxnet] class AddSymbolAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) = macro SymbolImplMacros.typeSafeAPIDefs
+}
+
 private[mxnet] object SymbolImplMacros {
-  case class SymbolFunction(handle: SymbolHandle, keyVarNumArgs: String)
+  case class SymbolArg(argName: String, argType: String, isOptional : Boolean)
+  case class SymbolFunction(name: String, listOfArgs: List[SymbolArg])
 
   // scalastyle:off havetype
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = {
-    impl(c)(false, annottees: _*)
+    impl(c)(annottees: _*)
   }
-  // scalastyle:off havetype
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = {
+    newAPIImpl(c)(annottees: _*)
+  }
+  // scalastyle:on havetype
 
-  private val symbolFunctions: Map[String, SymbolFunction] = initSymbolModule()
+  private val symbolFunctions: List[SymbolFunction] = initSymbolModule()
 
-  private def impl(c: blackbox.Context)(addSuper: Boolean, annottees: c.Expr[Any]*): c.Expr[Any] = {
+  /**
+    * Implementation for fixed input API structure
+    */
+  private def impl(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
     import c.universe._
 
     val isContrib: Boolean = c.prefix.tree match {
@@ -48,74 +58,106 @@ private[mxnet] object SymbolImplMacros {
     }
 
     val newSymbolFunctions = {
-      if (isContrib) symbolFunctions.filter(_._1.startsWith("_contrib_"))
-      else symbolFunctions.filter(!_._1.startsWith("_contrib_"))
+      if (isContrib) symbolFunctions.filter(
+        func => func.name.startsWith("_contrib_") || !func.name.startsWith("_"))
+      else symbolFunctions.filter(!_.name.startsWith("_"))
     }
 
-    val AST_TYPE_MAP_STRING_ANY = AppliedTypeTree(Ident(TypeName("Map")),
-      List(Ident(TypeName("String")), Ident(TypeName("Any"))))
-    val AST_TYPE_MAP_STRING_STRING = AppliedTypeTree(Ident(TypeName("Map")),
-      List(Ident(TypeName("String")), Ident(TypeName("String"))))
-    val AST_TYPE_SYMBOL_VARARG = AppliedTypeTree(
-      Select(
-        Select(Ident(termNames.ROOTPKG), TermName("scala")),
-        TypeName("<repeated>")
-      ),
-      List(Select(Select(Select(
-        Ident(TermName("org")), TermName("apache")), TermName("mxnet")), TypeName("Symbol")))
-    )
-
-    val functionDefs = newSymbolFunctions map { case (funcName, funcProp) =>
-      val functionScope = {
-        if (isContrib) Modifiers()
-        else {
-          if (funcName.startsWith("_")) Modifiers(Flag.PRIVATE) else Modifiers()
-        }
-      }
-      val newName = {
-        if (isContrib) funcName.substring(funcName.indexOf("_contrib_") + "_contrib_".length())
-        else funcName
+
+    val functionDefs = newSymbolFunctions map { symbolfunction =>
+        val funcName = symbolfunction.name
+        val tName = TermName(funcName)
+        q"""
+            def $tName(name : String = null, attr : Map[String, String] = null)
+            (args : org.apache.mxnet.Symbol*)(kwargs : Map[String, Any] = null)
+             : org.apache.mxnet.Symbol = {
+              createSymbolGeneral($funcName,name,attr,args,kwargs)
+              }
+         """.asInstanceOf[DefDef]
       }
 
-      // It will generate definition something like,
-      // def Concat(name: String = null, attr: Map[String, String] = null)
-      //           (args: Symbol*)(kwargs: Map[String, Any] = null)
-      DefDef(functionScope, TermName(newName), List(),
-        List(
-          List(
-            ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("name"),
-              Ident(TypeName("String")), Literal(Constant(null))),
-            ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("attr"),
-              AST_TYPE_MAP_STRING_STRING, Literal(Constant(null)))
-          ),
-          List(
-            ValDef(Modifiers(), TermName("args"), AST_TYPE_SYMBOL_VARARG, EmptyTree)
-          ),
-          List(
-            ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("kwargs"),
-              AST_TYPE_MAP_STRING_ANY, Literal(Constant(null)))
-          )
-        ), TypeTree(),
-        Apply(
-          Ident(TermName("createSymbolGeneral")),
-          List(
-            Literal(Constant(funcName)),
-            Ident(TermName("name")),
-            Ident(TermName("attr")),
-            Ident(TermName("args")),
-            Ident(TermName("kwargs"))
-          )
-        )
-      )
+    structGeneration(c)(functionDefs, annottees : _*)
+  }
+
+  /**
+    * Implementation for the typesafe API: Symbol.api.<functionName>
+    */
+  private def newAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = {
+    import c.universe._
+
+    val isContrib: Boolean = c.prefix.tree match {
+      case q"new AddSymbolAPIs($b)" => c.eval[Boolean](c.Expr(b))
+    }
+
+    // TODO: Put Symbol.api.foo --> Stable APIs,
+    //       Symbol.contrib.bar --> Contrib APIs
+    val newSymbolFunctions = {
+      if (isContrib) symbolFunctions.filter(
+        func => func.name.startsWith("_contrib_") || !func.name.startsWith("_"))
+      else symbolFunctions.filter(!_.name.startsWith("_"))
+    }
+
+    val functionDefs = newSymbolFunctions map { symbolfunction =>
+
+      // Construct argument field
+      var argDef = ListBuffer[String]()
+      // Construct Implementation field
+      var impl = ListBuffer[String]()
+      impl += "val map = scala.collection.mutable.Map[String, Any]()"
+      symbolfunction.listOfArgs.foreach({ symbolarg =>
+        // "var" is a reserved word for defining variables in Scala,
+        // so it needs to be renamed for the generated code to compile
+        val currArgName = symbolarg.argName match {
+          case "var" => "vari"
+          case "type" => "typeOf"
+          case default => symbolarg.argName
+        }
+        if (symbolarg.isOptional) {
+          argDef += s"${currArgName} : Option[${symbolarg.argType}] = None"
+        }
+        else {
+          argDef += s"${currArgName} : ${symbolarg.argType}"
+        }
+        var base = "map(\"" + symbolarg.argName + "\") = " + currArgName
+        if (symbolarg.isOptional) {
+          base = "if (!" + currArgName + ".isEmpty)" + base + ".get"
+        }
+        impl += base
+      })
+      argDef += "name : String = null"
+      argDef += "attr : Map[String, String] = null"
+      // scalastyle:off
+      // TODO: Passing an empty Seq() means Symbol inputs go through the kwargs map instead of positional args; needs fixing if the old API is deprecated
+      impl += "org.apache.mxnet.Symbol.createSymbolGeneral(\"" + symbolfunction.name + "\", name, attr, Seq(), map.toMap)"
+      // scalastyle:on
+      // Combine and build the function string
+      val returnType = "org.apache.mxnet.Symbol"
+      var finalStr = s"def ${symbolfunction.name}"
+      finalStr += s" (${argDef.mkString(",")}) : $returnType"
+      finalStr += s" = {${impl.mkString("\n")}}"
+      c.parse(finalStr).asInstanceOf[DefDef]
     }
+    structGeneration(c)(functionDefs, annottees : _*)
+  }
 
+  /**
+    * Generate the class/object structure for all function APIs
+    * @param c the macro context
+    * @param funcDef function definitions as DefDef trees
+    * @param annottees the annotated class or object to expand into
+    * @return the expanded class or object definition
+    */
+  private def structGeneration(c: blackbox.Context)
+                              (funcDef : List[c.universe.DefDef], annottees: c.Expr[Any]*)
+  : c.Expr[Any] = {
+    import c.universe._
     val inputs = annottees.map(_.tree).toList
     // pattern match on the inputs
     val modDefs = inputs map {
       case ClassDef(mods, name, something, template) =>
         val q = template match {
           case Template(superMaybe, emptyValDef, defs) =>
-            Template(superMaybe, emptyValDef, defs ++ functionDefs)
+            Template(superMaybe, emptyValDef, defs ++ funcDef)
           case ex =>
             throw new IllegalArgumentException(s"Invalid template: $ex")
         }
@@ -123,7 +165,7 @@ private[mxnet] object SymbolImplMacros {
       case ModuleDef(mods, name, template) =>
         val q = template match {
           case Template(superMaybe, emptyValDef, defs) =>
-            Template(superMaybe, emptyValDef, defs ++ functionDefs)
+            Template(superMaybe, emptyValDef, defs ++ funcDef)
           case ex =>
             throw new IllegalArgumentException(s"Invalid template: $ex")
         }
@@ -136,20 +178,80 @@ private[mxnet] object SymbolImplMacros {
     result
   }
 
+  // Convert C++ Types to Scala Types
+  def typeConversion(in : String, argType : String = "") : String = {
+    in match {
+      case "Shape(tuple)" | "ShapeorNone" => "org.apache.mxnet.Shape"
+      case "Symbol" | "NDArray" | "NDArray-or-Symbol" => "org.apache.mxnet.Symbol"
+      case "Symbol[]" | "NDArray[]" | "NDArray-or-Symbol[]" | "SymbolorSymbol[]"
+      => "Array[org.apache.mxnet.Symbol]"
+      case "float" | "real_t" | "floatorNone" => "org.apache.mxnet.Base.MXFloat"
+      case "int" | "intorNone" | "int(non-negative)" => "Int"
+      case "long" | "long(non-negative)" => "Long"
+      case "double" | "doubleorNone" => "Double"
+      case "string" => "String"
+      case "boolean" => "Boolean"
+      case "tupleof<float>" | "tupleof<double>" | "ptr" | "" => "Any"
+      case default => throw new IllegalArgumentException(
+        s"Invalid type for args: $default, $argType")
+    }
+  }
+
+
+  /**
+    * The argType that comes from the C++ API is usually a description
+    * rather than a single word. For example:
+    *   <C++ Type>, <Required/Optional>, <Default=>
+    * These three fields do not always appear together.
+    * This function uses that format to determine whether an argument is
+    * optional, what its Scala type is, and possibly a default value.
+    * @param argType raw argument type description
+    * @return (Scala_Type, isOptional)
+    */
+  def argumentCleaner(argType : String) : (String, Boolean) = {
+    val spaceRemoved = argType.replaceAll("\\s+", "")
+    var commaRemoved : Array[String] = new Array[String](0)
+    // Deal with the case e.g: stype : {'csr', 'default', 'row_sparse'}
+    if (spaceRemoved.charAt(0) == '{') {
+      val endIdx = spaceRemoved.indexOf('}')
+      commaRemoved = spaceRemoved.substring(endIdx + 1).split(",")
+      commaRemoved(0) = "string"
+    } else {
+      commaRemoved = spaceRemoved.split(",")
+    }
+    // Optional Field
+    if (commaRemoved.length >= 3) {
+      // arg: Type, optional, default = Null
+      require(commaRemoved(1).equals("optional"))
+      require(commaRemoved(2).startsWith("default="))
+      (typeConversion(commaRemoved(0), argType), true)
+    } else if (commaRemoved.length == 2 || commaRemoved.length == 1) {
+      val tempType = typeConversion(commaRemoved(0), argType)
+      val tempOptional = tempType.equals("org.apache.mxnet.Symbol")
+      (tempType, tempOptional)
+    } else {
+      throw new IllegalArgumentException(
+        s"Unrecognized arg field: $argType, ${commaRemoved.length}")
+    }
+
+  }
+
+
   // List and add all the atomic symbol functions to current module.
-  private def initSymbolModule(): Map[String, SymbolFunction] = {
+  private def initSymbolModule(): List[SymbolFunction] = {
     val opNames = ListBuffer.empty[String]
     _LIB.mxListAllOpNames(opNames)
+    // TODO: Add '_linalg_', '_sparse_', '_image_' support
     opNames.map(opName => {
       val opHandle = new RefLong
       _LIB.nnGetOpHandle(opName, opHandle)
       makeAtomicSymbolFunction(opHandle.value, opName)
-    }).toMap
+    }).toList
   }
 
   // Create an atomic symbol function by handle and function name.
   private def makeAtomicSymbolFunction(handle: SymbolHandle, aliasName: String)
-      : (String, SymbolFunction) = {
+      : SymbolFunction = {
     val name = new RefString
     val desc = new RefString
     val keyVarNumArgs = new RefString
@@ -174,6 +276,10 @@ private[mxnet] object SymbolImplMacros {
       println("Symbol function definition:\n" + docStr)
     }
     // scalastyle:on println
-    (aliasName, new SymbolFunction(handle, keyVarNumArgs.value))
+    val argList = argNames zip argTypes map { case (argName, argType) =>
+        val typeAndOption = argumentCleaner(argType)
+        new SymbolArg(argName, typeAndOption._1, typeAndOption._2)
+    }
+    new SymbolFunction(aliasName, argList.toList)
   }
 }
diff --git a/ci/docker/install/amzn_linux_openblas.sh b/scala-package/macros/src/test/resources/log4j.properties
old mode 100755
new mode 100644
similarity index 70%
rename from ci/docker/install/amzn_linux_openblas.sh
rename to scala-package/macros/src/test/resources/log4j.properties
index 94088d6ccf1..d82fd7ea4f3
--- a/ci/docker/install/amzn_linux_openblas.sh
+++ b/scala-package/macros/src/test/resources/log4j.properties
@@ -1,5 +1,3 @@
-#!/bin/bash
-
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -7,9 +5,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-#
+# 
 #   http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -17,13 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
+# for development debugging
+log4j.rootLogger = debug, stdout
 
-set -ex
-pushd .
-git clone https://github.com/xianyi/OpenBLAS
-cd OpenBLAS
-make FC=gfortran -j $(($(nproc) + 1))
-make PREFIX=/usr/local install
-popd
\ No newline at end of file
+log4j.appender.stdout = org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target = System.out
+log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n
diff --git a/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala b/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala
new file mode 100644
index 00000000000..bc8be7df5fb
--- /dev/null
+++ b/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet
+
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.slf4j.LoggerFactory
+
+class MacrosSuite extends FunSuite with BeforeAndAfterAll {
+
+  private val logger = LoggerFactory.getLogger(classOf[MacrosSuite])
+
+
+  test("MacrosSuite-testArgumentCleaner") {
+    val input = List(
+      "Symbol, optional, default = Null",
+      "int, required",
+      "Shape(tuple), optional, default = []",
+      "{'csr', 'default', 'row_sparse'}, optional, default = 'csr'",
+      ", required"
+    )
+    val output = List(
+      ("org.apache.mxnet.Symbol", true),
+      ("Int", false),
+      ("org.apache.mxnet.Shape", true),
+      ("String", true),
+      ("Any", false)
+    )
+
+    for (idx <- input.indices) {
+      val result = SymbolImplMacros.argumentCleaner(input(idx))
+      assert(result._1 === output(idx)._1 && result._2 === output(idx)._2)
+    }
+  }
+
+}
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
index 43c168e7daa..2504b1f3155 100644
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ b/scala-package/native/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
index 9d36c07ef5d..aca290f6d1f 100644
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ b/scala-package/native/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 82687e903cf..73b85905197 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index 0af2d5de651..54ba2b57afa 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index 7753f59527a..9dcfa7ca27e 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -5,7 +5,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-parent_2.11</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
   <name>MXNet Scala Package - Parent</name>
   <url>https://github.com/apache/incubator-mxnet/tree/master/scala-package</url>
   <description>MXNet Scala Package</description>
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index d99fb72c18f..281fad4056f 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -21,7 +21,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/setup-utils/install-mxnet-osx-python.sh b/setup-utils/install-mxnet-osx-python.sh
index 56f6998abc2..b24650a5711 100755
--- a/setup-utils/install-mxnet-osx-python.sh
+++ b/setup-utils/install-mxnet-osx-python.sh
@@ -506,8 +506,8 @@ print ((a*2).asnumpy());
 END
 	rm -f mxnet_test.expected
 	cat << END > mxnet_test.expected
-[[ 2.  2.  2.]
- [ 2.  2.  2.]]
+[[2. 2. 2.]
+ [2. 2. 2.]]
 END
 	diff mxnet_test.log mxnet_test.expected
 	if [[ $? = 0 ]]; then
diff --git a/snapcraft.yaml b/snapcraft.yaml
index 6aca20a4ebb..284efa2db8b 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,5 +1,5 @@
 name: mxnet
-version: '1.2.0'
+version: '1.3.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
 description: |
   MXNet is a deep learning framework designed for both efficiency and 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 34b4fd22f85..467118b9921 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -116,6 +116,12 @@ int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size) {
   API_END();
 }
 
+int MXGetGPUCount(int* out) {
+  API_BEGIN();
+  *out = Context::GetGPUCount();
+  API_END();
+}
+
 int MXGetVersion(int *out) {
   API_BEGIN();
   *out = static_cast<int>(MXNET_VERSION);
@@ -431,12 +437,13 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle,
 MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle,
                                  int ndim,
                                  dim_t *dims,
+                                 bool reverse,
                                  NDArrayHandle *out) {
   NDArray *ptr = new NDArray();
   API_BEGIN();
   NDArray *arr = static_cast<NDArray*>(handle);
   nnvm::Tuple<dim_t> shape(dims, dims+ndim);
-  TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), false);
+  TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), reverse);
   *ptr = arr->ReshapeWithRecord(new_shape);
   *out = ptr;
   API_END_HANDLE_ERROR(delete ptr);
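
The `reverse` flag threaded through MXNDArrayReshape64 above changes how the special shape values are matched against the input. The following standalone sketch (not MXNet's InferReshapeShape; all names are invented for illustration) shows the idea: 0 copies the corresponding input dimension, -1 absorbs the remaining elements, and reverse=true right-aligns the matching.

    #include <algorithm>
    #include <cassert>
    #include <functional>
    #include <numeric>
    #include <vector>

    using Shape = std::vector<long>;

    // Illustrative only: 0 copies the corresponding input dim (left-aligned
    // normally, right-aligned when reverse=true); -1 absorbs the rest.
    Shape InferShapeSketch(Shape target, Shape input, bool reverse) {
      if (reverse) {
        std::reverse(target.begin(), target.end());
        std::reverse(input.begin(), input.end());
      }
      for (size_t i = 0; i < target.size() && i < input.size(); ++i)
        if (target[i] == 0) target[i] = input[i];
      long total = std::accumulate(input.begin(), input.end(), 1L,
                                   std::multiplies<long>());
      long known = 1;
      for (long d : target) if (d > 0) known *= d;
      for (long& d : target) if (d == -1) d = total / known;
      if (reverse) std::reverse(target.begin(), target.end());
      return target;
    }

    int main() {
      Shape in{10, 5, 4};
      assert((InferShapeSketch({-1, 0}, in, false) == Shape{40, 5}));
      assert((InferShapeSketch({-1, 0}, in, true) == Shape{50, 4}));
      return 0;
    }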
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index 40df49144fa..09bc23934e5 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -510,6 +510,93 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle,
   API_END();
 }
 
+int MXExecutorReshape(int partial_shaping,
+                      int allow_up_sizing,
+                      int dev_type,
+                      int dev_id,
+                      mx_uint num_map_keys,
+                      const char** map_keys,
+                      const int* map_dev_types,
+                      const int* map_dev_ids,
+                      const mx_uint num_provided_arg_shapes,
+                      const char** provided_arg_shape_names,
+                      const mx_uint* provided_arg_shape_data,
+                      const mx_uint* provided_arg_shape_idx,
+                      mx_uint* num_in_args,
+                      NDArrayHandle** in_args,
+                      NDArrayHandle** arg_grads,
+                      mx_uint* num_aux_states,
+                      NDArrayHandle** aux_states,
+                      ExecutorHandle shared_exec,
+                      ExecutorHandle *out) {
+  MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
+  API_BEGIN();
+  // create shape map for in_args and aux_states
+  std::unordered_map<std::string, TShape> kwargs(num_provided_arg_shapes);
+  for (mx_uint i = 0; i < num_provided_arg_shapes; ++i) {
+    auto p = kwargs.emplace(provided_arg_shape_names[i],
+        TShape(provided_arg_shape_data+provided_arg_shape_idx[i],
+          provided_arg_shape_data+provided_arg_shape_idx[i+1]));
+    CHECK(p.second) << "Duplicate shapes are provided for argument "
+      << provided_arg_shape_names[i] << " in reshape of executor";
+  }
+
+  Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
+  std::map<std::string, Context> ctx_map;
+  for (mx_uint i = 0; i < num_map_keys; ++i) {
+    ctx_map[std::string(map_keys[i])] = Context::Create(
+        static_cast<Context::DeviceType>(map_dev_types[i]), map_dev_ids[i]);
+  }
+  std::vector<NDArray> in_arg_vec;
+  std::vector<NDArray> arg_grad_vec;
+  std::vector<NDArray> aux_state_vec;
+
+  Executor* exec = static_cast<Executor*>(shared_exec);
+  *out = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs,
+                       &in_arg_vec, &arg_grad_vec, &aux_state_vec);
+
+  ret->ret_handles.clear();
+  ret->ret_handles.reserve(in_arg_vec.size()+arg_grad_vec.size()+aux_state_vec.size());
+
+  size_t nd_idx = 0;
+  for (const auto& nd : in_arg_vec) {
+    if (nd.is_none()) {
+      LOG(FATAL) << "Input argument NDArray cannot be un-allocated";
+    }
+    ret->ret_handles.push_back(new NDArray(nd));
+  }
+  if (in_arg_vec.size() > 0) {
+    *num_in_args = in_arg_vec.size();
+    *in_args = &(ret->ret_handles[nd_idx]);
+    nd_idx = ret->ret_handles.size();
+  }
+
+  for (const auto& nd : arg_grad_vec) {
+    if (nd.is_none()) {
+      ret->ret_handles.push_back(nullptr);
+    } else {
+      ret->ret_handles.push_back(new NDArray(nd));
+    }
+  }
+  if (arg_grad_vec.size() > 0) {
+    *arg_grads = &(ret->ret_handles[nd_idx]);
+    nd_idx = ret->ret_handles.size();
+  }
+
+  for (const auto& nd : aux_state_vec) {
+    if (nd.is_none()) {
+      LOG(FATAL) << "Auxiliary argument NDArray cannot be un-allocated";
+    }
+    ret->ret_handles.push_back(new NDArray(nd));
+  }
+  if (aux_state_vec.size() > 0) {
+    *num_aux_states = aux_state_vec.size();
+    *aux_states = &(ret->ret_handles[nd_idx]);
+    nd_idx = ret->ret_handles.size();
+  }
+  API_END_HANDLE_ERROR(delete out);
+}
+
 int MXExecutorSetMonitorCallback(ExecutorHandle handle,
                                  ExecutorMonitorCallback callback,
                                  void* callback_handle) {
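
One detail worth noting in MXExecutorReshape above is the handle-return pattern: freshly created NDArray pointers are parked in a thread-local store and the out parameters point into it, which is why ret_handles is reserve()d up front (a later reallocation would invalidate the pointers already handed out). A minimal sketch of that idiom, with all names invented here:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Thread-local scratch that owns objects between API calls, so a C API
    // can return raw pointer arrays without the caller allocating them.
    static thread_local std::vector<std::string*> ret_handles;

    void MakeHandles(size_t n_a, size_t n_b,
                     std::string*** out_a, size_t* len_a,
                     std::string*** out_b, size_t* len_b) {
      ret_handles.clear();
      // Essential: reserve before taking addresses of elements; otherwise a
      // later push_back could reallocate and dangle the earlier out pointer.
      ret_handles.reserve(n_a + n_b);
      size_t idx = 0;
      for (size_t i = 0; i < n_a; ++i) ret_handles.push_back(new std::string("a"));
      *out_a = &ret_handles[idx];
      *len_a = n_a;
      idx = ret_handles.size();
      for (size_t i = 0; i < n_b; ++i) ret_handles.push_back(new std::string("b"));
      *out_b = &ret_handles[idx];
      *len_b = n_b;
    }

    int main() {
      std::string** a; std::string** b;
      size_t na, nb;
      MakeHandles(2, 3, &a, &na, &b, &nb);
      std::printf("%zu %zu %s %s\n", na, nb, a[0]->c_str(), b[0]->c_str());
      for (auto* p : ret_handles) delete p;  // cleanup omitted in the real API
      return 0;
    }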
diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc
index c946e3b6bd5..c5841775794 100644
--- a/src/c_api/c_api_profile.cc
+++ b/src/c_api/c_api_profile.cc
@@ -112,7 +112,11 @@ class ProfilingThreadData {
 #endif  // PROFILE_API_INCLUDE_AS_EVENT
 };
 
+#if DMLC_CXX11_THREAD_LOCAL
 static thread_local ProfilingThreadData thread_profiling_data;
+#else
+static MX_THREAD_LOCAL ProfilingThreadData thread_profiling_data;
+#endif
 
 extern void on_enter_api(const char *function) {
   if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) {
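
The #if DMLC_CXX11_THREAD_LOCAL guard above picks between the C++11 thread_local keyword and the MX_THREAD_LOCAL fallback for toolchains that lack it. A generic sketch of the same pattern (the macro below is invented for illustration; such fallbacks typically map to __thread or __declspec(thread), which only accept trivially constructible types):

    #include <cstdio>
    #include <thread>

    #if defined(_MSC_VER)
      #define MY_THREAD_LOCAL __declspec(thread)
    #else
      #define MY_THREAD_LOCAL __thread
    #endif

    static MY_THREAD_LOCAL int counter = 0;  // one independent copy per thread

    int main() {
      std::thread worker([] {
        counter = 5;
        std::printf("worker sees %d\n", counter);
      });
      worker.join();
      std::printf("main sees %d\n", counter);  // still 0: unaffected by worker
      return 0;
    }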
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index 3ac86fba684..731d03d9be2 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -25,6 +25,8 @@
 #define MXNET_COMMON_EXEC_UTILS_H_
 
 #include <vector>
+#include <string>
+#include <utility>
 #include "../common/utils.h"
 
 namespace mxnet {
@@ -76,8 +78,8 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
 }
 
 inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
-                                 const std::vector<OpReqType> &req,
                                  const std::vector<NDArray> *bufs,
+                                 std::vector<OpReqType> *req,
                                  std::vector<TBlob> *blobs,
                                  std::vector<NDArray> *temp_src,
                                  std::vector<NDArray> *temp_dst) {
@@ -86,6 +88,12 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
     auto& nd = src[i];
     bool is_default = nd.storage_type() == kDefaultStorage;
 #if MXNET_USE_MKLDNN == 1
+    if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
+      // If it's write inplace and the output array doesn't use the default
+      // layout, we'll generate a temporary output array below, which means
+      // the input array and the output array are no longer the same array.
+      // we should change the request type.
+      req->at(i) = kWriteTo;
     // We have to make sure it's default storage and default layout.
     is_default = nd.IsDefaultData();
 #endif
@@ -115,9 +123,9 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
-                                   const std::vector<OpReqType> &req,
                                    const std::vector<NDArray> *in_bufs,
                                    const std::vector<NDArray> *out_bufs,
+                                   std::vector<OpReqType> *req,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -130,7 +138,7 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
   SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
                       in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
+  SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst,
                        post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
@@ -226,7 +234,129 @@ inline bool DefaultStorageType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+// string representation of storage id
+inline std::string storage_str(int storage_id) {
+  std::string str;
+  if (storage_id == -1) {
+    str = "var (-1)";
+  } else if (storage_id == -2) {
+    str = "external storage (-2)";
+  } else {
+    str = "group " + std::to_string(storage_id);
+  }
+  return str;
+}
+
+/* log the static memory plan of the graph. Example:
+   node 0 var
+   node 1 _copy
+            input 0: [80,3,224,224] (47040 KB) -> var storage (-1)
+            output 1: [80,3,224,224] (47040 KB) -> group 0
+   node 2 var
+   node 3 var
+   node 4 var
+   node 5 var
+   node 6 BatchNorm
+            input 1: [80,3,224,224] (47040 KB) -> group 0
+            input 2: [3] (0 KB) -> var storage (-1)
+            input 3: [3] (0 KB) -> var storage (-1)
+            input 4: [3] (0 KB) -> var storage (-1)
+            input 5: [3] (0 KB) -> var storage (-1)
+            output 6: [80,3,224,224] (47040 KB) -> group 1
+            output 7: [3] (0 KB) -> group 3
+            output 8: [3] (0 KB) -> group 2
+   ...
+ */
+inline void LogMemoryPlan(const nnvm::Graph& g) {
+  const auto &idx = g.indexed_graph();
+  const auto& vshape = g.GetAttr<nnvm::ShapeVector>("shape");
+  const auto& vtype = g.GetAttr<nnvm::DTypeVector>("dtype");
+  const auto& vstorage = g.GetAttr<nnvm::StorageVector>("storage_id");
+  // find node range
+  uint32_t node_start = 0, node_end = idx.num_nodes();
+  if (g.attrs.count("node_range")) {
+    const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range");
+    node_start = range.first;
+    node_end = range.second;
+  }
+  for (uint32_t nid = node_start; nid < node_end; ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) {
+      LOG(INFO) << "node " << nid << " var";
+    } else {
+      LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name;
+      for (const auto& e : inode.inputs) {
+        auto eid = idx.entry_id(e);
+        size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024;
+        LOG(INFO) << "\t\tinput " << eid << ": " << vshape[eid] << " ("
+                  << kilo_bytes << " KB) -> " << storage_str(vstorage[eid]);
+      }
+      for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
+        uint32_t eid = idx.entry_id(nid, index);
+        size_t kilo_bytes = vshape[eid].Size() * mshadow::mshadow_sizeof(vtype[eid]) / 1024;
+        LOG(INFO) << "\t\toutput " << eid << ": " << vshape[eid] << " ("
+                  << kilo_bytes << " KB) -> " << storage_str(vstorage[eid]);
+      }
+    }
+  }
+}
+
+/* log the inferred storage types and dispatch modes of the graph. Example:
+    node 0 var
+    node 1 _copy: fcompute
+                input 0: default
+                output 1: default
+    node 2 var
+    node 3 Convolution: fcompute
+                input 1: default
+                input 2: default
+                output 3: default
+    node 4 var
+    node 5 var
+    node 6 var
+    node 7 var
+    node 8 BatchNorm: fcompute
+                input 3: default
+                input 4: default
+                input 5: default
+                input 6: default
+                input 7: default
+                output 8: default
+                output 9: default
+                output 10: default
+    ...
+ */
+inline void LogInferStorage(const nnvm::Graph& g) {
+  const auto &idx = g.indexed_graph();
+  const auto& vstorage_type = g.GetAttr<StorageTypeVector>("storage_type");
+  const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
+  uint32_t node_start = 0, node_end = idx.num_nodes();
+  if (g.attrs.count("node_range")) {
+    const auto& range = g.GetAttr<std::pair<uint32_t, uint32_t> >("node_range");
+    node_start = range.first;
+    node_end = range.second;
+  }
+  for (uint32_t nid = node_start; nid < node_end; ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) {
+      LOG(INFO) << "node " << nid << " var";
+    } else {
+      LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name
+                << ": " << dispatch_mode_string(dispatch_modes[nid]);
+      for (const auto& e : inode.inputs) {
+        auto eid = idx.entry_id(e);
+        LOG(INFO) << "\t\tinput " << eid << ": " << stype_string(vstorage_type[eid]);
+      }
+      for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
+        uint32_t eid = idx.entry_id(nid, index);
+        LOG(INFO) << "\t\toutput " << eid << ": " << stype_string(vstorage_type[eid]);
+      }
+    }
+  }
+}
+
 
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_EXEC_UTILS_H_
+
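
Since SetupDefaultBlobsOut can now rewrite a kWriteInplace request to kWriteTo (when a temporary output array replaces the MKLDNN-formatted one), its callers have to pass req as mutable and restore the original values afterwards; the attach_op_execs_pass.cc and imperative_utils.h changes further down do exactly that. A minimal sketch of the save/mutate/restore idiom, under invented names:

    #include <cassert>
    #include <vector>

    enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

    // Stand-in for SetupDefaultBlobsOut: may downgrade in-place writes when
    // the output has to be staged through a temporary buffer.
    void SetupOutputsSketch(std::vector<OpReqType>* req, bool needs_temp_output) {
      for (auto& r : *req)
        if (needs_temp_output && r == kWriteInplace) r = kWriteTo;
    }

    int main() {
      std::vector<OpReqType> req{kWriteInplace, kAddTo};
      std::vector<OpReqType> saved = req;          // save the original requests
      SetupOutputsSketch(&req, /*needs_temp_output=*/true);
      assert(req[0] == kWriteTo);                  // the kernel sees the downgrade
      req = saved;                                 // restore for the next call
      assert(req[0] == kWriteInplace);
      return 0;
    }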
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index 1fa530696b3..8196af2de2f 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -63,6 +63,12 @@ class NaiveEngine final : public Engine {
 #endif
   }
 
+  void Stop() override {
+  }
+
+  void Start() override {
+  }
+
   // new variables
   VarHandle NewVariable() override {
     size_t v = ++counter_;
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index a8227886f84..2f77380baf8 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -53,22 +53,6 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
 
   ThreadedEnginePerDevice() noexcept(false) {
     this->Start();
-#ifndef _WIN32
-    pthread_atfork(
-      []() {
-        Engine::Get()->Stop();
-      },
-      []() {
-        Engine::Get()->Start();
-      },
-      []() {
-        // Make children single threaded since they are typically workers
-        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
-        dmlc::SetEnv("OMP_NUM_THREADS", 1);
-        OpenMP::Get()->set_enabled(false);
-        Engine::Get()->Start();
-      });
-#endif
   }
   ~ThreadedEnginePerDevice() noexcept(false) {
     this->StopNoWait();
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 074ea4e8472..574e83244a0 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -27,6 +27,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/concurrency.h>
 #include <cassert>
+#include <utility>
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "./stream_manager.h"
@@ -42,14 +43,38 @@ namespace engine {
  */
 class ThreadedEnginePooled : public ThreadedEngine {
  public:
-  ThreadedEnginePooled() :
-      thread_pool_(kNumWorkingThreads, [this]() { ThreadWorker(&task_queue_); }),
-      io_thread_pool_(1, [this]() { ThreadWorker(&io_task_queue_); }) {}
+  ThreadedEnginePooled() {
+    this->Start();
+  }
 
   ~ThreadedEnginePooled() noexcept(false) {
-    streams_.Finalize();
-    task_queue_.SignalForKill();
-    io_task_queue_.SignalForKill();
+    StopNoWait();
+  }
+
+  void StopNoWait() {
+    streams_->Finalize();
+    task_queue_->SignalForKill();
+    io_task_queue_->SignalForKill();
+    task_queue_ = nullptr;
+    io_task_queue_ = nullptr;
+    thread_pool_ = nullptr;
+    io_thread_pool_ = nullptr;
+    streams_ = nullptr;
+  }
+
+  void Stop() override {
+    WaitForAll();
+    StopNoWait();
+  }
+
+  void Start() override {
+    streams_.reset(new StreamManager<kMaxNumGpus, kNumStreamsPerGpu>());
+    task_queue_.reset(new dmlc::ConcurrentBlockingQueue<OprBlock*>());
+    io_task_queue_.reset(new dmlc::ConcurrentBlockingQueue<OprBlock*>());
+    thread_pool_.reset(new ThreadPool(kNumWorkingThreads, [this]() {
+      ThreadWorker(task_queue_); }));
+    io_thread_pool_.reset(new ThreadPool(1, [this]() {
+      ThreadWorker(io_task_queue_); }));
   }
 
  protected:
@@ -71,24 +96,24 @@ class ThreadedEnginePooled : public ThreadedEngine {
   /*!
    * \brief Streams.
    */
-  StreamManager<kMaxNumGpus, kNumStreamsPerGpu> streams_;
+  std::unique_ptr<StreamManager<kMaxNumGpus, kNumStreamsPerGpu>> streams_;
   /*!
    * \brief Task queues.
    */
-  dmlc::ConcurrentBlockingQueue<OprBlock*> task_queue_;
-  dmlc::ConcurrentBlockingQueue<OprBlock*> io_task_queue_;
+  std::shared_ptr<dmlc::ConcurrentBlockingQueue<OprBlock*>> task_queue_;
+  std::shared_ptr<dmlc::ConcurrentBlockingQueue<OprBlock*>> io_task_queue_;
   /*!
    * \brief Thread pools.
    */
-  ThreadPool thread_pool_;
-  ThreadPool io_thread_pool_;
+  std::unique_ptr<ThreadPool> thread_pool_;
+  std::unique_ptr<ThreadPool> io_thread_pool_;
   /*!
    * \brief Worker.
    * \param task_queue Queue to work on.
    *
    * The method to pass to thread pool to parallelize.
    */
-  void ThreadWorker(dmlc::ConcurrentBlockingQueue<OprBlock*>* task_queue) {
+  void ThreadWorker(std::shared_ptr<dmlc::ConcurrentBlockingQueue<OprBlock*>> task_queue) {
     OprBlock* opr_block;
     while (task_queue->Pop(&opr_block)) {
       DoExecute(opr_block);
@@ -110,8 +135,8 @@ class ThreadedEnginePooled : public ThreadedEngine {
     bool is_copy = (opr_block->opr->prop == FnProperty::kCopyFromGPU ||
                     opr_block->opr->prop == FnProperty::kCopyToGPU);
     auto&& rctx = is_copy
-        ? streams_.GetIORunContext(opr_block->ctx)
-        : streams_.GetRunContext(opr_block->ctx);
+        ? streams_->GetIORunContext(opr_block->ctx)
+        : streams_->GetRunContext(opr_block->ctx);
     this->ExecuteOprBlock(rctx, opr_block);
   }
   /*!
@@ -122,11 +147,11 @@ class ThreadedEnginePooled : public ThreadedEngine {
     switch (opr_block->opr->prop) {
       case FnProperty::kCopyFromGPU:
       case FnProperty::kCopyToGPU: {
-        io_task_queue_.Push(opr_block);
+        io_task_queue_->Push(opr_block);
         break;
       }
       default: {
-        task_queue_.Push(opr_block);
+        task_queue_->Push(opr_block);
         break;
       }
     }
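
The switch from by-value members to pointers in ThreadedEnginePooled above is what makes Stop()/Start() possible, and the queues in particular become shared_ptr because each worker lambda captures its queue by value: even after Stop() nulls out the engine's own reference, a worker still draining the queue keeps it alive. A standalone sketch of that ownership idea (not the engine itself):

    #include <cstdio>
    #include <memory>
    #include <thread>
    #include <vector>

    struct Queue { std::vector<int> items; };

    int main() {
      auto q = std::make_shared<Queue>();
      q->items = {1, 2, 3};
      // The worker captures the shared_ptr by value, so the queue outlives
      // the owner dropping its reference (as StopNoWait() does above).
      std::thread worker([q] {
        for (int v : q->items) std::printf("popped %d\n", v);
      });
      q = nullptr;  // engine-side release; the worker's copy keeps it alive
      worker.join();
      return 0;
    }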
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index f7ac772ec76..697e4869a04 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -78,7 +78,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
     pre_temp_src_.clear(); pre_temp_dst_.clear();
     post_temp_src_.clear(); post_temp_dst_.clear();
     in_temp_idx_map_.clear();
-    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
+    tmp_req = req;
+    SetupDefaultBlobsInOut(in_array, out_array, &pre_temp_buf_, &post_temp_buf_, &req,
                            &in_data_, &out_data_,
                            &pre_temp_src_, &pre_temp_dst_,
                            &post_temp_src_, &post_temp_dst_,
@@ -89,8 +90,12 @@ class StorageFallbackOpExecutor : public OpExecutor {
   // storage fallback after fcompute is completed
   void PostFCompute(bool is_gpu) {
     common::CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx, is_gpu);
+    req = tmp_req;
   }
 
+  // Temporary copy of the original output requirements; SetupDefaultBlobsInOut
+  // may modify req, and PostFCompute restores it from this copy.
+  std::vector<OpReqType> tmp_req;
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
   // These are NDArray buffers for cast storage.
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index d5dacf751a6..e28867d5488 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -32,6 +32,7 @@
 #include "./graph_executor.h"
 #include "../profiler/profiler.h"
 #include "../common/utils.h"
+#include "../common/exec_utils.h"
 
 namespace mxnet {
 namespace exec {
@@ -904,6 +905,12 @@ void GraphExecutor::FinishInitGraph(nnvm::Symbol symbol,
   }
   g = DetectInplaceAddTo(g);
 
+  // log the static memory plan of the graph
+  static bool mem_log_verbose = dmlc::GetEnv("MXNET_MEM_PLAN_VERBOSE_LOGGING", false);
+  if (mem_log_verbose) {
+    common::LogMemoryPlan(g);
+  }
+
   g = AttachOpExecs(g);
   g = AttachOpResources(g);
   graph_ = std::move(g);
@@ -1036,6 +1043,117 @@ void GraphExecutor::Init(nnvm::Symbol symbol,
   FinishInitGraph(symbol, g, shared_exec, feed_dict);
 }
 
+/*!
+ * \brief Return a new executor with the same symbol and shared memory,
+ * but different input/output shapes.
+ * For runtime reshaping, variable length sequences, etc.
+ * The returned executor shares state with the current one,
+ * and cannot be used in parallel with it.
+ */
+Executor* GraphExecutor::Reshape(const bool partial_shaping,
+                                 const bool allow_up_sizing,
+                                 const Context& default_ctx,
+                                 const std::map<std::string, Context>& ctx_map,
+                                 const std::unordered_map<std::string, TShape>&
+                                   provided_arg_shapes,
+                                 std::vector<NDArray>* in_args,
+                                 std::vector<NDArray>* arg_grads,
+                                 std::vector<NDArray>* aux_states) {
+  nnvm::Graph g;
+  g.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(),
+    graph_.outputs.begin() + num_forward_outputs_);
+  nnvm::Symbol symbol;
+  symbol.outputs = g.outputs;
+  const nnvm::IndexedGraph& idx = g.indexed_graph();
+  nnvm::ShapeVector arg_shapes(idx.input_nodes().size(), TShape());
+  for (size_t i = 0; i < num_forward_inputs_; ++i) {
+    const uint32_t nid = idx.input_nodes().at(i);
+    const std::string& name = idx[nid].source->attrs.name;
+    auto it = provided_arg_shapes.find(name);
+    if (provided_arg_shapes.end() != it) {
+      arg_shapes[i] = it->second;
+    }
+  }
+  g = InferShape(std::move(g), std::move(arg_shapes), "__shape__");
+  if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) {
+    HandleInferShapeError(num_forward_inputs_, g.indexed_graph(),
+                          g.GetAttr<nnvm::ShapeVector>("shape"));
+  }
+  const nnvm::ShapeVector& shape_vec = g.GetAttr<nnvm::ShapeVector>("shape");
+  std::vector<OpReqType> grad_req_types;
+  size_t grad_top = 0;
+  const size_t num_args = in_arg_map_.size();
+  const size_t num_aux = aux_state_map_.size();
+  in_args->reserve(num_args);
+  grad_req_types.reserve(num_args);
+  arg_grads->reserve(num_args);
+  aux_states->reserve(num_aux);
+  for (uint32_t nid : idx.input_nodes()) {
+    std::string name = idx[nid].source->attrs.name;
+    const TShape& new_shape = shape_vec[idx.entry_id(nid, 0)];
+    if (idx.mutable_input_nodes().count(nid) == 0) {
+      NDArray& arr = in_arg_map_.at(name);
+      auto it = arg_grad_map_.find(name);
+      if (partial_shaping || provided_arg_shapes.count(name) || new_shape == arr.shape()) {
+        if (new_shape.Size() > arr.shape().Size()) {
+          CHECK(allow_up_sizing) << "New shape of arg: " << name << " is larger than the original. "
+            << "First making a big executor and then down sizing it "
+            << "is more efficient than the reverse. "
+            << "If you really want to up size, set allow_up_sizing=True "
+            << "to enable allocation of new arrays.";
+          in_args->emplace_back(new_shape, arr.ctx(), false, arr.dtype());
+          if (it != arg_grad_map_.end()) {
+            NDArray& darr = it->second;
+            arg_grads->emplace_back(new_shape, darr.ctx(), false, darr.dtype());
+            grad_req_types.push_back(grad_store_.at(grad_top++).first);
+          } else {
+            arg_grads->emplace_back();
+            grad_req_types.push_back(kNullOp);
+          }
+        } else {
+          in_args->push_back(arr.Reshape(new_shape));
+          if (it != arg_grad_map_.end()) {
+            NDArray& darr = it->second;
+            arg_grads->push_back(darr.Reshape(new_shape));
+            grad_req_types.push_back(grad_store_.at(grad_top++).first);
+          } else {
+            arg_grads->emplace_back();
+            grad_req_types.push_back(kNullOp);
+          }
+        }
+      } else {
+        LOG(FATAL) << "Shape of unspecifie arg: " << name << " changed. "
+          << "This can cause the new executor to not share parameters "
+          << "with the old one. Please check for error in network."
+          << "If this is intended, set partial_shaping=True to suppress this warning.";
+      }
+    } else {
+      NDArray& arr = aux_state_map_.at(name);
+      if (partial_shaping || new_shape == arr.shape()) {
+        if (new_shape.Size() > arr.shape().Size()) {
+          CHECK(allow_up_sizing) << "New shape of arg: " << name << " is larger than the original. "
+            << "First making a big executor and then down sizing it "
+            << "is more efficient than the reverse. "
+            << "If you really want to up size, set allow_up_sizing=True "
+            << "to enable allocation of new arrays.";
+          aux_states->emplace_back(new_shape, arr.ctx(), false, arr.dtype());
+        } else {
+          aux_states->push_back(arr.Reshape(new_shape));
+        }
+      } else {
+        LOG(FATAL) << "Shape of unspecifie arg: " << name << " changed. "
+          << "This can cause the new executor to not share parameters "
+          << "with the old one. Please check for error in network."
+          << "If this is intended, set partial_shaping=True to suppress this warning.";
+      }
+    }
+  }
+  auto exec = new GraphExecutor();
+  exec->Init(symbol, default_ctx, ctx_map,
+             *in_args, *arg_grads, grad_req_types, *aux_states,
+             this);
+  return exec;
+}
 /*!
  * \brief This function is triggered by both simple_bind
  * and bind flows.
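
The sizing rule inside GraphExecutor::Reshape above is: a new shape that fits in the existing storage reuses it via NDArray::Reshape, while a larger shape requires allow_up_sizing and allocates a fresh array. A standalone sketch of that rule, using plain vectors in place of NDArrays:

    #include <cassert>
    #include <stdexcept>
    #include <vector>

    // Shrinking or equal-size requests reuse the existing buffer (like
    // arr.Reshape(new_shape)); growing requires allow_up_sizing and
    // allocates new storage (like the emplace_back branch above).
    std::vector<float>* ResizeArgSketch(std::vector<float>* arr, size_t new_size,
                                        bool allow_up_sizing) {
      if (new_size <= arr->size()) return arr;
      if (!allow_up_sizing)
        throw std::runtime_error("new shape is larger; set allow_up_sizing=True");
      return new std::vector<float>(new_size);
    }

    int main() {
      std::vector<float> weight(12, 1.f);
      assert(ResizeArgSketch(&weight, 6, false) == &weight);  // shares storage
      auto* grown = ResizeArgSketch(&weight, 24, true);       // fresh buffer
      assert(grown != &weight && grown->size() == 24);
      delete grown;
      return 0;
    }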
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index bcde41d508e..24f98894912 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -107,6 +107,16 @@ class GraphExecutor : public Executor {
             const nnvm::NodeEntryMap<NDArray>& feed_dict
               = nnvm::NodeEntryMap<NDArray>());
 
+  Executor* Reshape(const bool partial_shaping,
+                    const bool allow_up_sizing,
+                    const Context& default_ctx,
+                    const std::map<std::string, Context>& ctx_map,
+                    const std::unordered_map<std::string, TShape>&
+                      provided_arg_shapes,
+                    std::vector<NDArray>* in_args,
+                    std::vector<NDArray>* arg_grads,
+                    std::vector<NDArray>* aux_states) override;
+
  protected:
   friend class mxnet::Imperative;
   // Information about operational node
diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc
index 191fbe949ac..0abee04b59d 100644
--- a/src/executor/infer_graph_attr_pass.cc
+++ b/src/executor/infer_graph_attr_pass.cc
@@ -391,36 +391,9 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph,
       common::DefaultStorageType, false, "dispatch_mode", DispatchMode::kVariable);
 
   // log the storage types and dispatch modes of the graph
-  bool log_verbose = dmlc::GetEnv("MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING", false);
+  static bool log_verbose = dmlc::GetEnv("MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING", false);
   if (log_verbose) {
-    const auto &idx = ret.indexed_graph();
-    const auto& vstorage_type = ret.GetAttr<StorageTypeVector>("storage_type");
-    const auto& dispatch_modes = ret.GetAttr<DispatchModeVector>("dispatch_mode");
-    uint32_t node_start = 0, node_end = idx.num_nodes();
-    if (ret.attrs.count("node_range")) {
-      const auto& range = ret.GetAttr<std::pair<uint32_t, uint32_t> >("node_range");
-      node_start = range.first;
-      node_end = range.second;
-    }
-    for (uint32_t nid = node_start; nid < node_end; ++nid) {
-      const auto& inode = idx[nid];
-      if (inode.source->is_variable()) {
-        LOG(INFO) << "node " << nid << " var";
-      } else {
-        LOG(INFO) << "node " << nid << " " << inode.source->attrs.op->name
-                  << ": " << common::dispatch_mode_string(dispatch_modes[nid]);
-        for (const auto& e : inode.inputs) {
-          auto eid = idx.entry_id(e);
-          LOG(INFO) << "\t\tinput " << eid << ": "
-                    << common::stype_string(vstorage_type[eid]);
-        }
-        for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
-          uint32_t eid = idx.entry_id(nid, index);
-          LOG(INFO) << "\t\toutput " << eid << ": "
-                    << common::stype_string(vstorage_type[eid]);
-        }
-      }
-    }
+    common::LogInferStorage(ret);
   }
   return ret;
 }
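
Making log_verbose static above means the environment variable is read once, on the first call, instead of on every storage-type inference pass. A standalone sketch of that caching pattern (variable name invented):

    #include <cstdio>
    #include <cstdlib>

    // The static local is initialized exactly once, on first use, and the
    // initialization is thread-safe in C++11 and later.
    static bool VerboseLogging() {
      static const bool enabled = std::getenv("MY_VERBOSE_LOGGING") != nullptr;
      return enabled;
    }

    int main() {
      std::printf("verbose logging: %s\n", VerboseLogging() ? "on" : "off");
      return 0;
    }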
diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc
index c5a47407c0c..7caf305eac7 100644
--- a/src/imperative/imperative.cc
+++ b/src/imperative/imperative.cc
@@ -194,7 +194,7 @@ void Imperative::RecordOp(
       << "will cause undefined behavior when evaluating gradients. "
       << "Please call backward first to clear the graph or do this out side of "
       << "a record section. Also note that you cannot use inplace operations "
-      << "like +=, *=, relu(x, out=x), etc inside a record section.";
+      << "like +=, *=, relu(x, out=x), y[idx]=x, etc inside a record section.";
   }
 
   bool need_grad = false;
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 10a011e88b3..d7bb37b7cfe 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -373,8 +373,9 @@ inline void PushFCompute(const FCompute& fn,
 #if MXNET_USE_MKLDNN == 1
       InvalidateOutputs(outputs, req);
 #endif
+      std::vector<OpReqType> tmp_req = req;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+      SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                              &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                              &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
@@ -382,7 +383,7 @@ inline void PushFCompute(const FCompute& fn,
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
       // pre-fcompute fallback, cast to default storage type
       CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-      fn(attrs, opctx, input_blobs, req, output_blobs);
+      fn(attrs, opctx, input_blobs, tmp_req, output_blobs);
       // post-fcompute fallback, cast to original storage type
       CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
       if (is_gpu) {
@@ -492,15 +493,16 @@ inline void PushOperator(const OpStatePtr& state,
 #if MXNET_USE_MKLDNN == 1
         InvalidateOutputs(outputs, req);
 #endif
+        std::vector<OpReqType> tmp_req = req;
         // populate input blobs and output blobs
-        SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+        SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                                &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                                &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
         // setup contexts
         bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
         // pre-fcompute fallback
         CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-        fcompute(state, opctx, input_blobs, req, output_blobs);
+        fcompute(state, opctx, input_blobs, tmp_req, output_blobs);
         // post-fcompute fallback, cast to original storage type, if necessary
         CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
         if (is_gpu && exec_type == ExecType::kSync) {
diff --git a/src/initialize.cc b/src/initialize.cc
index 69d408d7d87..1fd92628e9b 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -25,6 +25,7 @@
 #include <signal.h>
 #include <dmlc/logging.h>
 #include <mxnet/engine.h>
+#include "./engine/openmp.h"
 
 namespace mxnet {
 #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE
@@ -42,6 +43,24 @@ class LibraryInitializer {
 #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE
     signal(SIGSEGV, SegfaultLogger);
 #endif
+
+// disable openmp for multithreaded workers
+#ifndef _WIN32
+    pthread_atfork(
+      []() {
+        Engine::Get()->Stop();
+      },
+      []() {
+        Engine::Get()->Start();
+      },
+      []() {
+        // Make children single threaded since they are typically workers
+        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+        dmlc::SetEnv("OMP_NUM_THREADS", 1);
+        engine::OpenMP::Get()->set_enabled(false);
+        Engine::Get()->Start();
+      });
+#endif
   }
 
   static LibraryInitializer* Get();
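
pthread_atfork, as registered above, takes three callbacks: one run before fork(), one in the parent after, and one in the child after. Moving the registration into LibraryInitializer presumably ensures it happens exactly once per process, whichever engine is selected. A minimal POSIX-only demo of the hook order (the handlers below just print; they are not MXNet's):

    #include <cstdio>
    #include <pthread.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void prepare() { std::printf("before fork: stop workers\n"); }
    static void parent()  { std::printf("in parent: restart workers\n"); }
    static void child()   { std::printf("in child: restart single-threaded\n"); }

    int main() {
      pthread_atfork(prepare, parent, child);
      pid_t pid = fork();
      if (pid == 0) _exit(0);   // the child exits immediately in this demo
      waitpid(pid, nullptr, 0);
      return 0;
    }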
diff --git a/src/io/iter_csv.cc b/src/io/iter_csv.cc
index a9e650b6387..ca3f042f45a 100644
--- a/src/io/iter_csv.cc
+++ b/src/io/iter_csv.cc
@@ -57,23 +57,54 @@ struct CSVIterParam : public dmlc::Parameter<CSVIterParam> {
   }
 };
 
-class CSVIter: public IIterator<DataInst> {
+class CSVIterBase: public IIterator<DataInst> {
  public:
-  CSVIter() {
+  CSVIterBase() {
     out_.data.resize(2);
   }
-  virtual ~CSVIter() {}
+  virtual ~CSVIterBase() {}
+
+  // initialize iterator loads data in
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) = 0;
+  /*! \brief reset the iterator */
+  virtual void BeforeFirst(void) = 0;
+  /*! \brief move to next item */
+  virtual bool Next(void) = 0;
+  /*! \brief get current data */
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ protected:
+  CSVIterParam param_;
+
+  DataInst out_;
+
+  // internal instance counter
+  unsigned inst_counter_{0};
+  // at end
+  bool end_{false};
+
+  // label parser
+  size_t label_ptr_{0}, label_size_{0};
+  size_t data_ptr_{0}, data_size_{0};
+};
 
+template <typename DType>
+class CSVIterTyped: public CSVIterBase {
+ public:
+  virtual ~CSVIterTyped() {}
   // initialize iterator loads data in
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     param_.InitAllowUnknown(kwargs);
-    data_parser_.reset(dmlc::Parser<uint32_t>::Create(param_.data_csv.c_str(), 0, 1, "csv"));
+    data_parser_.reset(dmlc::Parser<uint32_t, DType>::Create(param_.data_csv.c_str(), 0, 1, "csv"));
     if (param_.label_csv != "NULL") {
-      label_parser_.reset(dmlc::Parser<uint32_t>::Create(param_.label_csv.c_str(), 0, 1, "csv"));
+      label_parser_.reset(
+        dmlc::Parser<uint32_t, DType>::Create(param_.label_csv.c_str(), 0, 1, "csv"));
     } else {
       dummy_label.set_pad(false);
       dummy_label.Resize(mshadow::Shape1(1));
-      dummy_label = 0.0f;
+      dummy_label = 0;
     }
   }
 
@@ -116,33 +147,63 @@ class CSVIter: public IIterator<DataInst> {
     return true;
   }
 
-  virtual const DataInst &Value(void) const {
-    return out_;
-  }
-
  private:
-  inline TBlob AsTBlob(const dmlc::Row<uint32_t>& row, const TShape& shape) {
+  inline TBlob AsTBlob(const dmlc::Row<uint32_t, DType>& row, const TShape& shape) {
     CHECK_EQ(row.length, shape.Size())
         << "The data size in CSV do not match size of shape: "
         << "specified shape=" << shape << ", the csv row-length=" << row.length;
-    const real_t* ptr = row.value;
-    return TBlob((real_t*)ptr, shape, cpu::kDevMask, 0);  // NOLINT(*)
+    const DType* ptr = row.value;
+    return TBlob((DType*)ptr, shape, cpu::kDevMask, 0);  // NOLINT(*)
+  }
+  // dummy label
+  mshadow::TensorContainer<cpu, 1, DType> dummy_label;
+  std::unique_ptr<dmlc::Parser<uint32_t, DType> > label_parser_;
+  std::unique_ptr<dmlc::Parser<uint32_t, DType> > data_parser_;
+};
+
+class CSVIter: public IIterator<DataInst> {
+ public:
+  CSVIter() {}
+  virtual ~CSVIter() {}
+
+  // initialize iterator loads data in
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.InitAllowUnknown(kwargs);
+    bool dtype_has_value = false;
+    int target_dtype = -1;
+    for (const auto& arg : kwargs) {
+      if (arg.first == "dtype") {
+        dtype_has_value = true;
+        if (arg.second == "int32" || arg.second == "float32") {
+          target_dtype = (arg.second == "int32") ? mshadow::kInt32 : mshadow::kFloat32;
+        } else {
+          CHECK(false) << arg.second << " is not supported for CSVIter";
+        }
+      }
+    }
+    if (dtype_has_value && target_dtype == mshadow::kInt32) {
+      iterator_.reset(reinterpret_cast<CSVIterBase*>(new CSVIterTyped<int>()));
+    } else if (!dtype_has_value || target_dtype == mshadow::kFloat32) {
+      iterator_.reset(reinterpret_cast<CSVIterBase*>(new CSVIterTyped<float>()));
+    }
+    iterator_->Init(kwargs);
+  }
+
+  virtual void BeforeFirst() {
+    iterator_->BeforeFirst();
+  }
+
+  virtual bool Next() {
+    return iterator_->Next();
   }
 
+  virtual const DataInst &Value(void) const {
+    return iterator_->Value();
+  }
+
+ private:
   CSVIterParam param_;
-  // output instance
-  DataInst out_;
-  // internal instance counter
-  unsigned inst_counter_{0};
-  // at end
-  bool end_{false};
-  // dummy label
-  mshadow::TensorContainer<cpu, 1, real_t> dummy_label;
-  // label parser
-  size_t label_ptr_{0}, label_size_{0};
-  size_t data_ptr_{0}, data_size_{0};
-  std::unique_ptr<dmlc::Parser<uint32_t> > label_parser_;
-  std::unique_ptr<dmlc::Parser<uint32_t> > data_parser_;
+  std::unique_ptr<CSVIterBase> iterator_;
 };
 
 
@@ -167,6 +228,10 @@ If ``data_csv = 'data/'`` is set, then all the files in this directory will be r
 
 ``reset()`` is expected to be called only after a complete pass of data.
 
+By default, CSVIter parses all entries in the data file as float32. If the
+`dtype` argument is set to 'int32', CSVIter will instead parse all entries
+in the file as int32.
+
 Examples::
 
   // Contents of CSV file ``data/data.csv``.
@@ -220,6 +285,20 @@ Examples::
   [2.  3.  4.]
   [3.  4.  5.]]
 
+  // Creates a 'CSVIter' with `dtype`='int32'
+  CSVIter = mx.io.CSVIter(data_csv = 'data/data.csv', data_shape = (3,),
+  batch_size = 3, round_batch=False, dtype='int32')
+
+  // The contents of the two batches read from the above iterator in both passes,
+  // after calling the `reset` method before the second pass, are as follows:
+  [[1  2  3]
+  [2  3  4]
+  [3  4  5]]
+
+  [[4  5  6]
+  [2  3  4]
+  [3  4  5]]
+
 )code" ADD_FILELINE)
 .add_arguments(CSVIterParam::__FIELDS__())
 .add_arguments(BatchParam::__FIELDS__())
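
The CSVIter refactoring above is a classic type-erased factory: a thin front-end reads `dtype` once at Init time and instantiates a typed implementation behind a common base class. A reduced sketch of the dispatch (names invented; requires C++14 for make_unique):

    #include <cstdio>
    #include <memory>
    #include <string>

    struct ParserBase {
      virtual ~ParserBase() = default;
      virtual void Row() = 0;
    };

    template <typename DType>
    struct TypedParser : ParserBase {
      void Row() override {
        std::printf("parsing rows as %zu-byte cells\n", sizeof(DType));
      }
    };

    // Picks the concrete type once, like CSVIter::Init picking CSVIterTyped.
    std::unique_ptr<ParserBase> MakeParser(const std::string& dtype) {
      if (dtype == "int32") return std::make_unique<TypedParser<int>>();
      return std::make_unique<TypedParser<float>>();  // float32 is the default
    }

    int main() {
      MakeParser("int32")->Row();
      MakeParser("float32")->Row();
      return 0;
    }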
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 96248998902..a5d6a1dabef 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -112,41 +112,51 @@ class CommCPU : public Comm {
 
   void Init(int key, const NDArrayStorageType stype, const TShape& shape,
             int type = mshadow::kFloat32) override {
-    if (stype == kDefaultStorage) {
-      merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type);
-    } else {
-      merge_buf_[key].merged = NDArray(stype, shape, pinned_ctx_, true, type);
-    }
+    // Delayed allocation - the dense merged buffer might not be used at all if push()
+    // only sees sparse arrays
+    bool delay_alloc = true;
+    merge_buf_[key].merged = NDArray(shape, pinned_ctx_, delay_alloc, type);
   }
 
   const NDArray& Reduce(int key, const std::vector<NDArray>& src,
                         int priority) override {
     auto& buf = merge_buf_[key];
+    const auto stype = src[0].storage_type();
     // avoid extra copy for single device, but it may bring problems for
     // abnormal usage of kvstore
     if (src.size() == 1) {
-      if (src[0].storage_type() == kDefaultStorage) {
+      if (stype == kDefaultStorage) {
         return src[0];
-      } else {  // if sparse and only one GPU, always update weight on CPU
-        CopyFromTo(src[0], &buf.merged, priority);
-        return buf.merged;
+      } else {
+        // With 'local' kvstore, we could store the weight on CPU while computing
+        // the gradient on GPU when the weight is extremely large.
+        // To avoid copying the weight to the context of the gradient,
+        // we always copy the gradient to the merge buffer.
+        NDArray& merged = buf.merged_buf(stype);
+        CopyFromTo(src[0], &merged, priority);
+        return merged;
       }
     }
 
-    if (buf.merged.storage_type() == kDefaultStorage) {
+    NDArray& buf_merged = buf.merged_buf(stype);
+    // normal dense reduce
+    if (stype == kDefaultStorage) {
       std::vector<Engine::VarHandle> const_vars(src.size() - 1);
       std::vector<NDArray> reduce(src.size());
-      CopyFromTo(src[0], &buf.merged, priority);
-      reduce[0] = buf.merged;
+      CopyFromTo(src[0], &buf_merged, priority);
+      reduce[0] = buf_merged;
 
       if (buf.copy_buf.empty()) {
         buf.copy_buf.resize(src.size()-1);
         for (size_t j = 0; j < src.size() - 1; ++j) {
-          // allocate NDArray based on storage type
+          // allocate copy buffer
           buf.copy_buf[j] = NDArray(
             src[0].shape(), pinned_ctx_, false, src[0].dtype());
         }
       }
+      CHECK(stype == buf.copy_buf[0].storage_type())
+           << "Storage type mismatch detected. " << stype << "(src) vs. "
+           << buf.copy_buf[0].storage_type() << "(buf.copy_buf)";
       for (size_t i = 1; i < src.size(); ++i) {
         CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority);
         reduce[i] = buf.copy_buf[i-1];
@@ -161,7 +171,7 @@ class CommCPU : public Comm {
         FnProperty::kCPUPrioritized, priority, "KVStoreReduce");
 
     } else {
-      // buf.merged is a sparse ndarray.
+      // sparse reduce
       std::vector<Engine::VarHandle> const_vars(src.size());
       std::vector<NDArray> reduce(src.size());
 
@@ -172,26 +182,28 @@ class CommCPU : public Comm {
             src[0].storage_type(), src[0].shape(), pinned_ctx_, true, src[0].dtype());
         }
       }
+      CHECK(stype == buf.copy_buf[0].storage_type())
+           << "Storage type mismatch detected. " << stype << "(src) vs. "
+           << buf.copy_buf[0].storage_type() << "(buf.copy_buf)";
       for (size_t i = 0; i < src.size(); ++i) {
         CopyFromTo(src[i], &(buf.copy_buf[i]), priority);
         reduce[i] = buf.copy_buf[i];
         const_vars[i] = reduce[i].var();
       }
-      NDArray result = buf.merged;
-      Resource rsc = ResourceManager::Get()->Request(result.ctx(),
+      Resource rsc = ResourceManager::Get()->Request(buf_merged.ctx(),
           ResourceRequest(ResourceRequest::kTempSpace));
       Engine::Get()->PushAsync(
-        [reduce, result, rsc, this](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-          NDArray out = result;
+        [reduce, buf_merged, rsc, this](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+          NDArray out = buf_merged;
           is_serial_push_?
             ReduceSumCPUExSerial(reduce, &out)
             : mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, reduce, &out);
           on_complete();
-        }, Context::CPU(), const_vars, {result.var(), rsc.var},
+        }, Context::CPU(), const_vars, {buf_merged.var(), rsc.var},
         FnProperty::kCPUPrioritized, priority, "KVStoreReduce");
     }
 
-    return buf.merged;
+    return buf_merged;
   }
 
   void Broadcast(int key, const NDArray& src,
@@ -200,10 +212,14 @@ class CommCPU : public Comm {
     if (mask == Context::kCPU) {
       for (auto d : dst) CopyFromTo(src, d, priority);
     } else {
-      // first copy data to cpu, then broadcast
-      auto& buf = merge_buf_[key];
-      CopyFromTo(src, &buf.merged, priority);
-      for (auto d : dst) CopyFromTo(buf.merged, d, priority);
+      // First copy data to pinned_ctx, then broadcast.
+      // Note that kv.init initializes the data on pinned_ctx.
+      // This branch indicates that push() was called with ndarrays on GPUs,
+      // so the source has been copied to a GPU context.
+      // It also indicates that the buffers were already initialized during push().
+      auto& buf = merge_buf_[key].merged_buf(src.storage_type());
+      CopyFromTo(src, &buf, priority);
+      for (auto d : dst) CopyFromTo(buf, d, priority);
     }
   }
 
@@ -223,9 +239,19 @@ class CommCPU : public Comm {
       CHECK_EQ(row_id.ctx().dev_mask(), Context::kCPU)
                << "BroadcastRowSparse with row_indices on gpu context not supported";
       // retain according to unique indices
-      const bool is_to_gpu = out->ctx().dev_mask() == Context::kGPU;
-      NDArray retained_cpu = is_to_gpu ? NDArray(kRowSparseStorage, src.shape(),
-          src.ctx(), true, src.dtype(), src.aux_types()) : *out;
+      const bool is_same_ctx = out->ctx() == src.ctx();
+      const bool is_diff_var = out->var() != src.var();
+      NDArray retained_cpu = (is_same_ctx && is_diff_var) ? *out :
+          NDArray(kRowSparseStorage, src.shape(), src.ctx(), true,
+                  src.dtype(), src.aux_types());
+      if (!is_diff_var) {
+        common::LogOnce("The output of row_sparse_pull() on key " + std::to_string(key) +
+                        " refers to the same NDArray as the one stored in KVStore. "
+                        "Performing row_sparse_pull() with such output is going to change the "
+                        "data stored in KVStore. Incorrect results may be generated "
+                        "next time row_sparse_pull() is called. To avoid such an issue, "
+                        "consider creating a new NDArray buffer to store the output.");
+      }
       Engine::Get()->PushAsync(
         [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           const TBlob& indices = row_id.data();
@@ -389,6 +415,24 @@ class CommCPU : public Comm {
     NDArray merged;
     /// \brief the cpu buffer for gpu data
     std::vector<NDArray> copy_buf;
+    /// \brief the merged buffer for the given storage type
+    inline NDArray& merged_buf(NDArrayStorageType stype) {
+      if (stype == kDefaultStorage) {
+        return merged;
+      }
+      CHECK(stype == kRowSparseStorage) << "unexpected storage type " << stype;
+      // check if sparse_merged is initialized
+      if (sparse_merged.is_none()) {
+        CHECK(!merged.is_none());
+        sparse_merged = NDArray(kRowSparseStorage, merged.shape(), merged.ctx(),
+                                true, merged.dtype());
+      }
+      return sparse_merged;
+    }
+
+   private:
+    /// \brief the sparse merged value
+    NDArray sparse_merged;
   };
   std::unordered_map<int, BufferEntry> merge_buf_;
   size_t bigarray_bound_;
@@ -414,7 +458,7 @@ class CommDevice : public Comm {
 
   void Init(int key, const NDArrayStorageType stype, const TShape& shape,
             int dtype = mshadow::kFloat32) override {
-    sorted_key_attrs_.emplace_back(key, shape, dtype, stype);
+    sorted_key_attrs_.emplace_back(key, shape, dtype);
   }
 
   void InitBuffersAndComm(const std::vector<NDArray>& src) {
@@ -448,10 +492,12 @@ class CommDevice : public Comm {
     auto& buf = merge_buf_[key];
     std::vector<NDArray> reduce(src.size());
 
-    const NDArrayStorageType stype = buf.merged.storage_type();
+    const NDArrayStorageType stype = src[0].storage_type();
+    NDArray& buf_merged = buf.merged_buf(stype);
+    // normal dense reduce
     if (stype == kDefaultStorage) {
-      CopyFromTo(src[0], &(buf.merged), priority);
-      reduce[0] = buf.merged;
+      CopyFromTo(src[0], &buf_merged, priority);
+      reduce[0] = buf_merged;
 
       if (buf.copy_buf.empty()) {
         // TODO(mli) this results in large device memory usage for huge ndarray,
@@ -461,7 +507,7 @@ class CommDevice : public Comm {
         buf.copy_buf.resize(src.size()-1);
         for (size_t i = 0; i < src.size()-1; ++i) {
           buf.copy_buf[i] = NDArray(
-            buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype());
+            buf_merged.shape(), buf_merged.ctx(), false, buf_merged.dtype());
         }
       }
       for (size_t i = 0; i < src.size()-1; ++i) {
@@ -469,21 +515,24 @@ class CommDevice : public Comm {
         reduce[i+1] = buf.copy_buf[i];
       }
     } else {
+      // sparse reduce
       if (buf.copy_buf.empty()) {
+        // initialize buffer for copying during reduce
         buf.copy_buf.resize(src.size());
         for (size_t j = 0; j < src.size(); ++j) {
-          buf.copy_buf[j] = NDArray(
-            buf.merged.storage_type(), buf.merged.shape(), buf.merged.ctx(),
-            true, buf.merged.dtype());
+          buf.copy_buf[j] = NDArray(stype, src[0].shape(), buf_merged.ctx(), true, src[0].dtype());
         }
       }
+      CHECK(src[0].storage_type() == buf.copy_buf[0].storage_type())
+           << "Storage type mismatch detected. " << src[0].storage_type() << "(src) vs. "
+           << buf.copy_buf[0].storage_type() << "(buf.copy_buf)";
       for (size_t i = 0; i < src.size(); ++i) {
         CopyFromTo(src[i], &(buf.copy_buf[i]), priority);
         reduce[i] = buf.copy_buf[i];
       }
     }
-    ElementwiseSum(reduce, &buf.merged, priority);
-    return buf.merged;
+    ElementwiseSum(reduce, &buf_merged, priority);
+    return buf_merged;
   }
 
   const NDArray& ReduceCompressed(int key, const std::vector<NDArray>& src,
@@ -544,10 +593,10 @@ class CommDevice : public Comm {
         }
       }
     } else {
-      auto& buf = merge_buf_[key];
-      CopyFromTo(src, &buf.merged, priority);
+      auto& buf_merged = merge_buf_[key].merged_buf(src.storage_type());
+      CopyFromTo(src, &buf_merged, priority);
       for (auto d : dst) {
-        CopyFromTo(buf.merged, d, priority);
+        CopyFromTo(buf_merged, d, priority);
       }
     }
   }
@@ -565,14 +614,26 @@ class CommDevice : public Comm {
                << "BroadcastRowSparse expects row_sparse dst NDArray";
       CHECK_EQ(row_id.ctx(), src.ctx())
               << "row_id and src are expected to be on the same context";
+
       // retain according to indices
-      const bool is_diff_ctx = out->ctx() != src.ctx();
-      NDArray out_gpu = is_diff_ctx? NDArray(kRowSparseStorage, out->shape(),
-          src.ctx(), true, out->dtype(), out->aux_types()) : *out;
+      const bool is_same_ctx = out->ctx() == src.ctx();
+      const bool is_diff_var = out->var() != src.var();
+      NDArray retained_gpu = (is_same_ctx && is_diff_var) ? *out :
+          NDArray(kRowSparseStorage, out->shape(), src.ctx(), true,
+                  out->dtype(), out->aux_types());
+      if (!is_diff_var) {
+        common::LogOnce("The output of row_sparse_pull() on key " + std::to_string(key) +
+                        " refers to the same NDArray as the one stored in KVStore. "
+                        "Performing row_sparse_pull() with such output is going to change the "
+                        "data stored in KVStore. Incorrect results may be generated "
+                        "next time row_sparse_pull() is called. To avoid such an issue, "
+                        "consider creating a new NDArray buffer to store the output.");
+      }
+
       Engine::Get()->PushAsync([=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           const TBlob& indices = row_id.data();
           using namespace mxnet::common;
-          NDArray temp = out_gpu;
+          NDArray temp = retained_gpu;
           switch (temp.ctx().dev_mask()) {
             case cpu::kDevMask: {
               SparseRetainOpForwardRspWrapper<cpu>(rctx.get_stream<cpu>(),
@@ -591,9 +652,9 @@ class CommDevice : public Comm {
             default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
           }
           on_complete();
-        }, out_gpu.ctx(), {src.var(), row_id.var()}, {out_gpu.var()},
+        }, retained_gpu.ctx(), {src.var(), row_id.var()}, {retained_gpu.var()},
       FnProperty::kNormal, priority, "KVStoreSparseRetain");
-      CopyFromTo(out_gpu, out, priority);
+      CopyFromTo(retained_gpu, out, priority);
     }
   }
 
@@ -640,7 +701,7 @@ class CommDevice : public Comm {
 #endif
   }
 
-  using KeyAttrs = std::tuple<int, TShape, int, NDArrayStorageType>;
+  using KeyAttrs = std::tuple<int, TShape, int>;
   // try to allocate buff on device evenly
   void InitMergeBuffer(const std::vector<Context>& devs) {
     std::sort(sorted_key_attrs_.begin(), sorted_key_attrs_.end(), [](
@@ -652,11 +713,11 @@ class CommDevice : public Comm {
     for (auto d : devs) {
       ctx_info[d.dev_id] = std::make_pair(d, 0);
     }
+
     for (size_t i = 0; i < sorted_key_attrs_.size(); ++i) {
       const int key  = std::get<0>(sorted_key_attrs_[i]);
       const TShape& shape = std::get<1>(sorted_key_attrs_[i]);
       const int type = std::get<2>(sorted_key_attrs_[i]);
-      const NDArrayStorageType stype = std::get<3>(sorted_key_attrs_[i]);
       auto& buf = merge_buf_[key];
       Context ctx;
       size_t min_size = std::numeric_limits<size_t>::max();
@@ -667,11 +728,10 @@ class CommDevice : public Comm {
           min_size = size;
         }
       }
-      if (stype == kDefaultStorage) {
-        buf.merged = NDArray(shape, ctx, false, type);
-      } else {
-        buf.merged = NDArray(stype, shape, ctx, true, type);
-      }
+      // Delayed allocation - the dense merged buffer might not be used at all if push()
+      // only sees sparse arrays.
+      bool delay_alloc = true;
+      buf.merged = NDArray(shape, ctx, delay_alloc, type);
       ctx_info[ctx.dev_id].second += shape.Size();
     }
     inited_ = true;
@@ -680,9 +740,9 @@ class CommDevice : public Comm {
   std::vector<KeyAttrs> sorted_key_attrs_;
   /// \brief temporal space for pushing and pulling
   struct BufferEntry {
-    /// \brief the merged value
+    /// \brief the dense merged value for reduce and broadcast operations
     NDArray merged;
-    /// \brief the gpu buffer
+    /// \brief the gpu buffer for copy during reduce operation
     std::vector<NDArray> copy_buf;
     /// \brief the residual buffer for gradient compression
     std::vector<NDArray> residual;
@@ -690,6 +750,26 @@ class CommDevice : public Comm {
     std::vector<NDArray> compressed_send_buf;
     /// \brief the small buffer for compressed data in receiver
     std::vector<NDArray> compressed_recv_buf;
+
+    /// \brief the merged buffer for the given storage type (could be either dense or row_sparse)
+    inline NDArray& merged_buf(NDArrayStorageType stype) {
+      if (stype == kDefaultStorage) {
+        CHECK(!merged.is_none()) << "unintialized merge buffer detected";
+        return merged;
+      }
+      CHECK(stype == kRowSparseStorage) << "unexpected storage type " << stype;
+      // check if sparse_merged is initialized
+      if (sparse_merged.is_none()) {
+        CHECK(!merged.is_none());
+        sparse_merged = NDArray(kRowSparseStorage, merged.shape(), merged.ctx(),
+                                true, merged.dtype());
+      }
+      return sparse_merged;
+    }
+
+   private:
+    /// \brief the sparse merged value for reduce and rowsparse broadcast operations
+    NDArray sparse_merged;
   };
   std::unordered_map<int, BufferEntry> merge_buf_;
   bool inited_;
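
The merged_buf() accessor above keeps one delay-allocated dense buffer and materializes a sparse sibling only on first use, so a key that is only ever pushed sparse never pays for the dense allocation. A standalone sketch of the same lazy dual-buffer idea (plain vectors in place of NDArrays):

    #include <cassert>
    #include <memory>
    #include <vector>

    struct BufferEntrySketch {
      std::vector<float> dense;                    // analogous to `merged`
      std::unique_ptr<std::vector<float>> sparse;  // analogous to `sparse_merged`

      // Returns the buffer for the requested "storage type", allocating lazily.
      std::vector<float>& buf(bool want_sparse, size_t n) {
        if (!want_sparse) {
          if (dense.empty()) dense.resize(n);      // delayed dense allocation
          return dense;
        }
        if (!sparse) sparse.reset(new std::vector<float>(n));
        return *sparse;
      }
    };

    int main() {
      BufferEntrySketch entry;
      std::vector<float>& d = entry.buf(false, 8);
      std::vector<float>& s = entry.buf(true, 8);
      assert(&d != &s);  // dense and sparse merges never alias
      return 0;
    }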
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 3383c97f926..38ecf121dfe 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -276,8 +276,8 @@ class KVStoreLocal : public KVStore {
       // invalid, print warning messages once
       if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) {
         LOG(INFO) << "Warning: non-default weights detected during kvstore pull. "
-                  << "This call has been ignored. "
-                  << "Please make sure to use row_sparse_pull with row_ids.";
+                     "This call has been ignored. Please make sure to use "
+                     "kv.row_sparse_pull() or module.prepare() with row_ids.";
         this->warnings_printed_.insert(key);
       }
       return false;
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index b428c2cbefb..94d3d90413a 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -200,6 +200,7 @@ NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const {
     ret.ptr_->delay_alloc = false;
     ret.ptr_->static_data = true;
     ret.byte_offset_ = byte_offset_;
+    ret.reuse_ = false;
     return ret;
   }
 }
@@ -217,6 +218,7 @@ NDArray NDArray::Reshape(const TShape &shape) const {
   // Otherwise, reshape only works on the default layout.
   CHECK_EQ(storage_type(), kDefaultStorage);
   ret.shape_ = shape;
+  ret.reuse_ = false;
   return ret;
 }
 
@@ -249,6 +251,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const {
   MSHADOW_TYPE_SWITCH(ret.dtype(), DType, {
     ret.byte_offset_ += begin * length * sizeof(DType);
   });
+  ret.reuse_ = false;
   ret.shape_[0] = end - begin;
   return ret;
 }
@@ -348,7 +351,8 @@ void NDArray::Chunk::Reorder2Default() {
     return;
 
   mkldnn_memory_format_t format = mkl_mem_->GetDefaultFormat();
-  CHECK_NE(format, mkl_mem_->GetFormat());
+  if (format == mkl_mem_->GetFormat())
+    return;
 
   mkldnn::memory::primitive_desc def_pd = mkl_mem_->GetPrimitiveDesc(format);
   mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd));
@@ -485,8 +489,8 @@ const mkldnn::memory *NDArray::GetMKLDNNData(
 }
 
 const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    const mkldnn::memory::primitive_desc &new_pd) const {
+  if (new_pd.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
   }
@@ -495,24 +499,41 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
   const mkldnn::memory *mem = GetMKLDNNData();
   // If the memory descriptor matches, it's easy.
   MKLDNNStream *stream = MKLDNNStream::Get();
-  if (mem->get_primitive_desc() == desc) {
-    return GetMKLDNNExact(mem, desc);
+  if (mem->get_primitive_desc() == new_pd) {
+    return GetMKLDNNExact(mem, new_pd);
   }
 
-  mkldnn::memory::primitive_desc _desc = desc;
+  mkldnn::memory::primitive_desc _pd = new_pd;
+  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
+  mkldnn::memory::desc desc2 = _pd.desc();
   // Now we need to determine if we should reorder the memory.
   // If both use the default format, no reorder is needed.
-  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
-  mkldnn::memory::desc desc2 = _desc.desc();
   if (desc1.data.format == GetDefaultFormat(desc1) &&
       desc2.data.format == GetDefaultFormat(desc2)) {
-    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    mkldnn_mem_ptr ret(new mkldnn::memory(new_pd, mem->get_data_handle()));
     stream->RegisterMem(ret);
     return ret.get();
-  } else {
-    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(desc);
+  } else if (same_shape(desc1, desc2)) {
+    // If they have the same shape, we can reorder data directly.
+    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_pd);
     stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
     return ret;
+  } else {
+    // If they have different shapes, we need to reshape the array first.
+    // Since this method will only be used inside an operator, we can call
+    // MKLDNNDataReshape to reshape an array.
+    TShape required_shape(desc2.data.ndims);
+    for (int i = 0; i < desc2.data.ndims; i++)
+      required_shape[i] = desc2.data.dims[i];
+    NDArray reshaped = MKLDNNDataReshape(required_shape);
+    const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+    if (ret->get_primitive_desc() == new_pd) {
+      return GetMKLDNNExact(ret, new_pd);
+    } else {
+      mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_pd);
+      stream->RegisterPrim(mkldnn::reorder(*ret, *ret2));
+      return ret2;
+    }
   }
 }
 
@@ -537,6 +558,7 @@ NDArray NDArray::Reorder2Default() const {
   // reshape as needed
   ret.shape_ = shape_;
   ret.byte_offset_ = byte_offset_;
+  ret.reuse_ = false;
   return ret;
 }
 
@@ -566,38 +588,49 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc)
 
 const mkldnn::memory *NDArray::GetMKLDNNData() const {
   CHECK(storage_type() == kDefaultStorage);
-  // If this array uses MKLDNN layout, we have to make sure it's not a view.
-  // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
-    CHECK(!IsView());
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-  if (IsView()) {
-    mkldnn::memory::primitive_desc pd = ptr_->mkl_mem_->GetPrimitiveDesc();
-    // Sliced array must use the default layout.
-    CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
-    void *off_addr = static_cast<char *>(ptr_->mkl_mem_->GetDataHandle())
-        + byte_offset_;
-
+  bool is_view = IsView();
+  if (IsMKLDNNData()) {
+    // If this array uses MKLDNN layout, we have to make sure it's not a view.
+    // Otherwise, we'll have to change the layout inside the array.
+    CHECK(!is_view);
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
+    // If this array uses MKLDNN format, we should return now. Otherwise,
+    // SetMKLMem may mess up mkl_mem_.
+    return ptr_->mkl_mem_->GetRaw();
+  } else if (is_view) {
+    // If this is a view, we can't create a MKLDNN memory for the chunk
+    // because we don't have the complete data type and shape information for
+    // the chunk.
+    void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_;
     // Create the primitive desc for the new mkldnn memory.
     mkldnn::memory::dims dims(shape().ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape()[i];
     mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(
         GetDefaultFormat(shape().ndim()));
-    mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
-        pd.desc().data.data_type);
+    mkldnn::memory::data_type cpp_type = get_mkldnn_type(dtype_);
     mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
+    mkldnn::memory::primitive_desc new_pd(data_md,
+                                          CpuEngine::Get()->get_engine());
 
     std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(new_pd, off_addr));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
   } else {
+    // If this isn't a view, we can create a MKLDNN memory and store it in the
+    // chunk.
+    ptr_->SetMKLMem(shape_, dtype_);
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
     return ptr_->mkl_mem_->GetRaw();
   }
 }
 
+void NDArray::InvalidateMKLDNNData() {
+  // Removing mkl_mem_ means the NDArray will store data in the default format.
+  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->IsMKLDNN())
+    ptr_->mkl_mem_ = nullptr;
+}
+
 void NDArray::CopyFrom(const mkldnn::memory &mem) {
   CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
   if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem)
@@ -608,20 +641,23 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
   MKLDNNStream *stream = MKLDNNStream::Get();
   // If this array uses MKLDNN layout, we have to make sure it's not a view.
   // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
-    CHECK(!IsView());
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
-                  dtype_);
-  stream->RegisterMem(ptr_->mkl_mem_->GetMem());
-  mkldnn::memory::desc from_desc = mem.get_primitive_desc().desc();
-  mkldnn::memory::desc this_desc = ptr_->mkl_mem_->GetPrimitiveDesc().desc();
+
+  if (IsMKLDNNData() && IsView())
+    ptr_->Reorder2Default();
+
+  const mkldnn::memory *this_mem = GetMKLDNNData();
+  mkldnn::memory::primitive_desc from_pd = mem.get_primitive_desc();
+  mkldnn::memory::desc from_desc = from_pd.desc();
+  mkldnn::memory::primitive_desc this_pd = this_mem->get_primitive_desc();
+  mkldnn::memory::desc this_desc = this_pd.desc();
   mkldnn_memory_format_t from_def_format = GetDefaultFormat(from_desc);
+  mkldnn_memory_format_t this_def_format = GetDefaultFormat(this_desc);
   if (IsView()) {
     // Sliced array must use the default layout.
     CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format);
   }
   // It's possible that the memory and the NDArray don't have the same shape.
-  if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)
+  if (!same_shape(this_desc, from_desc)
       // If the source memory uses the default layout, we can reshape directly.
       && from_def_format == from_desc.data.format) {
     // In this case, we can simply create a new MKLDNN memory for the required
@@ -631,15 +667,14 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
     auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
     auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
     mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
     mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
-  } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) {
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (!same_shape(this_desc, from_desc)) {
     // In this case, the source memory stores data in a customized layout. We
     // need to reorganize the data in memory before we can reshape.
-    mkldnn::memory::primitive_desc def_pd = GetPrimitiveDesc(mem.get_primitive_desc(),
-                                                             from_def_format);
+    mkldnn::memory::primitive_desc def_pd = GetPrimitiveDesc(from_pd, from_def_format);
     mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_pd);
     stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
     // Now we can reshape it
@@ -648,45 +683,40 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
     auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
     auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
     mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
     mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
-  } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->GetPrimitiveDesc()) {
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (from_pd == this_pd) {
     // If the layout is the same, we can just copy data.
-    stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_->GetRaw()));
+    stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
   } else {
-    mkldnn_memory_format_t src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
-    mkldnn_memory_format_t dst_def = ptr_->mkl_mem_->GetDefaultFormat();
     // If neither uses the default layout, there isn't much we can do
     // other than reorder the data layout directly.
-    if (dst_def != ptr_->mkl_mem_->GetFormat()
-        && src_def != mem.get_primitive_desc().desc().data.format) {
-      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_->GetRaw()));
-    } else if (dst_def == ptr_->mkl_mem_->GetFormat()) {
+    if (this_def_format != this_desc.data.format
+        && from_def_format != from_desc.data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
+    } else if (this_def_format == this_desc.data.format) {
       // If the dest mem uses the default memory layout, we can simply use
       // the default format of the source memory to improve perf of reorder.
-      mkldnn::memory::primitive_desc pd = ptr_->mkl_mem_->GetPrimitiveDesc(src_def);
-      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->GetDataHandle()));
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(from_pd,
+                                                           from_def_format);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, this_mem->get_data_handle()));
       stream->RegisterMem(tmp_mem);
       stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
     } else {
       // If the src mem uses the default memory layout, we can use
       // the default format of the source memory to improve perf.
-      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(this_pd,
+                                                           this_def_format);
       mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
       stream->RegisterMem(tmp_mem);
-      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
     }
   }
 }
-mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
-                                                mkldnn_memory_format_t format);
 
 mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
-  // This array shouldn't be a view.
-  CHECK(!IsView());
-
   if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
@@ -697,10 +727,26 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &
   mkldnn_memory_format_t def_format = GetDefaultFormat(_desc.desc());
   // If the required format is a default format, we don't need to worry about the shape.
   // If the shape isn't the same, it actually implicitly reshapes data.
-  if (required_format == def_format) {
+  if (required_format == def_format && !IsView()) {
     ptr_->SetMKLMem(shape_, dtype_);
     MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
     return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc);
+  } else if (required_format == def_format) {
+    ptr_->CheckAndAlloc();
+    CHECK(ptr_->shandle.dptr);
+    // When this is a view and a user wants the default layout, we can simply
+    // create a new mkldnn memory that points to the right memory.
+    std::shared_ptr<mkldnn::memory> mem(new mkldnn::memory(
+            desc, static_cast<char *>(ptr_->shandle.dptr) + byte_offset_));
+    MKLDNNStream::Get()->RegisterMem(mem);
+    return mem.get();
+  } else if (IsView()) {
+    // If this is a view and a user wants to write data to it with a special
+    // MKLDNN format, we should reorder the data in the array and return NULL.
+    // In this way, the user will create a new NDArray for the special format
+    // and copy data back.
+    ptr_->Reorder2Default();
+    return nullptr;
   }
 
   if (ptr_->mkl_mem_)
@@ -1089,9 +1135,8 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
                              to_mem->get_primitive_desc().get_size());
       memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
     } else {
-      std::vector<mkldnn::primitive> net;
-      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+      const_cast<NDArray &>(to).CopyFrom(*from_mem);
+      MKLDNNStream::Get()->Submit();
     }
   } else {
     // In this case, one of the NDArray isn't supported by MKLDNN, we need
@@ -1275,7 +1320,7 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
       CHECK_EQ(source[i].ctx().dev_mask(), Context::kCPU)
           << "operands context mismatch";
     } else {
-      CHECK(source[i].ctx() == out->ctx())
+      CHECK_EQ(source[i].ctx(), out->ctx())
           << "operands context mismatch";
     }
   }
@@ -1882,7 +1927,7 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
     if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
       Engine::Get()->PushAsync(
         [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-          const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
+          const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
@@ -1892,17 +1937,17 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
       Engine::Get()->PushAsync(
         [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-          const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
+          const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
           on_complete();
-        }, this->ctx(), const_vars, {this->var()},
+        }, src.ctx(), const_vars, {this->var()},
         FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU");
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
       Engine::Get()->PushAsync(
         [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-          const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
+          const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
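
To summarize the reworked GetMKLDNNDataReorder above: it returns the memory
as-is when the primitive descriptors already match, wraps the same bytes in a
new descriptor when both sides use the default format, reorders directly when
only the layout differs, and reshapes before reordering when the shapes differ.
A schematic sketch of that decision order, with toy descriptors standing in for
the real MKLDNN primitive descriptors:

    #include <iostream>
    #include <vector>

    // Toy descriptor: a shape plus a flag for "default (plain) format".
    struct Desc {
      std::vector<int> dims;
      bool default_format;
      bool operator==(const Desc& o) const {
        return dims == o.dims && default_format == o.default_format;
      }
    };

    enum Action { UseExact, WrapInPlace, ReorderDirect, ReshapeThenReorder };

    Action plan_reorder(const Desc& have, const Desc& want) {
      if (have == want) return UseExact;                 // descriptors match
      if (have.default_format && want.default_format)
        return WrapInPlace;                              // reinterpret the same bytes
      if (have.dims == want.dims) return ReorderDirect;  // same shape, new layout
      return ReshapeThenReorder;                         // reshape first, then reorder
    }

    int main() {
      Desc a{{2, 3}, true}, b{{3, 2}, true}, c{{2, 3}, false};
      std::cout << plan_reorder(a, a) << " "    // 0: UseExact
                << plan_reorder(a, b) << " "    // 1: WrapInPlace
                << plan_reorder(a, c) << "\n";  // 2: ReorderDirect
      return 0;
    }
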
diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc
index 288fe449734..53052ad5a25 100644
--- a/src/operator/contrib/bounding_box.cc
+++ b/src/operator/contrib/bounding_box.cc
@@ -117,7 +117,7 @@ NNVM_REGISTER_OP(_contrib_box_iou)
   Example::
 
     x = [[0.5, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5]]
-    y = [0.25, 0.25, 0.75, 0.75]
+    y = [[0.25, 0.25, 0.75, 0.75]]
     box_iou(x, y, format='corner') = [[0.1428], [0.1428]]
 
 )doc" ADD_FILELINE)
@@ -137,8 +137,8 @@ NNVM_REGISTER_OP(_contrib_box_iou)
 .add_arguments(BoxOverlapParam::__FIELDS__());
 
 NNVM_REGISTER_OP(_backward_contrib_box_iou)
-.set_num_inputs(2)
-.set_num_outputs(1)
+.set_num_inputs(1)
+.set_num_outputs(2)
 .set_attr_parser(ParamParser<BoxOverlapParam>)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FCompute>("FCompute<cpu>", BoxOverlapBackward<cpu>)
diff --git a/src/operator/contrib/ctc_include/detail/cpu_ctc.h b/src/operator/contrib/ctc_include/detail/cpu_ctc.h
index ba8bbc558f0..005b956343d 100644
--- a/src/operator/contrib/ctc_include/detail/cpu_ctc.h
+++ b/src/operator/contrib/ctc_include/detail/cpu_ctc.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #pragma once
 
 #include <tuple>
diff --git a/src/operator/contrib/ctc_include/detail/ctc_helper.h b/src/operator/contrib/ctc_include/detail/ctc_helper.h
index 35b7a960149..250188c697c 100644
--- a/src/operator/contrib/ctc_include/detail/ctc_helper.h
+++ b/src/operator/contrib/ctc_include/detail/ctc_helper.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #pragma once
 
 #include <limits>
diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc.h b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
index c249046424e..8015b39c437 100644
--- a/src/operator/contrib/ctc_include/detail/gpu_ctc.h
+++ b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #pragma once
 
 
diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h b/src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h
index 7f53232f871..c9bc2026efb 100644
--- a/src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h
+++ b/src/operator/contrib/ctc_include/detail/gpu_ctc_kernels.h
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 #pragma once
 
 #include "../contrib/moderngpu/include/device/ctascan.cuh"
diff --git a/src/operator/contrib/ctc_include/detail/hostdevice.h b/src/operator/contrib/ctc_include/detail/hostdevice.h
index 7bec1e0017c..f7f0425bf26 100644
--- a/src/operator/contrib/ctc_include/detail/hostdevice.h
+++ b/src/operator/contrib/ctc_include/detail/hostdevice.h
@@ -1,3 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
 #pragma once
 
 #ifdef __CUDACC__
diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc
index a2e681a8e60..e5a7dd8fb63 100644
--- a/src/operator/contrib/multibox_detection.cc
+++ b/src/operator/contrib/multibox_detection.cc
@@ -96,11 +96,16 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
   const int num_anchors = cls_prob.size(2);
   const int num_batches = cls_prob.size(0);
   const DType *p_anchor = anchors.dptr_;
+
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  std::vector<DType> outputs;
+  outputs.resize(num_anchors * 6);  // resize (not reserve) so operator[] below is valid
   for (int nbatch = 0; nbatch < num_batches; ++nbatch) {
     const DType *p_cls_prob = cls_prob.dptr_ + nbatch * num_classes * num_anchors;
     const DType *p_loc_pred = loc_pred.dptr_ + nbatch * num_anchors * 4;
     DType *p_out = out.dptr_ + nbatch * num_anchors * 6;
-    int valid_count = 0;
+
+#pragma omp parallel for num_threads(omp_threads)
     for (int i = 0; i < num_anchors; ++i) {
       // find the predicted class id and probability
       DType score = -1;
@@ -112,20 +117,33 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
           id = j;
         }
       }
+
       if (id > 0 && score < threshold) {
         id = 0;
       }
-      if (id > 0) {
-        // [id, prob, xmin, ymin, xmax, ymax]
-        p_out[valid_count * 6] = id - 1;  // remove background, restore original id
-        p_out[valid_count * 6 + 1] = (id == 0 ? DType(-1) : score);
-        int offset = i * 4;
-        TransformLocations(p_out + valid_count * 6 + 2, p_anchor + offset,
-          p_loc_pred + offset, clip, variances[0], variances[1],
-          variances[2], variances[3]);
+
+      // [id, prob, xmin, ymin, xmax, ymax]
+      outputs[i * 6] = id - 1;
+      outputs[i * 6 + 1] = score;
+      int offset = i * 4;
+      TransformLocations(outputs.data() + i * 6 + 2, p_anchor + offset, p_loc_pred + offset, clip,
+                         variances[0], variances[1], variances[2], variances[3]);
+    }
+
+    int valid_count = 0;
+    for (int i = 0; i < num_anchors; ++i) {
+      int offset1 = valid_count * 6;
+      int offset2 = i * 6;
+      if (outputs[offset2] >= 0) {
+        p_out[offset1]     = outputs[offset2];
+        p_out[offset1 + 1] = outputs[offset2 + 1];
+        p_out[offset1 + 2] = outputs[offset2 + 2];
+        p_out[offset1 + 3] = outputs[offset2 + 3];
+        p_out[offset1 + 4] = outputs[offset2 + 4];
+        p_out[offset1 + 5] = outputs[offset2 + 5];
         ++valid_count;
       }
-    }  // end iter num_anchors
+    }
 
     if (valid_count < 1 || nms_threshold <= 0 || nms_threshold > 1) continue;
 
@@ -138,22 +156,29 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
       sorter.push_back(SortElemDescend<DType>(p_out[i * 6 + 1], i));
     }
     std::stable_sort(sorter.begin(), sorter.end());
+
     // re-order output
     DType *ptemp = temp_space.dptr_ + nbatch * num_anchors * 6;
     int nkeep = static_cast<int>(sorter.size());
     if (nms_topk > 0 && nms_topk < nkeep) {
+      // keep topk detections
       nkeep = nms_topk;
+      for (int i = nkeep; i < valid_count; ++i) {
+        p_out[i * 6] = -1;
+      }
     }
     for (int i = 0; i < nkeep; ++i) {
       for (int j = 0; j < 6; ++j) {
         p_out[i * 6 + j] = ptemp[sorter[i].index * 6 + j];
       }
     }
+
     // apply nms
-    for (int i = 0; i < valid_count; ++i) {
+#pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < nkeep; ++i) {
       int offset_i = i * 6;
       if (p_out[offset_i] < 0) continue;  // skip eliminated
-      for (int j = i + 1; j < valid_count; ++j) {
+      for (int j = i + 1; j < nkeep; ++j) {
         int offset_j = j * 6;
         if (p_out[offset_j] < 0) continue;  // skip eliminated
         if (force_suppress || (p_out[offset_i] == p_out[offset_j])) {
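
The multibox_detection change above splits the anchor loop into an OpenMP
parallel pass that scores every anchor independently, followed by a serial
compaction pass, because the valid_count counter creates a loop-carried
dependency that cannot be parallelized safely. A minimal sketch of that
compute-then-compact pattern (illustrative only, not the operator itself):

    #include <omp.h>
    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 8;
      std::vector<float> scores(n);

      // Pass 1: independent per-element work, safe to parallelize.
      #pragma omp parallel for
      for (int i = 0; i < n; ++i) {
        scores[i] = (i % 3 == 0) ? -1.0f : static_cast<float>(i);  // -1 marks invalid
      }

      // Pass 2: serial compaction; valid_count depends on earlier iterations.
      int valid_count = 0;
      for (int i = 0; i < n; ++i) {
        if (scores[i] >= 0) scores[valid_count++] = scores[i];
      }
      std::printf("kept %d of %d\n", valid_count, n);
      return 0;
    }
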
diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h
index 3a3a9c34927..fb20ef0bddd 100644
--- a/src/operator/contrib/psroi_pooling-inl.h
+++ b/src/operator/contrib/psroi_pooling-inl.h
@@ -1,7 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /*!
  * Copyright (c) 2017 by Contributors
  * Copyright (c) 2017 Microsoft
- * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file psroi_pooling-inl.h
  * \brief psroi pooling operator and symbol
  * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai
diff --git a/src/operator/contrib/roi_align-inl.h b/src/operator/contrib/roi_align-inl.h
new file mode 100644
index 00000000000..5ac420cc3d4
--- /dev/null
+++ b/src/operator/contrib/roi_align-inl.h
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file roi_align-inl.h
+ * \brief roi align operator and symbol
+ * \author Hang Zhang
+ * modified from Caffe2
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_ROI_ALIGN_INL_H_
+#define MXNET_OPERATOR_CONTRIB_ROI_ALIGN_INL_H_
+
+#include <vector>
+#include <utility>
+#include "../mshadow_op.h"
+#include "../tensor/init_op.h"
+
+
+namespace mxnet {
+namespace op {
+
+
+// Declare enumeration of input order to make code more intuitive.
+// These enums are only visible within this header
+namespace roialign {
+enum ROIAlignOpInputs {kData, kBox};
+enum ROIAlignOpOutputs {kOut};
+}  // namespace roialign
+
+
+struct ROIAlignParam : public dmlc::Parameter<ROIAlignParam> {
+  TShape pooled_size;
+  float spatial_scale;
+  DMLC_DECLARE_PARAMETER(ROIAlignParam) {
+    DMLC_DECLARE_FIELD(pooled_size)
+    .set_expect_ndim(2).enforce_nonzero()
+    .describe("ROI Align output roi feature map height and width: (h, w)");
+    DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0)
+    .describe("Ratio of input feature map height (or w) to raw image height (or w). "
+    "Equals the reciprocal of total stride in convolutional layers");
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_ROI_ALIGN_INL_H_
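
Together with the FInferShape logic registered in roi_align.cc below, the two
fields above fully determine the operator's output: (num_rois, channels,
pooled_h, pooled_w), with spatial_scale mapping raw-image coordinates onto the
feature map. A small sketch of that relationship, using hypothetical values:

    #include <cstdio>

    int main() {
      // Hypothetical inputs: data is (batch, c, h, w); rois is (num_rois, 5).
      const int num_rois = 128, channels = 256;
      const int pooled_h = 7, pooled_w = 7;      // param.pooled_size
      const float spatial_scale = 1.0f / 16.0f;  // reciprocal of the total stride

      // Output follows Shape4(num_rois, channels, pooled_h, pooled_w).
      std::printf("out shape = (%d, %d, %d, %d)\n",
                  num_rois, channels, pooled_h, pooled_w);
      // An roi x-coordinate of 320 px lands at feature-map column 20.
      std::printf("320 px -> column %.1f\n", 320 * spatial_scale);
      return 0;
    }
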
diff --git a/src/operator/contrib/roi_align.cc b/src/operator/contrib/roi_align.cc
new file mode 100644
index 00000000000..c2cb929966a
--- /dev/null
+++ b/src/operator/contrib/roi_align.cc
@@ -0,0 +1,584 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file roi_align.cc
+ * \brief roi align operator
+ * \author Hang Zhang
+ * Adapted from Caffe2
+*/
+#include "./roi_align-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>* pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with cases where inverse elements are outside the feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc->at(pre_calc_index) = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc->at(pre_calc_index) = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ROIAlignForward(
+    const int nthreads,
+    const T* bottom_data,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    int roi_cols,
+    T* top_data) {
+  DCHECK(roi_cols == 4 || roi_cols == 5);
+
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    // roi could have 4 or 5 columns
+    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+    int roi_batch_ind = 0;
+    if (roi_cols == 5) {
+      roi_batch_ind = offset_bottom_rois[0];
+      offset_bottom_rois++;
+    }
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels;
+    // this is the key point of the optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        &pre_calc);
+
+    int c;
+#pragma omp parallel for private(c) \
+num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_bottom_data =
+          bottom_data + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                  pc.w2 * offset_bottom_data[pc.pos2] +
+                  pc.w3 * offset_bottom_data[pc.pos3] +
+                  pc.w4 * offset_bottom_data[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          top_data[index] = output_val;
+        }  // for pw
+      }  // for ph
+    }  // for c
+  }  // for n
+}
+
+
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T* w1,
+    T* w2,
+    T* w3,
+    T* w4,
+    int* x_low,
+    int* x_high,
+    int* y_low,
+    int* y_high,
+    const int /*index*/ /* index for debug only*/) {
+  // deal with cases where inverse elements are outside the feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    *w1 = *w2 = *w3 = *w4 = 0.;
+    *x_low = *x_high = *y_low = *y_high = -1;
+    return;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  *y_low = static_cast<int>(y);
+  *x_low = static_cast<int>(x);
+
+  if (*y_low >= height - 1) {
+    *y_high = *y_low = height - 1;
+    y = (T)*y_low;
+  } else {
+    *y_high = *y_low + 1;
+  }
+
+  if (*x_low >= width - 1) {
+    *x_high = *x_low = width - 1;
+    x = (T)*x_low;
+  } else {
+    *x_high = *x_low + 1;
+  }
+
+  T ly = y - *y_low;
+  T lx = x - *x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+inline void add(const T& val, T* address) {
+  *address += val;
+}
+
+template <typename T>
+void ROIAlignBackward(
+    const int nthreads,
+    const T* top_diff,
+    const int /*num_rois*/,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois,
+    int rois_cols) {
+  DCHECK(rois_cols == 4 || rois_cols == 5);
+
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * rois_cols;
+    int roi_batch_ind = 0;
+    if (rois_cols == 5) {
+      roi_batch_ind = offset_bottom_rois[0];
+      offset_bottom_rois++;
+    }
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            &w1,
+            &w2,
+            &w3,
+            &w4,
+            &x_low,
+            &x_high,
+            &y_low,
+            &y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
+          add(static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
+          add(static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
+          add(static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
+        }  // if
+      }  // ix
+    }  // iy
+  }  // for
+}  // ROIAlignBackward
+
+
+template<typename xpu>
+void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& out_data) {
+  using namespace mshadow;
+  size_t expected_in = 2;
+  size_t expected_out = 1;
+  CHECK_EQ(in_data.size(), expected_in);
+  CHECK_EQ(out_data.size(), expected_out);
+  CHECK_EQ(out_data[roialign::kOut].shape_[0], in_data[roialign::kBox].shape_[0]);
+
+  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);
+
+  const int count = out_data[roialign::kOut].Size();
+  // const int num_rois = in_data[roialign::kBox].size(0);
+  const int channels = in_data[roialign::kData].size(1);
+  const int height = in_data[roialign::kData].size(2);
+  const int width = in_data[roialign::kData].size(3);
+  const int pooled_height = out_data[roialign::kOut].size(2);
+  const int pooled_width = out_data[roialign::kOut].size(3);
+  const int rois_cols = in_data[roialign::kBox].size(1);
+
+  // assume all the data and gradient have the same type
+  MSHADOW_REAL_TYPE_SWITCH(in_data[0].type_flag_, DType, {
+    const DType *bottom_data = in_data[roialign::kData].dptr<DType>();
+    const DType *bottom_rois = in_data[roialign::kBox].dptr<DType>();
+    DType *top_data = out_data[roialign::kOut].dptr<DType>();
+
+    ROIAlignForward<DType>(count, bottom_data, param.spatial_scale, channels,
+                           height, width, pooled_height, pooled_width, -1, bottom_rois,
+                           rois_cols, top_data);
+  })
+}
+
+template<typename xpu>
+void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+
+  CHECK_EQ(inputs.size(), 2);
+  CHECK_EQ(outputs.size(), 2);
+  // the order here relates to the order in ROIAlignGrad
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(1, inputs[1]);
+  // std::vector<TBlob> out_data(1, inputs[2]);
+
+  CHECK_EQ(out_grad[0].shape_[0], in_data[0].shape_[0]);
+  CHECK_NE(req[0], kWriteInplace) <<
+    "ROIAlign: Backward doesn't support kWriteInplace.";
+  CHECK_NE(req[1], kWriteInplace) <<
+    "ROIAlign: Backward doesn't support kWriteInplace.";
+
+  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);
+
+  const int count = out_grad[0].Size();
+  const int num_rois = in_data[0].size(0);
+  const int channels = outputs[0].size(1);
+  const int height = outputs[0].size(2);
+  const int width = outputs[0].size(3);
+  const int pooled_height = out_grad[0].size(2);
+  const int pooled_width = out_grad[0].size(3);
+  const int rois_cols = in_data[0].size(1);
+
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  // assume all the data and gradient have the same type
+  MSHADOW_REAL_TYPE_SWITCH(out_grad[0].type_flag_, DType, {
+    const DType *top_diff = out_grad[0].dptr<DType>();
+    const DType *bottom_rois = in_data[0].dptr<DType>();
+    DType *grad_in = outputs[0].dptr<DType>();
+
+    if (kAddTo == req[roialign::kData] || kWriteTo == req[roialign::kData]) {
+      if (kWriteTo == req[roialign::kData]) {
+        Fill<false>(s, outputs[0], kWriteTo, static_cast<DType>(0));
+      }
+      ROIAlignBackward<DType>(count, top_diff, num_rois, param.spatial_scale,
+                     channels, height, width, pooled_height, pooled_width,
+                     -1, grad_in, bottom_rois, rois_cols);
+    }
+    if (kWriteTo == req[roialign::kBox]) {
+      Fill<false>(s, outputs[1], kWriteTo, static_cast<DType>(0));
+    }
+  })
+}
+
+DMLC_REGISTER_PARAMETER(ROIAlignParam);
+
+NNVM_REGISTER_OP(_contrib_ROIAlign)
+.describe(R"code(
+This operator takes a 4D feature map as an input array and region proposals as `rois`,
+then aligns the feature map over sub-regions of the input and produces a fixed-size output array.
+This operator is typically used in Faster R-CNN & Mask R-CNN networks.
+
+Different from ROI pooling, ROI Align removes the harsh quantization, properly aligning
+the extracted features with the input. RoIAlign computes the value of each sampling point
+by bilinear interpolation from the nearby grid points on the feature map. No quantization is
+performed on any coordinates involved in the RoI, its bins, or the sampling points.
+Bilinear interpolation is used to compute the exact values of the
+input features at four regularly sampled locations in each RoI bin.
+The feature map can then be aggregated by average pooling.
+
+
+Reference
+---------
+
+He, Kaiming, et al. "Mask R-CNN." ICCV, 2017
+)code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "rois"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output"};
+})
+.set_attr_parser(ParamParser<ROIAlignParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
+      std::vector<TShape> *in_shape, std::vector<TShape> *out_shape){
+  using namespace mshadow;
+  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]";
+  // data: [batch_size, c, h, w]
+  TShape dshape = in_shape->at(roialign::kData);
+  CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor";
+  // bbox: [num_rois, 5]
+  TShape bshape = in_shape->at(roialign::kBox);
+  CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]";
+  CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]";
+  // out: [num_rois, c, pooled_h, pooled_w]
+  out_shape->clear();
+  out_shape->push_back(
+       Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
+  return true;
+})
+.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
+      std::vector<int> *in_type, std::vector<int> *out_type) {
+  CHECK_EQ(in_type->size(), 2);
+  int dtype = (*in_type)[0];
+  CHECK_EQ(dtype, (*in_type)[1]);
+  CHECK_NE(dtype, -1) << "Input must have specified type";
+
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
+})
+.set_attr<FCompute>("FCompute<cpu>", ROIAlignForwardCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient",
+  [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[roialign::kOut]);
+    heads.push_back(n->inputs[roialign::kBox]);
+    return MakeGradNode("_backward_ROIAlign", n, heads, n->attrs.dict);
+  })
+.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator, a 4D Feature maps")
+.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array")
+.add_arguments(ROIAlignParam::__FIELDS__());
+
+
+NNVM_REGISTER_OP(_backward_ROIAlign)
+.set_num_outputs(2)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr_parser(ParamParser<ROIAlignParam>)
+.set_attr<FCompute>("FCompute<cpu>", ROIAlignBackwardCompute<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
+
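
The pre-calculated w1..w4 above are ordinary bilinear-interpolation weights over
the four neighboring grid points, so for any in-bounds sampling point they form
a convex combination summing to 1. A small self-contained check of that
property, mirroring the weight computation in pre_calc_for_bilinear_interpolate:

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
      // A sampling point inside the feature map, same convention as the kernel.
      float y = 2.3f, x = 4.7f;
      int y_low = static_cast<int>(y), x_low = static_cast<int>(x);
      float ly = y - y_low, lx = x - x_low;  // fractional offsets
      float hy = 1.f - ly, hx = 1.f - lx;
      float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

      assert(std::fabs(w1 + w2 + w3 + w4 - 1.f) < 1e-6f);
      std::printf("w = %.2f %.2f %.2f %.2f\n", w1, w2, w3, w4);
      return 0;
    }
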
diff --git a/src/operator/contrib/roi_align.cu b/src/operator/contrib/roi_align.cu
new file mode 100644
index 00000000000..21066ea15fa
--- /dev/null
+++ b/src/operator/contrib/roi_align.cu
@@ -0,0 +1,484 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file roi_align.cu
+ * \brief roi align operator
+ * \author Hang Zhang
+ * Adapted from Caffe2
+*/
+#include "./roi_align-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                                 \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+using namespace mshadow::cuda;
+
+// The maximum number of blocks to use in the default kernel call.
+constexpr int ROI_MAXIMUM_NUM_BLOCKS = 4096;
+
+/**
+ * @brief Compute the number of blocks needed to run N threads.
+ */
+inline int ROI_GET_BLOCKS(const int N) {
+  return std::max(
+      std::min(
+          (N + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock,
+          ROI_MAXIMUM_NUM_BLOCKS),
+      // Use at least 1 block, since CUDA does not allow empty block
+      1);
+}
+
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* bottom_data,
+    const int height,
+    const int width,
+    T y,
+    T x,
+    const int index /* index for debug only*/) {
+  // deal with cases where inverse elements are outside the feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  int y_low = static_cast<int>(y);
+  int x_low = static_cast<int>(x);
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = bottom_data[y_low * width + x_low];
+  T v2 = bottom_data[y_low * width + x_high];
+  T v3 = bottom_data[y_high * width + x_low];
+  T v4 = bottom_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForwardKernel(
+    const int nthreads,
+    const T* bottom_data,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral pooling
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
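+    // Samples sit at the centers of a roi_bin_grid_h x roi_bin_grid_w
+    // sub-grid inside each bin; e.g. bin_size_h = 1 with roi_bin_grid_h = 2
+    // places samples at offsets 0.25 and 0.75 within the bin.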
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(
+            offset_bottom_data, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T* w1,
+    T* w2,
+    T* w3,
+    T* w4,
+    int* x_low,
+    int* x_high,
+    int* y_low,
+    int* y_high,
+    const int /*index*/ /* index for debug only */) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    *w1 = *w2 = *w3 = *w4 = 0.;
+    *x_low = *x_high = *y_low = *y_high = -1;
+    return;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  *y_low = static_cast<int>(y);
+  *x_low = static_cast<int>(x);
+
+  if (*y_low >= height - 1) {
+    *y_high = *y_low = height - 1;
+    y = (T)*y_low;
+  } else {
+    *y_high = *y_low + 1;
+  }
+
+  if (*x_low >= width - 1) {
+    *x_high = *x_low = width - 1;
+    x = (T)*x_low;
+  } else {
+    *x_high = *x_low + 1;
+  }
+
+  T ly = y - *y_low;
+  T lx = x - *x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = bottom_data[*y_low * width + *x_low];
+  // T v2 = bottom_data[*y_low * width + *x_high];
+  // T v3 = bottom_data[*y_high * width + *x_low];
+  // T v4 = bottom_data[*y_high * width + *x_high];
+  // T val = (w1 * v1 + *w2 * v2 + *w3 * v3 + *w4 * v4);
+
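+  // Same four interpolation weights as the forward pass; the gradient w.r.t.
+  // each corner pixel is the incoming gradient scaled by its weight.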
+  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackwardKernel(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral pooling
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            &w1,
+            &w2,
+            &w3,
+            &w4,
+            &x_low,
+            &x_high,
+            &y_low,
+            &y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
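+        // Scatter the four weighted gradients with atomicAdd: neighboring
+        // bins and overlapping ROIs can touch the same input pixel, so
+        // plain stores would race.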
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+          /*
+          gpu_atomic_add(
+              static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
+          gpu_atomic_add(
+              static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
+          gpu_atomic_add(
+              static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
+          gpu_atomic_add(
+              static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
+          */
+        }  // if
+      }  // ix
+    }  // iy
+  }  // CUDA_1D_KERNEL_LOOP
+}  // RoIAlignBackwardKernel
+
+template<typename xpu>
+void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<TBlob>& in_data,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& out_data) {
+  using namespace mshadow;
+  size_t expected_in = 2;
+  size_t expected_out = 1;
+  CHECK_EQ(in_data.size(), expected_in);
+  CHECK_EQ(out_data.size(), expected_out);
+  CHECK_EQ(out_data[roialign::kOut].shape_[0], in_data[roialign::kBox].shape_[0]);
+
+  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);
+
+  const int count = out_data[roialign::kOut].Size();
+  const int num_rois = in_data[roialign::kBox].size(0);
+  const int channels = in_data[roialign::kData].size(1);
+  const int height = in_data[roialign::kData].size(2);
+  const int width = in_data[roialign::kData].size(3);
+  const int pooled_height = out_data[roialign::kOut].size(2);
+  const int pooled_width = out_data[roialign::kOut].size(3);
+
+  Stream<gpu> *s = ctx.get_stream<gpu>();
+  cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
+  MSHADOW_REAL_TYPE_SWITCH(in_data[0].type_flag_, DType, {
+    const DType *bottom_data = in_data[roialign::kData].dptr<DType>();
+    const DType *bottom_rois = in_data[roialign::kBox].dptr<DType>();
+    DType *top_data = out_data[roialign::kOut].dptr<DType>();
+    RoIAlignForwardKernel<DType>
+      <<<ROI_GET_BLOCKS(count),
+         kMaxThreadsPerBlock,
+         0,
+         stream>>>(
+          count,
+          bottom_data,
+          param.spatial_scale,
+          channels,
+          height,
+          width,
+          pooled_height,
+          pooled_width,
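+          // sampling_ratio <= 0 lets the kernel derive the sampling grid
+          // size adaptively from the ROI size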
+          -1,
+          bottom_rois,
+          top_data);
+  })
+}
+
+
+template<typename xpu>
+void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+
+  CHECK_EQ(inputs.size(), 2);
+  CHECK_EQ(outputs.size(), 2);
+  // the order here relates to the order in ROIAlignGrad
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(1, inputs[1]);
+  // std::vector<TBlob> out_data(1, inputs[2]);
+
+  CHECK_EQ(out_grad[0].shape_[0], in_data[0].shape_[0]);
+  CHECK_NE(req[0], kWriteInplace) <<
+    "ROIAlign: Backward doesn't support kWriteInplace.";
+  CHECK_NE(req[1], kWriteInplace) <<
+    "ROIAlign: Backward doesn't support kWriteInplace.";
+
+  const ROIAlignParam param = nnvm::get<ROIAlignParam>(attrs.parsed);
+
+  const int count = out_grad[0].Size();
+  const int num_rois = in_data[0].size(0);
+  const int channels = outputs[0].size(1);
+  const int height = outputs[0].size(2);
+  const int width = outputs[0].size(3);
+  const int pooled_height = out_grad[0].size(2);
+  const int pooled_width = out_grad[0].size(3);
+
+  Stream<gpu> *s = ctx.get_stream<gpu>();
+  cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
+
+  // assume all the data and gradient have the same type
+  MSHADOW_REAL_TYPE_SWITCH(out_grad[0].type_flag_, DType, {
+    const DType *top_diff = out_grad[0].dptr<DType>();
+    const DType *bottom_rois = in_data[0].dptr<DType>();
+    DType *grad_in = outputs[0].dptr<DType>();
+
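+    // No gradient flows to the ROI coordinates; just zero the box gradient.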
+    if (kWriteTo == req[roialign::kBox]) {
+      Fill<false>(s, outputs[1], kWriteTo, static_cast<DType>(0));
+    }
+    if (kNullOp == req[roialign::kData]) return;
+    if (kWriteTo == req[roialign::kData]) {
+      Fill<false>(s, outputs[0], kWriteTo, static_cast<DType>(0));
+    }
+    RoIAlignBackwardKernel<DType>
+    <<<ROI_GET_BLOCKS(count),
+       kMaxThreadsPerBlock,
+       0,
+       stream>>>(
+        count,
+        top_diff,
+        num_rois,
+        param.spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        -1,
+        grad_in,
+        bottom_rois);
+  })
+}
+
+
+NNVM_REGISTER_OP(_contrib_ROIAlign)
+.set_attr<FCompute>("FCompute<gpu>", ROIAlignForwardCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_ROIAlign)
+.set_attr<FCompute>("FCompute<gpu>", ROIAlignBackwardCompute<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/transformer-inl.h b/src/operator/contrib/transformer-inl.h
new file mode 100644
index 00000000000..01faf244aff
--- /dev/null
+++ b/src/operator/contrib/transformer-inl.h
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file transformer-inl.h
+ * \brief Function used in cc and cu
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_TRANSFORMER_INL_H_
+#define MXNET_OPERATOR_CONTRIB_TRANSFORMER_INL_H_
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../mxnet_op.h"
+#include "../mshadow_op.h"
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu>
+static void DivSqrtDimForward_(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  double sqrt_dim = std::sqrt(static_cast<double>(inputs[0].shape_[inputs[0].ndim() - 1]));
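+  // Elementwise out[i] = in[i] / sqrt(last_dim); e.g. an input of shape
+  // (2, 10, 64) is scaled by 1/8. This is the 1/sqrt(d_k) rescaling used
+  // in scaled dot-product attention.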
+  MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+      mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::div, Req>, xpu>::Launch(
+        s, inputs[0].Size(), outputs[0].dptr<DType>(), inputs[0].dptr<DType>(), DType(sqrt_dim));
+    });
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_CONTRIB_TRANSFORMER_INL_H_
diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc
new file mode 100644
index 00000000000..00085c0dc7a
--- /dev/null
+++ b/src/operator/contrib/transformer.cc
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file transformer.cc
+ * \brief CPU implementation of the operators used in Transformer
+ */
+#include <mxnet/base.h>
+#include "./transformer-inl.h"
+#include "../tensor/elemwise_unary_op.h"
+
+namespace mxnet {
+namespace op {
+
+// div_sqrt_dim
+MXNET_OPERATOR_REGISTER_UNARY(_contrib_div_sqrt_dim)
+.describe(R"code(Rescale the input by the square root of the channel dimension.
+
+   out = data / sqrt(data.shape[-1])
+
+)code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", DivSqrtDimForward_<cpu>)
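+// Since d(x / sqrt(d)) / dx = 1 / sqrt(d), the backward pass is the same
+// rescaling applied to the output gradient, so no inputs need to be saved.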
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_contrib_div_sqrt_dim"});
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/transformer.cu b/src/operator/contrib/transformer.cu
new file mode 100644
index 00000000000..6ed073db601
--- /dev/null
+++ b/src/operator/contrib/transformer.cu
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file transformer.cu
+ * \brief GPU implementation of the operators used in Transformer
+ */
+#include <mxnet/base.h>
+#include "./transformer-inl.h"
+
+namespace mxnet {
+namespace op {
+
+// div_sqrt_dim
+NNVM_REGISTER_OP(_contrib_div_sqrt_dim)
+.set_attr<FCompute>("FCompute<gpu>", DivSqrtDimForward_<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index 1a54b73660c..b33a717d15b 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -38,7 +38,7 @@ namespace mxnet {
 namespace op {
 #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
 template<typename DType>
-class CuDNNRNNOp : public Operator {
+class CuDNNRNNOp : public Operator{
  public:
   explicit CuDNNRNNOp(RNNParam param) {
     this->param_ = param;
@@ -76,9 +76,39 @@ class CuDNNRNNOp : public Operator {
       param_.lstm_q_ = true;
     else
       param_.lstm_q_ = false;
+
+    // Create all cuDNN descriptors up front; they are destroyed
+    // unconditionally in the destructor, independent of init_cudnn_.
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_));
+
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_));
+
+    CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_));
+    CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_));
   }
 
   ~CuDNNRNNOp() {
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_));
+
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_));
+    CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_));
+    CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_));
+
     if (init_cudnn_) {
       for (size_t i = 0; i < x_desc_vec_.size(); ++i) {
         CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i]));
@@ -86,26 +116,16 @@ class CuDNNRNNOp : public Operator {
         CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]));
         CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]));
       }
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_));
-
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_));
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_));
-      CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_));
-      CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_));
-      Storage::Get()->Free(dropout_states_);
+      init_cudnn_ = false;
+
       Storage::Get()->Free(reserve_space_);
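+      // dropout_states_ is only allocated when dropout is enabled (p > 0),
+      // so only free it in that case.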
+      if (param_.p > 0) {
+        Storage::Get()->Free(dropout_states_);
+      }
     }
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
+  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_args) {
@@ -394,15 +414,6 @@ class CuDNNRNNOp : public Operator {
       strideA[1] = dimA[2];
       strideA[2] = 1;
 
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_));
-
       CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_,
                                             dtype_,
                                             3,
@@ -445,20 +456,19 @@ class CuDNNRNNOp : public Operator {
                                             strideA));
 
       // Create Dropout descriptors
-      CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_));
-      CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_,
-                                           &dropout_byte_));
-      dropout_size_ = dropout_byte_ / sizeof(DType);
-      dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU());
-      CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_,
-                                           s->dnn_handle_,
-                                           param_.p,  // keep probability
-                                           dropout_states_.dptr,
-                                           dropout_byte_,
+      if (param_.p > 0) {
+        CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_));
+        dropout_size_ = dropout_byte_ / sizeof(DType);
+        dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU());
+      } else {
+        dropout_states_ = {};
+        dropout_byte_ = 0;
+      }
+      CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_,
+                                           param_.p,  // discard probability
+                                           dropout_states_.dptr, dropout_byte_,
                                            seed_));
       // RNN descriptors
-      CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_));
-
       #if CUDNN_MAJOR >= 6
         cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
         CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_,
@@ -513,8 +523,6 @@ class CuDNNRNNOp : public Operator {
       CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size);
 
       // Set param descriptors
-      CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_));
-      CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_));
       int dim_w[3] = {1, 1, 1};
       dim_w[0] = w.shape_[0];
       CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_,
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index d1286170c2c..151db60975e 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -494,6 +494,9 @@ LINALG_XPU_BATCH_TRMM(gpu, double)
 // for further information about the function and its parameters.
 // Note that this is A = potrf(A), so A is input and output parameter.
 
+static const char *potrf_errstr
+  = "This may happen when the input matrix is either not symmetric or not positive definite.";
+
 template<typename xpu, typename DType>
 inline void check_potrf(const Tensor<xpu, 2, DType>& A, bool lower) {
   // Any checking that helps user debug potential problems.
@@ -507,7 +510,7 @@ void linalg_potrf<cpu, DType>(const Tensor<cpu, 2, DType>& A, bool lower, Stream
   check_potrf(A, lower); \
   int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, (lower ? 'L' : 'U'), A.size(0),  \
           A.dptr_ , A.stride_)); \
-  CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \
+  CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu. " << potrf_errstr; \
 }
 LINALG_CPU_POTRF(spotrf, float)
 LINALG_CPU_POTRF(dpotrf, double)
@@ -589,6 +592,10 @@ LINALG_GPU_BATCH_POTRF(DnDpotrf, double)
 // for further information about the function and its parameters.
 // Note that this is A = potri(A), so A is input and output parameter.
 
+static const char *potri_errstr
+  = "This may happen when the input matrix is not a Cholesky factorization obtained"
+    " by a prior call of the potrf-operator.";
+
 template<typename xpu, typename DType>
 inline void check_potri(const Tensor<xpu, 2, DType>& A, bool lower) {
   // Any checking that helps user debug potential problems.
@@ -601,7 +608,7 @@ void linalg_potri<cpu, DType>(const Tensor<cpu, 2, DType>& A, bool lower, Stream
   check_potri(A, lower); \
   int ret(MXNET_LAPACK_##fname(MXNET_LAPACK_ROW_MAJOR, (lower ? 'L' : 'U'), A.size(0),  \
           A.dptr_ , A.stride_)); \
-  CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu."; \
+  CHECK_EQ(ret, 0) << #fname << " failed in lapack on cpu. " << potri_errstr; \
 }
 LINALG_CPU_POTRI(spotri, float)
 LINALG_CPU_POTRI(dpotri, double)
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index 32a7a5ad617..a9f6dbeda89 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -83,7 +83,7 @@ struct hash<mxnet::op::ActivationParam> {
 namespace mxnet {
 namespace op {
 
-template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+template<typename xpu, typename ForwardOp, typename BackwardOp>
 void ActivationForward(const OpContext &ctx, const TBlob &in_data,
                        const OpReqType &req, const TBlob &out_data) {
   using namespace mshadow;
@@ -91,16 +91,16 @@ void ActivationForward(const OpContext &ctx, const TBlob &in_data,
   Stream<xpu> *s = ctx.get_stream<xpu>();
   const size_t sz = in_data.shape_.Size();
   if (sz) {
-    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
-      mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
-        s, sz,
-        out_data.dptr<DType>(),
-        in_data.dptr<DType>());
+    MSHADOW_REAL_TYPE_SWITCH(in_data.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
+          s, sz, out_data.dptr<DType>(), in_data.dptr<DType>());
+      });
     });
   }
 }
 
-template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+template<typename xpu, typename ForwardOp, typename BackwardOp>
 void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
                         const TBlob &out_data, const OpReqType &req,
                         const TBlob &in_grad) {
@@ -109,13 +109,12 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
   Stream<xpu> *s = ctx.get_stream<xpu>();
   const size_t sz = out_data.shape_.Size();
   if (sz) {
-    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
-      mxnet_op::Kernel<mxnet_op::op_with_req<
-        mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
-        s, sz,
-        in_grad.dptr<DType>(),
-        out_grad.dptr<DType>(),
-        out_data.dptr<DType>());
+    MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<
+          mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
+            s, sz, in_grad.dptr<DType>(), out_grad.dptr<DType>(), out_data.dptr<DType>());
+      });
     });
   }
 }
@@ -123,72 +122,68 @@ void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
 template<typename xpu>
 void ActivationComputeImpl(const ActivationParam &param, const OpContext &ctx,
                            const TBlob &input, OpReqType req, const TBlob &output) {
-  MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        ActivationForward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kSigmoid:
-        ActivationForward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kTanh:
-        ActivationForward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kSoftReLU:
-        ActivationForward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kSoftSign:
-        ActivationForward<xpu, mshadow_op::softsign, mshadow_op::softsign_grad, DType>(
-                ctx, input, req, output);
-            break;
-      default:
-        LOG(FATAL) << "unknown activation type";
-    }
-  });
+  switch (param.act_type) {
+    case activation::kReLU:
+      ActivationForward<xpu, mshadow_op::relu, mshadow_op::relu_grad>(
+          ctx, input, req, output);
+      break;
+    case activation::kSigmoid:
+      ActivationForward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad>(
+          ctx, input, req, output);
+      break;
+    case activation::kTanh:
+      ActivationForward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad>(
+          ctx, input, req, output);
+      break;
+    case activation::kSoftReLU:
+      ActivationForward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(
+          ctx, input, req, output);
+      break;
+    case activation::kSoftSign:
+      ActivationForward<xpu, mshadow_op::softsign, mshadow_op::softsign_grad>(
+              ctx, input, req, output);
+          break;
+    default:
+      LOG(FATAL) << "unknown activation type";
+  }
 }
 
 template<typename xpu>
 void ActivationGradComputeImpl(const ActivationParam &param, const OpContext &ctx,
                                const TBlob &out_grad, const TBlob &out_data,
                                OpReqType req, const TBlob &output) {
-  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        ActivationBackward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kSigmoid:
-        ActivationBackward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kTanh:
-        ActivationBackward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kSoftReLU:
-        ActivationBackward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kSoftSign:
-        ActivationBackward<xpu, mshadow_op::softsign, mshadow_op::softsign_grad, DType>(
-                ctx, out_grad, out_data, req, output);
-            break;
-      default:
-        LOG(FATAL) << "unknown activation type";
-    }
-  });
+  switch (param.act_type) {
+    case activation::kReLU:
+      ActivationBackward<xpu, mshadow_op::relu, mshadow_op::relu_grad>(
+          ctx, out_grad, out_data, req, output);
+      break;
+    case activation::kSigmoid:
+      ActivationBackward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad>(
+          ctx, out_grad, out_data, req, output);
+      break;
+    case activation::kTanh:
+      ActivationBackward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad>(
+          ctx, out_grad, out_data, req, output);
+      break;
+    case activation::kSoftReLU:
+      ActivationBackward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(
+          ctx, out_grad, out_data, req, output);
+      break;
+    case activation::kSoftSign:
+      ActivationBackward<xpu, mshadow_op::softsign, mshadow_op::softsign_grad>(
+              ctx, out_grad, out_data, req, output);
+          break;
+    default:
+      LOG(FATAL) << "unknown activation type";
+  }
 }
 
 template<typename xpu>
 void ActivationCompute(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+                       const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
@@ -197,18 +192,19 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs,
 
 template<typename xpu>
 void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 #if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  CHECK_EQ(inputs.size(), 3U);
+  bool relu = param.act_type == activation::kReLU;
+  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
 #else
   CHECK_EQ(inputs.size(), 2U);
 #endif
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   ActivationGradComputeImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], outputs[0]);
 }
 
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 382efeb1447..595b8912ccc 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -45,7 +45,12 @@ struct ActivationGrad {
     std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
     heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
 #if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-    heads.push_back(n->inputs[activation::kData]);
+    const NodeAttrs& attrs = n->attrs;
+    // for ReLU, no need to pass input data. This enables inplace optimization during the
+    // forward pass.
+    if (dmlc::get<ActivationParam>(attrs.parsed).act_type != activation::kReLU) {
+      heads.push_back(n->inputs[activation::kData]);
+    }
 #endif
     return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
@@ -74,13 +79,15 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<NDArray>& inputs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 3U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  bool relu = param.act_type == activation::kReLU;
+  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
   if (SupportMKLDNN(inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[2], req[0],
+    // XXX: for y = relu(x), y is passed as "in_data" to Backward()
+    MKLDNNActivationBackward(attrs, ctx, inputs[0], relu ? inputs[1] : inputs[2], req[0],
                              outputs[0]);
-      MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   ActivationGradComputeImpl<cpu>(param, ctx, inputs[0].data(), inputs[1].data(),
@@ -112,23 +119,29 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           DispatchMode* dispatch_mode,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
+  bool ret = false;
 #if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  CHECK_EQ(in_attrs->size(), 3U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  if (param.act_type != activation::kReLU) {
+    CHECK_EQ(in_attrs->size(), 3U);
+    ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
+                                                         dispatch_mode,
+                                                         in_attrs, out_attrs);
+  } else {
+    // for ReLU activation, the backward pass only needs ograd and output
+    CHECK_EQ(in_attrs->size(), 2U);
+    ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
+                                                         dispatch_mode,
+                                                         in_attrs, out_attrs);
+  }
 #else
   CHECK_EQ(in_attrs->size(), 2U);
+  ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
+                                                       dispatch_mode,
+                                                       in_attrs, out_attrs);
 #endif
   CHECK_EQ(out_attrs->size(), 1U);
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#else
-  bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#endif
 #if MXNET_USE_MKLDNN == 1
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
     *dispatch_mode = DispatchMode::kFComputeEx;
   }
@@ -162,7 +175,12 @@ The following activation functions are supported:
 .add_arguments(ActivationParam::__FIELDS__());
 
 NNVM_REGISTER_OP(_backward_Activation)
-.set_num_inputs(3)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
+    // for ReLU activation, the backward pass only needs ograd and output
+    if (act_type == activation::kReLU) return 2;
+    return 3;
+  })
 .set_num_outputs(1)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index dc435b2acc1..68b4053efdd 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -55,12 +55,13 @@ void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 
-  // SoftReLU not supported by CUDNN yet
+  // Neither SoftReLU nor SoftSign is supported by CUDNN yet
   if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(ctx,
-          inputs[0], req[0], outputs[0]);
-    });
+    ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(ctx,
+      inputs[0], req[0], outputs[0]);
+  } else if (param.act_type == activation::kSoftSign) {
+    ActivationForward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(ctx,
+      inputs[0], req[0], outputs[0]);
   } else {
     MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
       get_cudnn_op<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
@@ -70,24 +71,28 @@ void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
 
 template<>
 void ActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 3U);
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  bool relu = param.act_type == activation::kReLU;
+  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 
-  // SoftReLU not supported by CUDNN yet
+  // Neither SoftReLU nor SoftSign is supported by CUDNN yet
   if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-          ctx, inputs[0], inputs[1], req[0], outputs[0]);
-    });
+    ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(
+      ctx, inputs[0], inputs[1], req[0], outputs[0]);
+  } else if (param.act_type == activation::kSoftSign) {
+    ActivationBackward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(
+      ctx, inputs[0], inputs[1], req[0], outputs[0]);
   } else {
     MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]);
+      // XXX: for y = relu(x), y is passed as "in_data" to Backward()
+      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], relu ? inputs[1] : inputs[2],
+                                          inputs[1], req[0], outputs[0]);
     });
   }
 }
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 7fd8bbb5599..0e8a929e1ba 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -54,7 +54,8 @@ static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const std::vector<NDArray>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (SupportMKLDNNConv(params, inputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
     MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -68,7 +69,8 @@ static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                         const std::vector<NDArray>& inputs,
                                         const std::vector<OpReqType>& req,
                                         const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (SupportMKLDNNConv(params, inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
     MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -363,6 +365,18 @@ static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
     if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
     if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
   }
+  CHECK_EQ(param_.kernel.ndim(), param_.stride.ndim())
+    << "Stride must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while stride is "
+    << param_.stride;
+  CHECK_EQ(param_.kernel.ndim(), param_.dilate.ndim())
+    << "Dilate must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while dilate is "
+    << param_.dilate;
+  CHECK_EQ(param_.kernel.ndim(), param_.pad.ndim())
+    << "Padding must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while padding is "
+    << param_.pad;
   attrs->parsed = std::move(param_);
 }
 
diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h
index a89e7bfaf08..2c1f442808c 100644
--- a/src/operator/nn/cudnn/cudnn_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_activation-inl.h
@@ -130,6 +130,9 @@ class CuDNNActivationOp {
     #endif
   }
 
+  // Backward computation for the cudnn activation operator. Note that for relu
+  // it's okay to pass "out_data" as "in_data", since the gradient of relu
+  // depends only on the output, not the input.
   void Backward(const OpContext &ctx, const TBlob &out_grad,
       const TBlob &in_data, const TBlob &out_data,
       const OpReqType &req, const TBlob &in_grad) {
diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
index 239da023668..0845eb79fd6 100644
--- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
@@ -48,7 +48,7 @@ class CuDNNSoftmaxActivationOp {
   }
 
   void Forward(const OpContext &ctx, const TBlob &in_data,
-      const OpReqType &req, const TBlob &out_data) {
+               const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -102,14 +102,14 @@ class CuDNNSoftmaxActivationOp {
   }
 
   void Backward(const OpContext &ctx, const TBlob &out_grad,
-      const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
+                const TBlob &out_data, const OpReqType &req,
+                const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     float alpha = 1.0f;
     float beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4> grad;
-    Tensor<gpu, 4> data;
     Tensor<gpu, 4> output_data;
     Tensor<gpu, 4> input_grad;
     cudnnSoftmaxMode_t softmax_mode;
@@ -141,6 +141,13 @@ class CuDNNSoftmaxActivationOp {
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          input_grad.shape_[0],
+                                          input_grad.shape_[1],
+                                          input_grad.shape_[2],
+                                          input_grad.shape_[3]));
     CUDNN_CALL(cudnnSoftmaxBackward(s->dnn_handle_,
                                     CUDNN_SOFTMAX_ACCURATE,
                                     softmax_mode,
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 0d1b391104a..9e0a70121bf 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -209,7 +209,7 @@ static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs,
     if (param_.target_shape.ndim() > 2) {
       if (param_.target_shape[0] > 0) {
         CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please it carefully";
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
       }
       if (param_.target_shape[1] > 0) {
         CHECK_EQ(param_.target_shape[1], oshape[3]) \
@@ -304,7 +304,8 @@ static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
                                       const std::vector<NDArray>& inputs,
                                       const std::vector<OpReqType>& req,
                                       const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  if (SupportMKLDNNDeconv(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs);
     MKLDNN_OPCHECK_RUN(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req,
@@ -320,7 +321,8 @@ static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                           const std::vector<NDArray>& inputs,
                                           const std::vector<OpReqType>& req,
                                           const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  if (SupportMKLDNNDeconv(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs);
     MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req,
@@ -356,6 +358,22 @@ static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) {
     if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
     if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
   }
+  CHECK_EQ(param_.kernel.ndim(), param_.stride.ndim())
+    << "Stride must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while stride is "
+    << param_.stride;
+  CHECK_EQ(param_.kernel.ndim(), param_.dilate.ndim())
+    << "Dilate must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while dilate is "
+    << param_.dilate;
+  CHECK_EQ(param_.kernel.ndim(), param_.pad.ndim())
+    << "Padding must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while padding is "
+    << param_.pad;
+  CHECK_EQ(param_.kernel.ndim(), param_.adj.ndim())
+    << "Adjustment must have the same number of dimensions as kernel_size, "
+    << "but kernel_size is set to " << param_.kernel << " while adjustment is "
+    << param_.adj;
   attrs->parsed = std::move(param_);
 }
 
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index 1cabe732b6d..cdfb606900b 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -40,9 +40,15 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
                                                      const std::vector<TShape>& in_shape,
                                                      const std::vector<TShape>& out_shape,
                                                      const RunContext& rctx) {
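+  // DMLC_CXX11_THREAD_LOCAL is set by dmlc-core when the compiler supports
+  // C++11 thread_local; otherwise fall back to the MX_THREAD_LOCAL emulation.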
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<DeconvSignature,
                                          std::shared_ptr<CuDNNDeconvolutionOp<DType> >,
                                          OpHash> ops;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DeconvSignature,
+                                            std::shared_ptr<CuDNNDeconvolutionOp<DType> >,
+                                            OpHash> ops;
+#endif
   DeconvSignature key(param);
   size_t ndim = 0;
   for (auto &s : in_shape)
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 1af4798d1ce..8e4aac61354 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -375,7 +375,7 @@ void DropoutCompute(const nnvm::NodeAttrs& attrs,
                     const std::vector<TBlob>& outputs) {
   const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    static thread_local DropoutOp<xpu, DType> op;
+    DropoutOp<xpu, DType> op;
     op.Init(param);
     op.Forward(ctx, inputs, req, outputs);
   });
@@ -397,7 +397,7 @@ void DropoutGradCompute(const nnvm::NodeAttrs& attrs,
   out_data[dropout::kMask] = inputs[1];
 
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    static thread_local DropoutOp<xpu, DType> op;
+    DropoutOp<xpu, DType> op;
     op.Init(param);
     op.Backward(ctx, out_grads, out_data, req, outputs);
   });
diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc
index 68d32617e9d..e86c471dab3 100644
--- a/src/operator/nn/lrn.cc
+++ b/src/operator/nn/lrn.cc
@@ -167,7 +167,7 @@ If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :
 activity :math:`b_{x,y}^{i}` is given by the expression:
 
 .. math::
-   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
+   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \frac{\alpha}{n} \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
 
 where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
 number of kernels in the layer.
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 9be5bfbc150..50e742d48fe 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -134,7 +134,11 @@ class MKLDNNActForward {
 static MKLDNNActForward &GetActForward(const ActivationParam& param,
                                        const OpContext &ctx, const NDArray &in_data,
                                        const mkldnn::memory &in_mem) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
+#endif
   MKLDNNActSignature key(param);
   key.AddSign(ctx.is_train);
   key.AddSign(param.act_type);
@@ -165,6 +169,8 @@ void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   stream->Submit();
 }
 
+// For backward relu activation, it's okay to pass "out_data" as "in_data" to this
+// function, since the computation only involves non-zeros.
 void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                               const NDArray &out_grad, const NDArray &in_data,
                               const OpReqType &req, const NDArray &in_grad) {
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 16e5605b668..bd2faf5775a 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -137,10 +137,6 @@ static inline bool SupportMKLDNN(const NDArray &input) {
       && SupportStorageMKLDNN(input.storage_type());
 }
 
-static inline bool SupportMKLDNNConv(const NDArray &input) {
-  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
-}
-
 /*
  * This is to align address to a certain alignment.
  */
@@ -148,7 +144,11 @@ void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
 
 namespace op {
 struct ActivationParam;
-bool SupportMKLDNNAct(const op::ActivationParam& param);
+struct ConvolutionParam;
+struct DeconvolutionParam;
+bool SupportMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input);
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
 }
 
 static int GetTypeSize(int dtype) {
@@ -232,7 +232,11 @@ class TmpMemMgr {
 
  public:
   static TmpMemMgr *Get() {
+#if DMLC_CXX11_THREAD_LOCAL
     static thread_local TmpMemMgr mgr;
+#else
+    static MX_THREAD_LOCAL TmpMemMgr mgr;
+#endif
     return &mgr;
   }
 
@@ -273,12 +277,11 @@ class MKLDNNStream {
   std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
 
  public:
-  static MKLDNNStream *Get() {
-    static thread_local MKLDNNStream stream;
-    return &stream;
-  }
+  static MKLDNNStream *Get();
 
-  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+  void RegisterPrim(const mkldnn::primitive &prim) {
+    net.push_back(prim);
+  }
 
   void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
     mem_holder.push_back(mem);
@@ -288,10 +291,21 @@ class MKLDNNStream {
     return !net.empty();
   }
 
-  void Submit() {
-    if (!net.empty())
+  /*
+   * After submitting mkldnn operations for execution, we need to
+   * clean up memory held by the stream. However, sometimes users
+   * might want to separate mkldnn execution and memory cleanup.
+   */
+  void Submit(bool cleanup = true) {
+    if (!net.empty()) {
       mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    net.clear();
+      net.clear();
+    }
+    if (cleanup)
+      Cleanup();
+  }
+
+  void Cleanup() {
     mem_holder.clear();
     TmpMemMgr::Get()->Reset();
   }
@@ -349,6 +363,16 @@ inline bool same_shape(const TShape &shape, const mkldnn_dims_t dims, int ndims)
   return true;
 }
 
+inline bool same_shape(const mkldnn::memory::desc &desc1,
+                       const mkldnn::memory::desc &desc2) {
+  if (desc1.data.ndims != desc2.data.ndims)
+    return false;
+  for (int i = 0; i < desc1.data.ndims; i++)
+    if (desc1.data.dims[i] != desc2.data.dims[i])
+      return false;
+  return true;
+}
+
 inline bool same_shape(const TShape &shape, int dtype,
                        const mkldnn::memory::desc &desc) {
   return same_shape(shape, desc.data.dims, desc.data.ndims)
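
A minimal usage sketch of the new Submit(cleanup)/Cleanup() split in
MKLDNNStream (a hypothetical call site; `prim` is a placeholder
mkldnn::primitive built elsewhere):

    MKLDNNStream *stream = MKLDNNStream::Get();
    stream->RegisterPrim(prim);   // queue the operation
    stream->Submit(false);        // execute, but keep stream-held memory alive
    // ... consume results that may still alias memory in mem_holder ...
    stream->Cleanup();            // release mem_holder and reset TmpMemMgr
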
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index df37db5e780..1bd1581dbc2 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -25,6 +25,15 @@
 
 namespace mxnet {
 
+MKLDNNStream *MKLDNNStream::Get() {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local MKLDNNStream stream;
+#else
+  static MX_THREAD_LOCAL MKLDNNStream stream;
+#endif
+  return &stream;
+}
+
 void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
   if (size > *space)
     return nullptr;
@@ -57,8 +66,11 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
     this->curr_mem = static_cast<char *>(mem) + pd.get_size();
     return ret.get();
   } else {
-    LOG(WARNING) << "Allocate " << pd.get_size()
-        << " bytes with malloc directly";
+    // If curr_mem has been initialized and we still reach here, it means
+    // the currently allocated memory isn't large enough.
+    if (this->curr_mem)
+      LOG(WARNING) << "Allocate " << pd.get_size()
+          << " bytes with malloc directly";
     mkldnn_mem_ptr ret(new mkldnn::memory(pd));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
@@ -121,7 +133,7 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
     // We have to allocate new memory for the sum result.
     auto sum_res = TmpMemMgr::Get()->Alloc(
         res.second->get_primitive_desc());
-    op::Sum(*res.second, *mem, *sum_res);
+    op::MKLDNNSum(*res.second, *mem, *sum_res);
     const_cast<NDArray &>(arr).CopyFrom(*sum_res);
   }
 }
@@ -211,6 +223,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
       case mkldnn_hwio:
       case mkldnn_OIhw8i8o:
       case mkldnn_OIhw16i16o:
+      case mkldnn_OIhw4i16o4i:
       case mkldnn_OIhw8i16o2i:
       case mkldnn_OIhw8o16i2o:
       case mkldnn_OIhw8o8i:
@@ -231,6 +244,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
       case mkldnn_goihw:
       case mkldnn_gOIhw8i8o:
       case mkldnn_gOIhw16i16o:
+      case mkldnn_gOIhw4i16o4i:
       case mkldnn_gOIhw8i16o2i:
       case mkldnn_gOIhw8o16i2o:
       case mkldnn_gOIhw8o8i:
@@ -283,10 +297,7 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
     } else {
       if (in_bufs.empty())
         in_bufs.reserve(inputs.size());
-      in_bufs.emplace_back(inputs[i].shape(), inputs[i].ctx(),
-                           false, inputs[i].dtype());
-      const mkldnn::memory *mem = inputs[i].GetMKLDNNData();
-      in_bufs.back().CopyFrom(*mem);
+      in_bufs.push_back(inputs[i].Reorder2Default());
       in_blobs[i] = in_bufs.back().data();
     }
   }
@@ -294,10 +305,15 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
 
   std::vector<TBlob> out_blobs(outputs.size());
   for (size_t i = 0; i < out_blobs.size(); i++) {
-    if (req[i] == kWriteTo)
-      const_cast<NDArray &>(outputs[i]).InvalidateMKLDNNData();
-    CHECK(outputs[i].IsDefaultData());
-    out_blobs[i] = outputs[i].data();
+    NDArray output = outputs[i];
+    // Ensure the output does not use MKLDNN memory.
+    // For in-place requests, the input was already converted and copied above.
+    if ((req[i] == kWriteTo) || (req[i] == kWriteInplace))
+      const_cast<NDArray &>(output).InvalidateMKLDNNData();
+    else if (req[i] == kAddTo)
+      output = outputs[i].Reorder2Default();
+    CHECK(output.IsDefaultData());
+    out_blobs[i] = output.data();
   }
   fn(attrs, ctx, in_blobs, req, out_blobs);
 }
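
The req handling above follows MXNet's OpReqType semantics (kNullOp,
kWriteTo, kWriteInplace, kAddTo): requests that fully overwrite the output
may simply drop a stale MKLDNN layout, while kAddTo must read the existing
values, which therefore need to be reordered to the default layout first.
A restatement of the diff's logic as a sketch:

    switch (req[i]) {
      case kWriteTo:
      case kWriteInplace:                   // output is fully overwritten
        output.InvalidateMKLDNNData();      // drop any stale MKLDNN layout
        break;
      case kAddTo:                          // output is accumulated into,
        output = output.Reorder2Default();  // so current values must be readable
        break;
      default:                              // kNullOp: nothing to write
        break;
    }
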
@@ -341,7 +357,11 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
       arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
   std::atomic<bool> success(true);
 #pragma omp parallel for
+#ifdef _MSC_VER
+  for (int64_t i = 0; i < arr1.shape().Size(); i++) {
+#else
   for (size_t i = 0; i < arr1.shape().Size(); i++) {
+#endif
     if (std::abs(data1[i] - data2[i]) > atol + rtol * std::abs(data2[i]))
       success.store(false);
   }
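
The _MSC_VER branch above exists because MSVC implements only OpenMP 2.0,
whose `#pragma omp parallel for` requires a signed integral loop variable;
GCC/Clang (OpenMP 3.0+) also accept unsigned types such as size_t. A
self-contained sketch of the same pattern:

    #include <cstddef>
    #include <cstdint>

    void scale(float *data, std::size_t n, float k) {
    #pragma omp parallel for
    #ifdef _MSC_VER
      for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {  // signed index for MSVC
    #else
      for (std::size_t i = 0; i < n; i++) {                    // unsigned is fine elsewhere
    #endif
        data[i] *= k;
      }
    }
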
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
index d1c80a63eee..9046836e8e7 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -184,7 +184,11 @@ template<typename DType>
 static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
                                      const OpContext &ctx, const NDArray &in_data,
                                      unsigned flags) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
+#endif
   MKLDNNBNSignature key(param);
   key.AddSign(ctx.is_train);
   key.AddSign(in_data);
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index 240673de4ab..dbc0e94c630 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -75,7 +75,11 @@ class MKLDNNConcatFwd {
 static MKLDNNConcatFwd &GetConcatForward(
     int concat_dim, const std::vector<NDArray> &in_data,
     const std::vector<mkldnn::memory::primitive_desc> &data_md) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
+#endif
   OpSignature key;
   key.AddSign(concat_dim);
   key.AddSign(in_data);
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 453221f9b37..f851a6d2535 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -31,6 +31,12 @@
 namespace mxnet {
 namespace op {
 
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
+  if (params.kernel.ndim() != 2)
+    return false;
+  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
+}
+
 static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
     const ConvolutionParam& param, bool is_train, const NDArray &data,
     const NDArray &weights, const NDArray *bias, const NDArray &output) {
@@ -39,16 +45,15 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   if (param.dilate.ndim() == 0 && bias == nullptr) {
     mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
@@ -61,10 +66,8 @@ static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
     return mkldnn::convolution_forward::primitive_desc(desc, engine);
   } else {
     mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
+    dilates[0] = param.dilate[0] - 1;
+    dilates[1] = param.dilate[1] - 1;
     if (bias == nullptr) {
       mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
@@ -88,26 +91,23 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   if (param.dilate.ndim() == 0) {
     mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
     return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
   } else {
     mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
+    dilates[0] = param.dilate[0] - 1;
+    dilates[1] = param.dilate[1] - 1;
     mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, dilates, padding, padding,
         mkldnn::padding_kind::zero);
@@ -123,16 +123,15 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   if (param.dilate.ndim() == 0 && bias == nullptr) {
     mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
@@ -145,10 +144,8 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
     return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
   } else {
     mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
+    dilates[0] = param.dilate[0] - 1;
+    dilates[1] = param.dilate[1] - 1;
     if (bias == nullptr) {
       mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
@@ -232,7 +229,11 @@ static inline MKLDNNConvForward &GetConvFwd(
     const nnvm::NodeAttrs& attrs, bool is_train,
     const NDArray &data, const NDArray &weights,
     const NDArray *bias, const NDArray &output) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash> fwds;
+#endif
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   MKLDNNConvSignature key(param);
   key.AddSign(is_train);
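
The new CHECK_GE(..., 2U) assertions are safe because SupportMKLDNNConv
(added at the top of this file) rejects kernels that are not 2-D before any
of these descriptor builders run. A hypothetical dispatch sketch
(RunMKLDNNConvolution is a placeholder name; FallBackCompute is the helper
from mkldnn_base.cc above, and ConvolutionCompute<cpu> is assumed to be the
plain CPU implementation):

    if (SupportMKLDNNConv(param, inputs[0])) {
      // kernel.ndim() == 2 here, so stride/pad/dilate have >= 2 entries
      RunMKLDNNConvolution(attrs, ctx, inputs, req, outputs);
    } else {
      // 1-D/3-D kernels or non-fp32 inputs take the plain CPU path
      FallBackCompute(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
    }
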
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
index 71d540c969c..75e51aff006 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -35,7 +35,13 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                 const NDArray &in_data, const OpReqType &req,
                 const NDArray &out_data) {
   TmpMemMgr::Get()->Init(ctx.requested[0]);
-  auto in_mem = in_data.GetMKLDNNData();
+
+  // If the input data is a view of an MKLDNN array, we should create a new
+  // NDArray with reordered data.
+  NDArray data = in_data;
+  if (data.IsMKLDNNData() && data.IsView())
+    data = data.Reorder2Default();
+  auto in_mem = data.GetMKLDNNData();
   if (req == kAddTo) {
     TmpMemMgr::Get()->Init(ctx.requested[0]);
     // We should try and force the output memory has the same format
@@ -44,7 +50,7 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
     if (out_mem == nullptr)
       out_mem = out_data.GetMKLDNNData();
     auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc());
-    Sum(*in_mem, *out_mem, *sum_res);
+    MKLDNNSum(*in_mem, *out_mem, *sum_res);
     const_cast<NDArray &>(out_data).CopyFrom(*sum_res);
   } else {
     const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
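
The same view guard reappears in mkldnn_sum.cc below. A minimal sketch of
the pattern, with the rationale as comments (NDArray methods as used in
this patch; the interpretation of views is a reading of the diff, not a
documented guarantee):

    NDArray data = in_data;
    // A view aliases a slice of a larger buffer, so the MKLDNN memory of
    // the parent array does not describe the view itself. Reordering to
    // the default layout yields memory that matches the view's shape.
    if (data.IsMKLDNNData() && data.IsView())
      data = data.Reorder2Default();
    const mkldnn::memory *mem = data.GetMKLDNNData();
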
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index af57b68cfd3..7f3676a70dd 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -32,6 +32,12 @@
 namespace mxnet {
 namespace op {
 
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input) {
+  if (params.kernel.ndim() != 2)
+    return false;
+  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
+}
+
 static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) {
   mkldnn::memory::dims dims(1);
   // This is convolution on 4D data. The second dimension is the channel.
@@ -67,31 +73,18 @@ static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
+  dilate[0] = param.dilate[0] - 1;
+  dilate[1] = param.dilate[1] - 1;
   auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
       strides, padding, dilate);
   mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
@@ -107,31 +100,18 @@ static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
+  dilate[0] = param.dilate[0] - 1;
+  dilate[1] = param.dilate[1] - 1;
   return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
       strides, padding, dilate);
 }
@@ -144,31 +124,18 @@ static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
+  CHECK_GE(param.stride.ndim(), 2U);
+  CHECK_GE(param.pad.ndim(), 2U);
+  CHECK_GE(param.dilate.ndim(), 2U);
   mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
+  strides[0] = param.stride[0];
+  strides[1] = param.stride[1];
   mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
+  padding[0] = param.pad[0];
+  padding[1] = param.pad[1];
   mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
+  dilate[0] = param.dilate[0] - 1;
+  dilate[1] = param.dilate[1] - 1;
   if (!has_bias) {
     mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
         out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
@@ -293,8 +260,13 @@ static inline MKLDNNDeconvForward &GetDeconvFwd(
     const nnvm::NodeAttrs& attrs, const NDArray &data,
     const NDArray &weights, const NDArray *bias,
     const NDArray &output) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local
         std::unordered_map<DeconvSignature, MKLDNNDeconvForward, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL
+        std::unordered_map<DeconvSignature, MKLDNNDeconvForward, OpHash> fwds;
+#endif
   const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
   DeconvSignature key(param);
   // Here we can sign the conv op with NDArray because conv primitive will
diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
index b0b715a9da0..adb72a2a9c4 100644
--- a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
@@ -149,9 +149,15 @@ void MKLDNNLRNFwd::Execute() {
 static MKLDNNLRNFwd &GetLRNFwd(const LRNParam& param,
                                const OpContext &ctx,
                                const NDArray &in_data) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNLRNSignature,
                                          MKLDNNLRNFwd,
                                          OpHash> lrn_fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNLRNSignature,
+                                            MKLDNNLRNFwd,
+                                            OpHash> lrn_fwds;
+#endif
   auto alg_ = algorithm::lrn_across_channels;
   auto kind_ = prop_kind::forward_training;
   if (ctx.is_train) {
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 9149cb0c6a9..50937706d93 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -104,7 +104,7 @@ void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx
                               const NDArray &out_grad, const NDArray &in_data,
                               const OpReqType &req, const NDArray &in_grad);
 
-void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+void MKLDNNSum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
          const mkldnn::memory &out);
 
 }  // namespace op
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc
index 1aeb7d48dc3..259af2b9402 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling.cc
+++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc
@@ -186,9 +186,15 @@ MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
                                 const bool is_train,
                                 const NDArray &data,
                                 const NDArray &output) {
+#if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNPoolingSignature,
                                          MKLDNNPoolingFwd,
                                          OpHash> pooling_fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNPoolingSignature,
+                                            MKLDNNPoolingFwd,
+                                            OpHash> pooling_fwds;
+#endif
 
   bool with_workspace = is_train && MKLDNNRequireWorkspace(param);
   MKLDNNPoolingSignature key(param);
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
index ccad068e423..fdbfb1558f6 100644
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -31,7 +31,7 @@
 namespace mxnet {
 namespace op {
 
-void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+void MKLDNNSum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
          const mkldnn::memory &out) {
   std::vector<mkldnn::memory::primitive_desc> input_pds(2);
   std::vector<float> scales(2, 1);
@@ -59,8 +59,15 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   std::vector<float> scales(inputs.size(), 1);
   in_prims.reserve(inputs.size());
   bool pd_same = true;
+  std::vector<NDArray> in_bufs(inputs.size());
   for (size_t i = 0; i < inputs.size(); i++) {
-    auto in_mem = inputs[i].GetMKLDNNData();
+    const mkldnn::memory *in_mem;
+    if (inputs[i].IsMKLDNNData() && inputs[i].IsView()) {
+      in_bufs[i] = inputs[i].Reorder2Default();
+      in_mem = in_bufs[i].GetMKLDNNData();
+    } else {
+      in_mem = inputs[i].GetMKLDNNData();
+    }
     in_prims.push_back(*in_mem);
     in_pds[i] = in_mem->get_primitive_desc();
   }
@@ -68,9 +75,16 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   mkldnn::sum::primitive_desc pdesc(scales, in_pds);
   pd_same = pd_same && (pdesc.dst_primitive_desc() == in_pds[0]);
   auto out_mem = const_cast<NDArray&>(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc());
-  bool addr_same = out_mem->get_data_handle() == inputs[0].GetMKLDNNData()->get_data_handle();
-  if ((req == kWriteTo) ||
-      (req == kWriteInplace && pd_same && addr_same)) {
+  bool addr_same = false;
+  const void *first_data_handle;
+  if (in_bufs[0].is_none())
+    first_data_handle = inputs[0].GetMKLDNNData()->get_data_handle();
+  else
+    first_data_handle = in_bufs[0].GetMKLDNNData()->get_data_handle();
+  if (out_mem)
+    addr_same = out_mem->get_data_handle() == first_data_handle;
+  if (((req == kWriteTo) || (req == kWriteInplace && pd_same && addr_same))
+      && out_mem) {
     // do sum computation directly on output NDArray
     MKLDNNStream *stream = MKLDNNStream::Get();
     stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem));
diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh
index 0e9cff0c51e..9d004d295be 100644
--- a/src/operator/nn/pool.cuh
+++ b/src/operator/nn/pool.cuh
@@ -80,7 +80,9 @@
 
 #include <mxnet/base.h>
 #include <mxnet/operator.h>
+#include "./pool_utils.h"
 #include "../mxnet_op.h"
+#include "../mshadow_op.h"
 #include "../../common/cuda_utils.h"
 
 namespace mxnet {
@@ -208,27 +210,26 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data,
  * \brief avg/sum pooling gpu kernel for 1-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template <typename DType>
+template <typename DType, int p = 1>
 __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
                                        const int width, const int pooled_width, const int kernel_w,
-                                       const int stride_w, const int pad_w,
-                                       DType* out_data, bool getAvg = false) {
+                                       const int stride_w, const int pad_w, DType* out_data,
+                                       const bool getAvg = false) {
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  const int pw = index % pooled_width;
-	  const int c = (index / pooled_width) % channels;
-	  const int n = index / pooled_width / channels;
-	  int wstart = pw * stride_w - pad_w;
-	  int wend = min(wstart + kernel_w, width + pad_w);
-	  const int pool_size = (getAvg? (wend - wstart) : 1);
-	  wstart = max(wstart, 0);
-	  wend = min(wend, width);
-	  DType sum = 0;
-	  const DType* out_slice =
-	 		in_data + (n * channels + c) * width;
+    const int pw = index % pooled_width;
+    const int c = (index / pooled_width) % channels;
+    const int n = index / pooled_width / channels;
+    int wstart = pw * stride_w - pad_w;
+    int wend = min(wstart + kernel_w, width + pad_w);
+    const int pool_size = (getAvg? (wend - wstart) : 1);
+    wstart = max(wstart, 0);
+    wend = min(wend, width);
+    DType sum = 0;
+    const DType* out_slice = in_data + (n * channels + c) * width;
     for (int w = wstart; w < wend; ++w) {
-      sum += out_slice[w];
+      sum += a_pow_p<DType, p>::Map(out_slice[w]) / pool_size;
     }
-    out_data[index] = sum / pool_size;
+    out_data[index] = a_root_p<DType, p>::Map(sum);
   }
 }
 
@@ -236,37 +237,36 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data,
  * \brief avg/sum pooling gpu kernel for 2-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template <typename DType>
+template <typename DType, int p = 1>
 __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
                                        const int height, const int width,
                                        const int pooled_height, const int pooled_width,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_h, const int stride_w,
-                                       const int pad_h, const int pad_w,
-                                       DType* out_data, bool getAvg = false) {
+                                       const int pad_h, const int pad_w, DType* out_data,
+                                       const bool getAvg = false) {
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  const int pw = index % pooled_width;
-	  const int ph = (index / pooled_width) % pooled_height;
-	  const int c = (index / pooled_width / pooled_height) % channels;
-	  const int n = index / pooled_width / pooled_height / channels;
-	  int hstart = ph * stride_h - pad_h;
-	  int wstart = pw * stride_w - pad_w;
-	  int hend = min(hstart + kernel_h, height + pad_h);
-	  int wend = min(wstart + kernel_w, width + pad_w);
-	  const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
-	  hstart = max(hstart, 0);
-	  wstart = max(wstart, 0);
-	  hend = min(hend, height);
-	  wend = min(wend, width);
-	  DType sum = 0;
-	  const DType* out_slice =
-	 		in_data + (n * channels + c) * height * width;
-	  for (int h = hstart; h < hend; ++h) {
-		  for (int w = wstart; w < wend; ++w) {
-		    sum += out_slice[h * width + w];
-		  }
-	  }
-    out_data[index] = sum / pool_size;
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    int hend = min(hstart + kernel_h, height + pad_h);
+    int wend = min(wstart + kernel_w, width + pad_w);
+    const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    hend = min(hend, height);
+    wend = min(wend, width);
+    DType sum = 0;
+    const DType* out_slice = in_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        sum += a_pow_p<DType, p>::Map(out_slice[h * width + w]) / pool_size;
+      }
+    }
+    out_data[index] = a_root_p<DType, p>::Map(sum);
   }
 }
 
@@ -274,7 +274,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
  * \brief avg/sum pooling gpu kernel for 3-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template <typename DType>
+template <typename DType, int p = 1>
 __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
                                        const int depth, const int height, const int width,
                                        const int pooled_depth, const int pooled_height,
@@ -282,37 +282,36 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
                                        const int kernel_h, const int kernel_w,
                                        const int stride_d, const int stride_h, const int stride_w,
                                        const int pad_d, const int pad_h, const int pad_w,
-                                       DType* out_data, bool getAvg = false) {
+                                       DType* out_data, const bool getAvg = false) {
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  const int pw = index % pooled_width;
-	  const int ph = (index / pooled_width) % pooled_height;
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
     const int pd = (index / pooled_width / pooled_height) % pooled_depth;
-	  const int c = (index / pooled_width / pooled_height / pooled_depth) % channels;
-	  const int n = index / pooled_width / pooled_height / pooled_depth / channels;
+    const int c = (index / pooled_width / pooled_height / pooled_depth) % channels;
+    const int n = index / pooled_width / pooled_height / pooled_depth / channels;
     int dstart = pd * stride_d - pad_d;
-	  int hstart = ph * stride_h - pad_h;
-	  int wstart = pw * stride_w - pad_w;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
     int dend = min(dstart + kernel_d, depth + pad_d);
-	  int hend = min(hstart + kernel_h, height + pad_h);
-	  int wend = min(wstart + kernel_w, width + pad_w);
-	  const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+    int hend = min(hstart + kernel_h, height + pad_h);
+    int wend = min(wstart + kernel_w, width + pad_w);
+    const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
     dstart = max(dstart, 0);
-	  hstart = max(hstart, 0);
-	  wstart = max(wstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
     dend = min(dend, depth);
-	  hend = min(hend, height);
-	  wend = min(wend, width);
-	  DType sum = 0;
-	  const DType* out_slice =
-	 		in_data + (n * channels + c) * depth * height * width;
+    hend = min(hend, height);
+    wend = min(wend, width);
+    DType sum = 0;
+    const DType* out_slice = in_data + (n * channels + c) * depth * height * width;
     for (int d = dstart; d < dend; ++d) {
       for (int h = hstart; h < hend; ++h) {
         for (int w = wstart; w < wend; ++w) {
-          sum += out_slice[(d * height + h) * width + w];
+          sum += a_pow_p<DType, p>::Map(out_slice[(d * height + h) * width + w]) / pool_size;
         }
       }
     }
-    out_data[index] = sum / pool_size;
+    out_data[index] = a_root_p<DType, p>::Map(sum);
   }
 }
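
Assuming the pool_utils.h helpers behave as their names suggest
(a_pow_p<DType, p>::Map(x) = x^p and a_root_p<DType, p>::Map(s) = s^{1/p}),
the rewritten kernels compute, over each pooling window R,

    y = \Big( \sum_{x \in R} \frac{x^{p}}{N} \Big)^{1/p}

where N is pool_size for average pooling (getAvg = true) and 1 otherwise;
the Lp branches below never set getAvg, so Lp pooling reduces to the plain
p-norm of the window. With p = 1 this degenerates to the old avg/sum code.
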
 
@@ -482,34 +481,38 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr
  * \brief avg/sum unpooling gpu kernel for 1-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
                                          const int channels, const int width,
                                          const int pooled_width, const int kernel_w,
-                                         const int stride_w, const int pad_w,
-                                         DType* in_grad, bool isAvg = false) {
+                                         const int stride_w, const int pad_w, DType* in_grad,
+                                         const bool isAvg = false) {
   // index is the input image index in NCW
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  // find out the local index
-	  // find out the local offset
-	  const int w = index % width + pad_w;
-	  const int c = (index / width) % channels;
-	  const int n = index / width / channels;
-	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-	  const int pwend = min(w / stride_w + 1, pooled_width);
-	  DType gradient = 0;
-	  const DType* out_grad_slice =
+    // find out the local index
+    // find out the local offset
+    const int w = index % width + pad_w;
+    const int c = (index / width) % channels;
+    const int n = index / width / channels;
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    DType gradient = 0;
+    const DType* out_grad_slice =
       out_grad + (n * channels + c) * pooled_width;
+    const DType* out_data_slice =
+      out_data + (n * channels + c) * pooled_width;
     for (int pw = pwstart; pw < pwend; ++pw) {
       // figure out the pooling size
       int wstart = pw * stride_w - pad_w;
       int wend = min(wstart + kernel_w, width + pad_w);
       int pool_size = (isAvg? (wend - wstart) : 1);
-      gradient += out_grad_slice[pw] / pool_size;
+      gradient +=
+        lp_grad<DType, p>::Map(out_grad_slice[pw], in_data[index], out_data_slice[pw]) / pool_size;
     }
     // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
     // use "+=" here instead of "=" to accommodate when req=kAddTo
-	  in_grad[index] += gradient;
+    in_grad[index] += gradient;
   }
 }
 
@@ -517,43 +520,50 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
  * \brief avg/sum unpooling gpu kernel for 2-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
                                          const int channels, const int height, const int width,
                                          const int pooled_height, const int pooled_width,
                                          const int kernel_h, const int kernel_w,
                                          const int stride_h, const int stride_w,
-                                         const int pad_h, const int pad_w,
-                                         DType* in_grad, bool isAvg = false) {
+                                         const int pad_h, const int pad_w, DType* in_grad,
+                                         const bool isAvg = false) {
   // index is the input image index in NCHW
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  // find out the local index
-	  // find out the local offset
-	  const int w = index % width + pad_w;
-	  const int h = (index / width) % height + pad_h;
-	  const int c = (index / width / height) % channels;
-	  const int n = index / width / height / channels;
-	  const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-	  const int phend = min(h / stride_h + 1, pooled_height);
-	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-	  const int pwend = min(w / stride_w + 1, pooled_width);
-	  DType gradient = 0;
-	  const DType* out_grad_slice =
+    // find out the local index
+    // find out the local offset
+    const int w = index % width + pad_w;
+    const int h = (index / width) % height + pad_h;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    DType gradient = 0;
+    const DType* out_grad_slice =
       out_grad + (n * channels + c) * pooled_height * pooled_width;
-	  for (int ph = phstart; ph < phend; ++ph) {
-	 	  for (int pw = pwstart; pw < pwend; ++pw) {
-		    // figure out the pooling size
-			  int hstart = ph * stride_h - pad_h;
-			  int wstart = pw * stride_w - pad_w;
-			  int hend = min(hstart + kernel_h, height + pad_h);
-			  int wend = min(wstart + kernel_w, width + pad_w);
-			  int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
-			  gradient += out_grad_slice[ph * pooled_width + pw] / pool_size;
-		  }
-	  }
+    const DType* out_data_slice =
+      out_data + (n * channels + c) * pooled_height * pooled_width;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        // figure out the pooling size
+        int hstart = ph * stride_h - pad_h;
+        int wstart = pw * stride_w - pad_w;
+        int hend = min(hstart + kernel_h, height + pad_h);
+        int wend = min(wstart + kernel_w, width + pad_w);
+        int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
+        int out_index = ph * pooled_width + pw;
+        gradient +=
+          lp_grad<DType, p>::Map(out_grad_slice[out_index],
+                                 in_data[index],
+                                 out_data_slice[out_index]) / pool_size;
+      }
+    }
     // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
     // use "+=" here instead of "=" to accommodate when req=kAddTo
-	  in_grad[index] += gradient;
+    in_grad[index] += gradient;
   }
 }
 
@@ -561,33 +571,36 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
  * \brief avg/sum unpooling gpu kernel for 3-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
                                          const int channels, const int depth, const int height,
                                          const int width, const int pooled_depth,
                                          const int pooled_height, const int pooled_width,
                                          const int kernel_d, const int kernel_h,
                                          const int kernel_w, const int stride_d, const int stride_h,
                                          const int stride_w, const int pad_d, const int pad_h,
-                                         const int pad_w, DType* in_grad, bool isAvg = false) {
+                                         const int pad_w, DType* in_grad, const bool isAvg = false) {
   // index is the input image index in NCDHW
   CUDA_KERNEL_LOOP(index, nthreads) {
-	  // find out the local index
-	  // find out the local offset
-	  const int w = index % width + pad_w;
-	  const int h = (index / width) % height + pad_h;
+    // find out the local index
+    // find out the local offset
+    const int w = index % width + pad_w;
+    const int h = (index / width) % height + pad_h;
     const int d = (index / width / height) % depth + pad_d;
-	  const int c = (index / width / height / depth) % channels;
-	  const int n = index / width / height / depth / channels;
+    const int c = (index / width / height / depth) % channels;
+    const int n = index / width / height / depth / channels;
     const int pdstart = (d < kernel_d) ? 0 : (d - kernel_d) / stride_d + 1;
     const int pdend = min(d / stride_d + 1, pooled_depth);
-	  const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-	  const int phend = min(h / stride_h + 1, pooled_height);
-	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-	  const int pwend = min(w / stride_w + 1, pooled_width);
-	  DType gradient = 0;
-	  const DType* out_grad_slice =
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    DType gradient = 0;
+    const DType* out_grad_slice =
       out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width;
+    const DType* out_data_slice =
+      out_data + (n * channels + c) * pooled_depth * pooled_height * pooled_width;
     for (int pd = pdstart; pd < pdend; ++pd) {
       for (int ph = phstart; ph < phend; ++ph) {
         for (int pw = pwstart; pw < pwend; ++pw) {
@@ -599,13 +612,16 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
           int hend = min(hstart + kernel_h, height + pad_h);
           int wend = min(wstart + kernel_w, width + pad_w);
           int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
-          gradient += out_grad_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size;
+          int out_index = (pd * pooled_height + ph) * pooled_width + pw;
+          gradient += lp_grad<DType, p>::Map(out_grad_slice[out_index],
+                                             in_data[index],
+                                             out_data_slice[out_index]) / pool_size;
         }
       }
     }
     // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
     // use "+=" here instead of "=" to accommodate when req=kAddTo
-	  in_grad[index] += gradient;
+    in_grad[index] += gradient;
   }
 }
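
For the Lp forward expression above, the chain rule gives each input

    \frac{\partial y}{\partial x_i} = \Big( \frac{x_i}{y} \Big)^{p-1}

so lp_grad<DType, p>::Map(dy, x_i, y) plausibly scales the incoming
out_grad by (x_i / y)^{p-1} (an assumption drawn from the helper's
arguments, not from its source). This is why the unpool kernels now take
in_data and out_data in addition to out_grad.
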
 
@@ -621,8 +637,9 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
  * \param pool_type supported pooling type: max, avg, sum, lp
  * \param req_type operator request type, only supports kWriteTo for now
  * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW
+ * \param p value of p for Lp pooling
  */
-template<typename DType>
+template<typename DType, int p>
 inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
@@ -651,6 +668,13 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                    oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
                                    kernel[0], stride[0], pad[0], out_data);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_1d_gpu_kernel<DType, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
+                                   kernel[0], stride[0], pad[0], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -679,6 +703,14 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                    oshape[2], oshape[3], kernel[0], kernel[1],
                                    stride[0], stride[1], pad[0], pad[1], out_data);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_2d_gpu_kernel<DType, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   oshape[2], oshape[3], kernel[0], kernel[1],
+                                   stride[0], stride[1], pad[0], pad[1], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -710,6 +742,15 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
                                    kernel[1], kernel[2], stride[0], stride[1], stride[2],
                                    pad[0], pad[1], pad[2], out_data);
       MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_3d_gpu_kernel<DType, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
+                                   kernel[1], kernel[2], stride[0], stride[1], stride[2],
+                                   pad[0], pad[1], pad[2], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -730,8 +771,9 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
  * \param pool_type supported pooling type: max, avg, sum, lp
  * \param req_type operator request type: kNullOp, kWriteTo, kWriteInplace, kAddTo
  * \param in_grad pointer of the gradient of the operator's input tensor
+ * \param p value of p for Lp pooling
  */
-template<typename DType>
+template<typename DType, int p>
 inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
@@ -754,7 +796,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], oshape[2], kernel[0],
                                      stride[0], pad[0], in_grad, true);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
@@ -762,14 +804,22 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], oshape[2], kernel[0],
+                                     stride[0], pad[0], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_1d_gpu_kernel<DType, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], oshape[2], kernel[0],
                                      stride[0], pad[0], in_grad);
       MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
-  } else  if (kernel.ndim() == 2) {
+  } else if (kernel.ndim() == 2) {
     if (pool_enum::kMaxPooling == pool_type) {
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_max_2d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
@@ -783,7 +833,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3],
                                      oshape[2], oshape[3], kernel[0], kernel[1],
                                      stride[0], stride[1], pad[0], pad[1], in_grad, true);
@@ -792,7 +842,16 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], ishape[3],
+                                     oshape[2], oshape[3], kernel[0], kernel[1],
+                                     stride[0], stride[1], pad[0], pad[1], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_2d_gpu_kernel<DType, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3],
                                      oshape[2], oshape[3], kernel[0], kernel[1],
                                      stride[0], stride[1], pad[0], pad[1], in_grad);
@@ -815,7 +874,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3], ishape[4],
                                      oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
                                      kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
@@ -825,7 +884,17 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
       // NOLINT_NEXT_LINE(whitespace/operators)
       unpool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
                                  0, mshadow::Stream<gpu>::GetStream(s)>>>(
-                                     ishape.Size(), out_grad,
+                                     ishape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], ishape[3], ishape[4],
+                                     oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
+                                     kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
+                                     pad[2], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_3d_gpu_kernel<DType, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad, in_data, out_data,
                                      ishape[1], ishape[2], ishape[3], ishape[4],
                                      oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
                                      kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h
index 79accb5d521..9fe43b2bd46 100644
--- a/src/operator/nn/pool.h
+++ b/src/operator/nn/pool.h
@@ -62,7 +62,9 @@
 #include <mxnet/base.h>
 #include <mxnet/operator.h>
 #include <algorithm>
+#include "./pool_utils.h"
 #include "../mxnet_op.h"
+#include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
@@ -70,7 +72,7 @@ namespace op {
 namespace pool_enum {
 enum PoolingOpInputs {kData};
 enum PoolingOpOutputs {kOut, kMask};
-enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling};
+enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling, kLpPooling};
 enum PoolingOpPadConventionType {kValid, kFull};
 }  // namespace pool_enum
 
@@ -211,10 +213,10 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS
  * \brief avg/sum pooling cpu function for 1-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, bool getAvg = false) {
+                            DType* out_data, const bool getAvg = false) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -227,14 +229,14 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = (wend - wstart);
+        int pool_size = (getAvg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
         DType sum = 0;
         for (int w = wstart; w < wend; ++w) {
-          sum += in_data[w];
+          sum += a_pow_p<DType, p>::Map(in_data[w]) / pool_size;
         }
-        out_data[pw] = (getAvg? sum/pool_size : sum);
+        out_data[pw] = a_root_p<DType, p>::Map(sum);
       }
       in_data += in_data_offset;
       out_data += out_data_offset;
@@ -246,10 +248,10 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS
  * \brief avg/sum pooling cpu function for 2-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, bool getAvg = false) {
+                            DType* out_data, const bool getAvg = false) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -265,7 +267,7 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = (hend - hstart) * (wend - wstart);
+          int pool_size = (getAvg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
@@ -273,10 +275,10 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
           DType sum = 0;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
-              sum += in_data[h*width+w];
+              sum += a_pow_p<DType, p>::Map(in_data[h*width+w]) / pool_size;
             }
           }
-          out_data[ph*pooled_width+pw] = (getAvg? sum/pool_size : sum);
+          out_data[ph*pooled_width+pw] = a_root_p<DType, p>::Map(sum);
         }
       }
       in_data += in_data_offset;
@@ -289,10 +291,10 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS
  * \brief avg/sum pooling cpu function for 3-D images.
  * Do not call this kernel directly. Use the interface pool().
  */
-template<typename DType>
+template<typename DType, int p = 1>
 inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
                             const TShape& kernel, const TShape& pad, const TShape& stride,
-                            DType* out_data, bool getAvg = false) {
+                            DType* out_data, const bool getAvg = false) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -311,7 +313,7 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            int pool_size = (getAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
@@ -322,11 +324,11 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
                 for (int w = wstart; w < wend; ++w) {
-                  sum += in_data[(d*height+h)*width+w];
+                  sum += a_pow_p<DType, p>::Map(in_data[(d*height+h)*width+w]) / pool_size;
                 }
               }
             }
-            out_data[(pd*pooled_height+ph)*pooled_width+pw] = (getAvg? sum/pool_size : sum);
+            out_data[(pd*pooled_height+ph)*pooled_width+pw] = a_root_p<DType, p>::Map(sum);
           }
         }
       }
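Taken together, the rewritten loops compute an Lp reduction: raise each element to the p-th power, accumulate, then take the p-th root. For Lp and sum pooling getAvg is false, so pool_size is 1 and the division inside the loop is a no-op. A minimal standalone sketch of one window, with std::pow standing in for the a_pow_p/a_root_p helpers:

    #include <cmath>
    #include <cstdio>

    // Single-window Lp pooling, mirroring pool_sum_1d_cpu's inner loop.
    template <typename DType, int p>
    DType lp_pool_window(const DType* in, int wstart, int wend) {
      DType sum = 0;
      for (int w = wstart; w < wend; ++w) {
        sum += std::pow(in[w], DType(p));     // a_pow_p<DType, p>::Map(in[w])
      }
      return std::pow(sum, DType(1.0 / p));   // a_root_p<DType, p>::Map(sum)
    }

    int main() {
      const double x[3] = {1.0, 2.0, 2.0};
      std::printf("%f\n", lp_pool_window<double, 2>(x, 0, 3));  // prints 3.000000
    }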
@@ -504,11 +506,11 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data,
  * \brief avg/sum unpooling cpu function for 1-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
-inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape,
-                              const TShape& oshape, const TShape& kernel,
+template<typename DType, int p = 1>
+inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
+                              const TShape& ishape, const TShape& oshape, const TShape& kernel,
                               const TShape& pad, const TShape& stride,
-                              DType* in_grad, bool isAvg = false) {
+                              DType* in_grad, const bool isAvg = false) {
   const int width = ishape[2];
   const int pooled_width = oshape[2];
   const int kernel_w = kernel[0];
@@ -521,18 +523,17 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape,
       for (int pw = 0; pw < pooled_width; ++pw) {
         int wstart = pw * stride_w - pad_w;
         int wend = std::min(wstart + kernel_w, width + pad_w);
-        int pool_size = 1;
-        if (isAvg) {
-          pool_size = wend - wstart;
-        }
+        int pool_size = (isAvg ? (wend - wstart) : 1);
         wstart = std::max(wstart, 0);
         wend = std::min(wend, width);
         for (int w = wstart; w < wend; ++w) {
-          in_grad[w] += out_grad[pw] / pool_size;
+          in_grad[w] += lp_grad<DType, p>::Map(out_grad[pw], in_data[w], out_data[pw]) / pool_size;
         }
       }
       in_grad += in_grad_offset;
+      in_data += in_grad_offset;
       out_grad += out_grad_offset;
+      out_data += out_grad_offset;
     }
   }
 }
@@ -541,11 +542,11 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape,
  * \brief avg/sum unpooling cpu function for 2-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
-inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape,
-                              const TShape& oshape, const TShape& kernel,
+template<typename DType, int p = 1>
+inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
+                              const TShape& ishape, const TShape& oshape, const TShape& kernel,
                               const TShape& pad, const TShape& stride,
-                              DType* in_grad, bool isAvg = false) {
+                              DType* in_grad, const bool isAvg = false) {
   const int height = ishape[2], width = ishape[3];
   const int pooled_height = oshape[2], pooled_width = oshape[3];
   const int kernel_h = kernel[0], kernel_w = kernel[1];
@@ -561,10 +562,7 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape,
           int wstart = pw * stride_w - pad_w;
           int hend = std::min(hstart + kernel_h, height + pad_h);
           int wend = std::min(wstart + kernel_w, width + pad_w);
-          int pool_size = 1;
-          if (isAvg) {
-            pool_size = (hend - hstart) * (wend - wstart);
-          }
+          int pool_size = (isAvg ? (hend - hstart) * (wend - wstart) : 1);
           hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           hend = std::min(hend, height);
@@ -572,13 +570,18 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape,
           const int pool_index = ph * pooled_width + pw;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
-              in_grad[h*width+w] += out_grad[pool_index] / pool_size;
+              in_grad[h*width+w] +=
+                lp_grad<DType, p>::Map(out_grad[pool_index],
+                                       in_data[h*width+w],
+                                       out_data[pool_index]) / pool_size;
             }
           }
         }
       }
       in_grad += in_grad_offset;
+      in_data += in_grad_offset;
       out_grad += out_grad_offset;
+      out_data += out_grad_offset;
     }
   }
 }
@@ -587,11 +590,11 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape,
  * \brief avg/sum unpooling cpu function for 3-D images.
  * Do not call this kernel directly. Use the interface unpool().
  */
-template<typename DType>
-inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
-                              const TShape& oshape, const TShape& kernel,
+template<typename DType, int p = 1>
+inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data,
+                              const TShape& ishape, const TShape& oshape, const TShape& kernel,
                               const TShape& pad, const TShape& stride,
-                              DType* in_grad, bool isAvg = false) {
+                              DType* in_grad, const bool isAvg = false) {
   const int depth = ishape[2], height = ishape[3], width = ishape[4];
   const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
   const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
@@ -610,10 +613,7 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
             int dend = std::min(dstart + kernel_d, depth + pad_d);
             int hend = std::min(hstart + kernel_h, height + pad_h);
             int wend = std::min(wstart + kernel_w, width + pad_w);
-            int pool_size = 1;
-            if (isAvg) {
-              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            }
+            int pool_size = (isAvg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
             dstart = std::max(dstart, 0);
             hstart = std::max(hstart, 0);
             wstart = std::max(wstart, 0);
@@ -624,7 +624,10 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
             for (int d = dstart; d < dend; ++d) {
               for (int h = hstart; h < hend; ++h) {
                 for (int w = wstart; w < wend; ++w) {
-                  in_grad[(d*height+h)*width+w] += out_grad[pool_index] / pool_size;
+                  in_grad[(d*height+h)*width+w] +=
+                    lp_grad<DType, p>::Map(out_grad[pool_index],
+                                           in_data[(d*height+h)*width+w],
+                                           out_data[pool_index]) / pool_size;
                 }
               }
             }
@@ -632,7 +635,9 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
         }
       }
       in_grad += in_grad_offset;
+      in_data += in_grad_offset;
       out_grad += out_grad_offset;
+      out_data += out_grad_offset;
     }
   }
 }
@@ -649,8 +654,9 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
  * \param pool_type supported pooling type: max, avg, sum
  * \param req_type operator request type, only support kWriteTo for now
  * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW
+ * \tparam p value of p for Lp pooling
  */
-template<typename DType>
+template<typename DType, int p>
 inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& ishape,
                  const TShape& oshape, const TShape& kernel, const TShape& pad,
                  const TShape& stride, const int pool_type, OpReqType req_type,
@@ -663,6 +669,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
       pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      pool_sum_1d_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -673,6 +681,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
       pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      pool_sum_2d_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -683,6 +693,8 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
       pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
     } else if (pool_enum::kSumPooling == pool_type) {
       pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      pool_sum_3d_cpu<DType, p>(in_data, ishape, oshape, kernel, pad, stride, out_data);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -705,12 +717,13 @@ inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& is
  * \param pool_type supported pooling type: max, avg, sum
  * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo
  * \param in_grad pointer of the gradient of the operator's input tensor
+ * \param p_value value of p for Lp pooling
  */
-template<typename DType>
+template<typename DType, int p>
 inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType* in_data,
                    const DType* out_data, const TShape& ishape, const TShape& oshape,
                    const TShape& kernel, const TShape& pad, const TShape& stride,
-                   const int pool_type, OpReqType req_type, DType* in_grad) {
+                   const int pool_type, OpReqType req_type, DType* in_grad, const int p_value = 2) {
   if (mxnet::kNullOp == req_type) return;
   if (mxnet::kAddTo != req_type) {
     mxnet_op::Kernel<mxnet_op::set_zero, cpu>::Launch(s, ishape.Size(), in_grad);
@@ -719,9 +732,13 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
     if (pool_enum::kMaxPooling == pool_type) {
       unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+      unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
+                        true);
     } else if (pool_enum::kSumPooling == pool_type) {
-      unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+      unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      unpool_sum_1d_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
+                                  in_grad);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -729,9 +746,13 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
     if (pool_enum::kMaxPooling == pool_type) {
       unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+      unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
+                        true);
     } else if (pool_enum::kSumPooling == pool_type) {
-      unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+      unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      unpool_sum_2d_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
+                                  in_grad);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
@@ -739,9 +760,13 @@ inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType*
     if (pool_enum::kMaxPooling == pool_type) {
       unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
     } else if (pool_enum::kAvgPooling == pool_type) {
-      unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+      unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad,
+                        true);
     } else if (pool_enum::kSumPooling == pool_type) {
-      unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+      unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kLpPooling == pool_type) {
+      unpool_sum_3d_cpu<DType, p>(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride,
+                                  in_grad);
     } else {
       LOG(FATAL) << "Unknown pooling type " << pool_type;
     }
diff --git a/src/operator/nn/pool_utils.h b/src/operator/nn/pool_utils.h
new file mode 100644
index 00000000000..641cc4a995a
--- /dev/null
+++ b/src/operator/nn/pool_utils.h
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_OPERATOR_NN_POOL_UTILS_H_
+#define MXNET_OPERATOR_NN_POOL_UTILS_H_
+
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename DType, int p>
+struct a_pow_p {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return mshadow_op::power::Map(a, DType(p));
+  }
+};
+
+template<typename DType>
+struct a_pow_p<DType, 1> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return a;
+  }
+};
+
+template<typename DType>
+struct a_pow_p<DType, 2> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return a*a;
+  }
+};
+
+template<typename DType>
+struct a_pow_p<DType, 3> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return a*a*a;
+  }
+};
+
+template<typename DType, int p>
+struct a_root_p {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return mshadow_op::power::Map(a, DType(1.0 / p));
+  }
+};
+
+template<typename DType>
+struct a_root_p<DType, 1> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return a;
+  }
+};
+
+template<typename DType>
+struct a_root_p<DType, 2> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return mshadow_op::square_root::Map(a);
+  }
+};
+
+template<typename DType>
+struct a_root_p<DType, 3> {
+  static MSHADOW_XINLINE DType Map(const DType a) {
+    return mshadow_op::cube_root::Map(a);
+  }
+};
+
+template<typename DType, int p>
+struct lp_grad {
+  static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) {
+    return grad * mshadow_op::power::Map(in_data / out_data, DType(p - 1));
+  }
+};
+
+template<typename DType>
+struct lp_grad<DType, 1> {
+  static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) {
+    return grad;
+  }
+};
+
+template<typename DType>
+struct lp_grad<DType, 2> {
+  static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) {
+    return grad * in_data / out_data;
+  }
+};
+
+template<typename DType>
+struct lp_grad<DType, 3> {
+  static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) {
+    return grad * in_data * in_data / (out_data * out_data);
+  }
+};
+
+}   // namespace op
+}   // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NN_POOL_UTILS_H_
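The lp_grad specializations above are the closed forms of the Lp derivative for p = 1, 2, 3. As a standalone sanity check (not part of the patch), the p = 2 formula grad * in_data / out_data can be compared against a central finite difference of the window's L2 norm:

    #include <cmath>
    #include <cstdio>

    // f(x0, x1) = sqrt(x0^2 + x1^2): the p = 2 pooled output of a 2-element window.
    static double l2_pool(double x0, double x1) { return std::sqrt(x0 * x0 + x1 * x1); }

    int main() {
      const double x0 = 1.5, x1 = -0.5, grad = 1.0, eps = 1e-6;
      // Closed form used by lp_grad<DType, 2>: grad * in_data / out_data.
      const double analytic = grad * x0 / l2_pool(x0, x1);
      // Central finite difference with respect to x0.
      const double numeric = (l2_pool(x0 + eps, x1) - l2_pool(x0 - eps, x1)) / (2 * eps);
      std::printf("analytic=%f numeric=%f\n", analytic, numeric);  // the two should agree
    }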
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index a390dd0f053..a4770b49e85 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu, Da Zheng, Hao Jin
 */
 
 #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_
@@ -49,6 +49,7 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   int pooling_convention;
   bool global_pool;
   bool cudnn_off;
+  dmlc::optional<int> p_value;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     DMLC_DECLARE_FIELD(kernel).set_default(TShape())  // add default value here
     .enforce_nonzero()
@@ -58,6 +59,7 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     .add_enum("max", pool_enum::kMaxPooling)
     .add_enum("avg", pool_enum::kAvgPooling)
     .add_enum("sum", pool_enum::kSumPooling)
+    .add_enum("lp", pool_enum::kLpPooling)
     .describe("Pooling type to be applied.");
 
     DMLC_DECLARE_FIELD(global_pool).set_default(false)
@@ -77,6 +79,9 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
 
     DMLC_DECLARE_FIELD(pad).set_default(TShape())
     .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding.");
+
+    DMLC_DECLARE_FIELD(p_value).set_default(dmlc::optional<int>())
+    .describe("Value of p for Lp pooling, can be 1 or 2, required for Lp Pooling");
   }
 
   bool operator==(const PoolingParam& other) const {
@@ -86,7 +91,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
            this->pool_type          == other.pool_type &&
            this->pooling_convention == other.pooling_convention &&
            this->global_pool        == other.global_pool &&
-           this->cudnn_off          == other.cudnn_off;
+           this->cudnn_off          == other.cudnn_off &&
+           this->p_value            == other.p_value;
   }
 };
 
@@ -105,6 +111,7 @@ struct hash<mxnet::op::PoolingParam> {
     ret = dmlc::HashCombine(ret, val.pooling_convention);
     ret = dmlc::HashCombine(ret, val.global_pool);
     ret = dmlc::HashCombine(ret, val.cudnn_off);
+    ret = dmlc::HashCombine(ret, val.p_value);
     return ret;
   }
 };
@@ -144,12 +151,33 @@ class PoolingOp {
       }
       stride = TShape(ishape.ndim() - 2);
     }
-
-    pool(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
-         kernel,
-         padding,
-         stride,
-         param_.pool_type, req, out_data.dptr<DType>());
+    const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
+                        param_.p_value.value() : 1;
+    switch (p_value) {
+      case 1:
+        pool<DType, 1>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
+          kernel,
+          padding,
+          stride,
+          param_.pool_type, req, out_data.dptr<DType>());
+        break;
+      case 2:
+        pool<DType, 2>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
+          kernel,
+          padding,
+          stride,
+          param_.pool_type, req, out_data.dptr<DType>());
+        break;
+      case 3:
+        pool<DType, 3>(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
+          kernel,
+          padding,
+          stride,
+          param_.pool_type, req, out_data.dptr<DType>());
+        break;
+      default:
+        LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
+    }
   }
 
   void Backward(const OpContext& ctx, const TBlob& out_grad,
@@ -171,30 +199,42 @@ class PoolingOp {
       stride = TShape(ishape.ndim() - 2);
     }
 
-    unpool(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
+    const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ?
+                        param_.p_value.value() : 1;
+    switch (p_value) {
+      case 1:
+        unpool<DType, 1>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
+           in_grad.shape_, out_grad.shape_,
+           kernel,
+           padding,
+           stride,
+           param_.pool_type, req, in_grad.dptr<DType>());
+        break;
+      case 2:
+        unpool<DType, 2>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
+           in_grad.shape_, out_grad.shape_,
+           kernel,
+           padding,
+           stride,
+           param_.pool_type, req, in_grad.dptr<DType>());
+        break;
+      case 3:
+        unpool<DType, 3>(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
            in_grad.shape_, out_grad.shape_,
            kernel,
            padding,
            stride,
            param_.pool_type, req, in_grad.dptr<DType>());
+        break;
+      default:
+        LOG(FATAL) << "p value of " << p_value << " is not supported yet...";
+    }
   }
 
  private:
   PoolingParam param_;
 };  // class PoolingOp
 
-template<typename xpu, typename DType>
-PoolingOp<xpu, DType> &GetPoolingOp(const PoolingParam &param) {
-  static thread_local PoolingOp<xpu, DType> op;
-  // check if filter size assigned correctly
-  if (param.global_pool == false) {
-    CHECK_GT(param.kernel.ndim(), 0U)
-        << "You need to set the kernel size if global pooling is not used";
-  }
-  op.Init(param);
-  return op;
-}
-
 template<typename xpu>
 void PoolingCompute(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
@@ -204,11 +244,19 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs,
   const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), GetNumOutputs(param));
+  if (!param.global_pool) {
+    // check if filter size assigned correctly
+    CHECK_GT(param.kernel.ndim(), 0U)
+        << "You need to set the kernel size if global pooling is not used";
+  }
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      GetPoolingOp<xpu, DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+        || pool_enum::kSumPooling == param.pool_type
+        || pool_enum::kLpPooling == param.pool_type) {
+      PoolingOp<xpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs[0], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
@@ -225,6 +273,11 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), GetNumBackInputs(param));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
+  if (!param.global_pool) {
+    // check if filter size assigned correctly
+    CHECK_GT(param.kernel.ndim(), 0U)
+        << "You need to set the kernel size if global pooling is not used";
+  }
   off_t ograd_idx, in_data_idx, out_data_idx;
   // When MKLDNN is enabled, the input data may contains arrays for workspace.
   if (GetNumBackInputs(param) == 5) {
@@ -239,10 +292,12 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs,
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      GetPoolingOp<xpu, DType>(param).Backward(ctx, inputs[ograd_idx],
-                                               inputs[in_data_idx], inputs[out_data_idx],
-                                               req[0], outputs[0]);
+        || pool_enum::kSumPooling == param.pool_type
+        || pool_enum::kLpPooling == param.pool_type) {
+      PoolingOp<xpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, inputs[ograd_idx], inputs[in_data_idx],
+                  inputs[out_data_idx], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
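The switch blocks added to Forward and Backward turn the runtime p_value into a compile-time template argument, so each supported p gets a fully specialized kernel, and the p = 1/2/3 specializations in pool_utils.h avoid calling pow at all. The idiom, reduced to a standalone sketch:

    #include <iostream>

    // Compile-time p lets the compiler inline a specialized body per value.
    template <int p>
    void pool_impl() { std::cout << "pooling with p = " << p << '\n'; }

    // Runtime-to-compile-time dispatch, mirroring PoolingOp::Forward's switch.
    void pool_dispatch(int p_value) {
      switch (p_value) {
        case 1: pool_impl<1>(); break;
        case 2: pool_impl<2>(); break;
        case 3: pool_impl<3>(); break;
        default: std::cerr << "p value of " << p_value << " is not supported\n";
      }
    }

    int main() { pool_dispatch(2); }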
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index ca472b3ca1b..3ff94da3c2d 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -92,6 +92,9 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
                          std::vector<TShape> *out_shape) {
   const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
   CHECK_EQ(in_shape->size(), 1U);
+  if (param.pool_type == pool_enum::kLpPooling) {
+    CHECK(param.p_value.has_value());
+  }
   const TShape &dshape = (*in_shape)[0];
   CHECK_GE(dshape.ndim(), 3U)
       << "Pooling: Input data should be  3D in (batch, channel, x)"
@@ -344,11 +347,23 @@ Three pooling options are supported by ``pool_type``:
 - **avg**: average pooling
 - **max**: max pooling
 - **sum**: sum pooling
+- **lp**: Lp pooling
 
 For 3-D pooling, an additional *depth* dimension is added before
 *height*. Namely the input data will have shape *(batch_size, channel, depth,
 height, width)*.
 
+Notes on Lp pooling:
+
+Lp pooling was first introduced in this paper: https://arxiv.org/pdf/1204.3968.pdf.
+L-1 pooling is simply sum pooling, while L-inf pooling is simply max pooling.
+Lp pooling stands between those two; in practice, the most common value for p is 2.
+
+For each window ``X``, the mathematical expression for Lp pooling is:
+
+.. math::
+  f(X) = \sqrt[p]{\sum\limits_{x \in X} x^p}
+
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs([](const NodeAttrs& attrs) {
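Numerically, for a window X = {3, 4} the formula gives f(X) = sqrt(3^2 + 4^2) = 5 for p = 2, while p = 1 degenerates to plain sum pooling, f(X) = 7.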
diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu
index 17e6eb43a7a..997218620c3 100644
--- a/src/operator/nn/pooling.cu
+++ b/src/operator/nn/pooling.cu
@@ -66,6 +66,9 @@ void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
+        case pool_enum::kLpPooling:
+          LOG(WARNING) << "Lp pooling is not supported by cudnn, MXNet Lp pooling is applied.";
+          break;
       }
     });
   }
@@ -74,8 +77,11 @@ void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      GetPoolingOp<gpu, DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+        || pool_enum::kSumPooling == param.pool_type
+        || pool_enum::kLpPooling == param.pool_type) {
+      PoolingOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs[0], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
@@ -117,6 +123,9 @@ void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
+        case pool_enum::kLpPooling:
+          LOG(WARNING) << "Lp pooling is not supported by cudnn, MXNet Lp pooling is applied.";
+          break;
       }
     });
   }
@@ -125,10 +134,12 @@ void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      GetPoolingOp<gpu, DType>(param).Backward(ctx, inputs[ograd_idx],
-                                               inputs[in_data_idx], inputs[out_data_idx],
-                                               req[0], outputs[0]);
+        || pool_enum::kSumPooling == param.pool_type
+        || pool_enum::kLpPooling == param.pool_type) {
+      PoolingOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, inputs[ograd_idx], inputs[in_data_idx],
+                  inputs[out_data_idx], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index 935c92a034c..cf126ed58ea 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -305,7 +305,7 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs,
     dispatched = storage_type_assign(out_attrs, static_cast<NDArrayStorageType>(weight_stype),
                                      dispatch_mode, DispatchMode::kFComputeEx);
     // warn users if lazy_update is turned on
-    if (dispatched && param.lazy_update) LogLazyUpdate();
+    if (dispatched && param.wd != 0 && param.lazy_update) LogLazyUpdate();
   }
   if (!dispatched) {
     dispatched = dispatch_fallback(out_attrs, dispatch_mode);
diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc
index 71f4e738161..a3105eb654d 100644
--- a/src/operator/quantization/quantized_pooling.cc
+++ b/src/operator/quantization/quantized_pooling.cc
@@ -92,6 +92,13 @@ bool QuantizedPoolingType(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_quantized_pooling)
+.describe(R"code(Pooling operator for input and output data type of int8.
+The input and output data comes with min and max thresholds for quantizing
+the float32 data into int8.
+
+.. Note::
+    This operator only supports forward propagation. DO NOT use it in training.
+    This operator only supports `pool_type` of `avg` or `max`.)code" ADD_FILELINE)
 .set_num_inputs(3)
 .set_num_outputs(3)
 .set_attr_parser(ParamParser<PoolingParam>)
@@ -118,13 +125,6 @@ NNVM_REGISTER_OP(_contrib_quantized_pooling)
 .add_arguments(PoolingParam::__FIELDS__());
 
 NNVM_REGISTER_OP(Pooling)
-.describe(R"code(Pooling operator for input and output data type of int8.
-The input and output data comes with min and max thresholds for quantizing
-the float32 data into int8.
-
-.. Note::
-    This operator only supports forward propogation. DO NOT use it in training.
-    This operator only supports `pool_type` of `avg` or `max`.)code" ADD_FILELINE)
 .set_attr<FQuantizedOp>("FQuantizedOp", [](const NodeAttrs& attrs) {
     PoolingParam param;
     param.Init(attrs.dict);
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 13c077dd9e3..eded6aeed8a 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file rnn-inl.h
  * \brief
- * \author Sebastian Bodenstein
+ * \author Sebastian Bodenstein, Shu Zhang
 */
 #ifndef MXNET_OPERATOR_RNN_INL_H_
 #define MXNET_OPERATOR_RNN_INL_H_
@@ -29,6 +29,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
 #include <mxnet/operator.h>
+#include <mxnet/storage.h>
 #include <algorithm>
 #include <map>
 #include <vector>
@@ -37,8 +38,7 @@
 #include "./math.h"
 #include "./math_functions-inl.h"
 #include "./operator_common.h"
-#include "./mshadow_op.h"
-#include "./linalg.h"
+#include "./rnn_impl.h"
 
 namespace mxnet {
 namespace op {
@@ -50,18 +50,37 @@ namespace rnn_enum {
   enum RNNOpResource {kTempSpace};
 }
 
-// A utility function to calculate input size
-inline int rnn_single_param_size(int inputSize,
-                                int hiddenSize,
-                                int mode) {
-  int size = hiddenSize * (hiddenSize + inputSize + 2);
-  // Different RNN's have different num weights
+inline int GetRnnParamSize(int num_layer,
+                           int input_size,
+                           int state_size,
+                           int direction,
+                           int mode) {
+  int size = state_size * direction;
   switch (mode) {
     case rnn_enum::kRnnRelu:
-      size *= 1;
+    case rnn_enum::kRnnTanh:
       break;
+    case rnn_enum::kLstm:
+      size *= 4;
+      break;
+    case rnn_enum::kGru:
+      size *= 3;
+      break;
+  }
+  int size1 = (input_size + state_size + 2) * size;  // first layer size
+  int size2 = (state_size * direction + state_size + 2) * size;  // other layers size
+  int param_size = size1 + (num_layer - 1) * size2;
+  return param_size;
+}
+
+inline int GetRnnBiasSize(int num_layer,
+                           int state_size,
+                           int direction,
+                           int mode) {
+  int size = 2 * state_size * direction * num_layer;
+  switch (mode) {
+    case rnn_enum::kRnnRelu:
     case rnn_enum::kRnnTanh:
-      size *= 1;
       break;
     case rnn_enum::kLstm:
       size *= 4;
@@ -73,19 +92,48 @@ inline int rnn_single_param_size(int inputSize,
   return size;
 }
 
-inline int rnn_param_size(int layerNum,
-                          int inputSize,
-                          int hiddenSize,
-                          bool bidirectional,
-                          int mode) {
-  // get size of first layer
-  int size = rnn_single_param_size(inputSize, hiddenSize, mode);
-  // get size of remaining layers
-  if (bidirectional) {
-    size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode);
-    size *= 2;
-  } else {
-    size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode);
+inline size_t GetRNNWorkspaceSize(int seq_length,
+                                  int batch_size,
+                                  int hidden_size,
+                                  int direction,
+                                  int mode) {
+  size_t size = 0;
+  switch (mode) {
+    case rnn_enum::kRnnRelu:
+    case rnn_enum::kRnnTanh:
+    case rnn_enum::kGru:
+      LOG(FATAL) << "Only LSTM is supported at the moment";
+      break;
+    case rnn_enum::kLstm:
+      size = (seq_length + 1) * batch_size * hidden_size * 4 + batch_size * hidden_size * 2
+             + seq_length * batch_size * hidden_size * direction;
+      break;
+    default:
+      LOG(FATAL) << "unknown RNN mode " << mode;
+      break;
+  }
+  return size;
+}
+
+inline size_t GetRNNReserveSpaceSize(int num_layer,
+                                     int direction,
+                                     int seq_length,
+                                     int batch_size,
+                                     int hidden_size,
+                                     int mode) {
+  size_t size = 0;
+  switch (mode) {
+    case rnn_enum::kRnnRelu:
+    case rnn_enum::kRnnTanh:
+    case rnn_enum::kGru:
+      LOG(FATAL) << "Only LSTM is supported at the moment";
+      break;
+    case rnn_enum::kLstm:
+      size = num_layer * direction * seq_length * batch_size * hidden_size * 6;
+      break;
+    default:
+      LOG(FATAL) << "unknown RNN mode " << mode;
+      break;
   }
   return size;
 }
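For concreteness, a standalone mirror of the two LSTM size formulas (copies for illustration, not calls into the patch): a single-layer unidirectional LSTM with seq_length = 10, batch_size = 2, hidden_size = 100 needs (10+1)*2*100*4 + 2*100*2 + 10*2*100*1 = 11200 workspace elements and 1*1*10*2*100*6 = 12000 reserve-space elements.

    #include <cstddef>
    #include <cstdio>

    // Standalone copies of the kLstm branches of GetRNNWorkspaceSize / GetRNNReserveSpaceSize.
    std::size_t lstm_workspace(int T, int N, int H, int D) {
      return (T + 1) * N * H * 4 + N * H * 2 + static_cast<std::size_t>(T) * N * H * D;
    }
    std::size_t lstm_reserve(int L, int D, int T, int N, int H) {
      return static_cast<std::size_t>(L) * D * T * N * H * 6;
    }

    int main() {
      std::printf("workspace=%zu reserve=%zu\n",
                  lstm_workspace(10, 2, 100, 1),    // 11200
                  lstm_reserve(1, 1, 10, 2, 100));  // 12000
    }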
@@ -125,51 +173,153 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
   }
 };
 
-template<typename xpu, typename DType>
-class RNNOp : public Operator {
- public:
-  explicit RNNOp(RNNParam p) {
-  }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // TODO(sbodenstein): add MShadow implementation
+/**
+ * @params: ws: Temp workspace for gemm's output storage.
+ *          rs: Reserve space of forward intermediate data used for training.
+ *          num_layers: The number of recurrent layers.
+ *          direction: 2 if bidirectional recurrent layers are used, otherwise 1.
+ *          seq_length: The number of iterations to unroll over.
+ *          batch_size: size of batch.
+ *          input_size: The number of expected input features.
+ *          state_size: The number of hidden state features.
+ *          x_ptr: Pointer of tensor x containing the features of the input sequence.
+ *                 x's shape is [seq_length, batch_size, input_size]
+ *          hx_ptr: Pointer of tensor hx containing the initial hidden state.
+ *                  hx's shape is [num_layers, batch_size, state_size]
+ *          cx_ptr: Only used in lstm mode. Pointer of tensor cx containing the initial cell state.
+ *                  cx's shape is [num_layers, batch_size, state_size]
+ *          w_ptr: Pointer of tensor w containing weights.
+ *          b_ptr: Pointer of tensor b containing bias.
+ *          y_ptr: Pointer of tensor y containing the features of the output features from the
+ *                 last layers of the RNN. y's shape is [seq_length, batch_size, state_size]
+ *          hy_ptr: Pointer of tensor hy containing the hidden state for t=seq_length.
+ *                  hy's shape is [num_layers, batch_size, state_size]
+ *          cy_ptr: Only used in lstm mode. Pointer of tensor cy containing the cell state
+ *                  for t=seq_length. cy's shape is [num_layers, batch_size, state_size]
+ *          mode: Specifies the type of RNN to compute.
+ */
+template <typename DType>
+void RNNForwardTraining(DType* ws,
+                        DType* rs,
+                        bool state_outputs,
+                        const int num_layers,
+                        const int direction,
+                        const int seq_length,
+                        const int batch_size,
+                        const int input_size,
+                        const int state_size,
+                        DType* x_ptr,
+                        DType* hx_ptr,
+                        DType* cx_ptr,
+                        DType* w_ptr,
+                        DType* b_ptr,
+                        DType* y_ptr,
+                        DType* hy_ptr,
+                        DType* cy_ptr,
+                        int mode) {
+  switch (mode) {
+    case rnn_enum::kRnnTanh:
+    case rnn_enum::kRnnRelu:
+    case rnn_enum::kGru:
+      LOG(FATAL) << "Only LSTM is supported at the moment";
+      break;
+    case rnn_enum::kLstm:
+      LstmForwardTraining<DType>(ws, rs, state_outputs, num_layers, direction, seq_length,
+                                 batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr,
+                                 w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr);
+      break;
+    default:
+      LOG(FATAL) << "unknown RNN mode " << mode;
+      break;
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // TODO(sbodenstein): add MShadow implementation
+template <typename DType>
+void RNNForwardInference(DType* ws,
+                         bool state_outputs,
+                         const int num_layers,
+                         const int direction,
+                         const int seq_length,
+                         const int batch_size,
+                         const int input_size,
+                         const int state_size,
+                         DType* x_ptr,
+                         DType* hx_ptr,
+                         DType* cx_ptr,
+                         DType* w_ptr,
+                         DType* b_ptr,
+                         DType* y_ptr,
+                         DType* hy_ptr,
+                         DType* cy_ptr,
+                         int mode) {
+  switch (mode) {
+    case rnn_enum::kRnnRelu:
+    case rnn_enum::kRnnTanh:
+    case rnn_enum::kGru:
+      LOG(FATAL) << "Only LSTM is supported at the moment";
+      break;
+    case rnn_enum::kLstm:
+      LstmForwardInference<DType>(ws, state_outputs, num_layers, direction, seq_length,
+                                  batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr,
+                                  w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr);
+      break;
+    default:
+      LOG(FATAL) << "unknown RNN mode" << mode;
+      break;
   }
+}
 
- private:
-  RNNParam param_;
-};  // class RNNOp
+template <typename DType>
+void RNNBackward(DType* ws,
+                 DType* rs,
+                 const int num_layers,
+                 const int direction,
+                 const int seq_length,
+                 const int batch_size,
+                 const int input_size,
+                 const int state_size,
+                 DType* x_ptr,
+                 DType* hx_ptr,
+                 DType* cx_ptr,
+                 DType* w_ptr,
+                 DType* y_ptr,
+                 DType* dy_ptr,
+                 DType* dhy_ptr,
+                 DType* dcy_ptr,
+                 DType* dx_ptr,
+                 DType* dhx_ptr,
+                 DType* dcx_ptr,
+                 DType* dw_ptr,
+                 DType* db_ptr,
+                 int mode) {
+  switch (mode) {
+    case rnn_enum::kRnnRelu:
+    case rnn_enum::kRnnTanh:
+    case rnn_enum::kGru:
+      break;
+    case rnn_enum::kLstm:
+      LstmBackward<DType>(ws, rs, num_layers, direction, seq_length, batch_size,
+                          input_size, state_size, x_ptr, hx_ptr, cx_ptr, w_ptr, y_ptr,
+                          dy_ptr, dhy_ptr, dcy_ptr, dx_ptr, dhx_ptr, dcx_ptr, dw_ptr, db_ptr);
+      break;
+    default:
+      LOG(FATAL) << "unknown RNN mode" << mode;
+      break;
+  }
+}
 
 template<typename DType>
-class RNNOp<cpu, DType> : public Operator {
+class RNNOp : public Operator{
  public:
-  explicit RNNOp(RNNParam param) {
-    this->param_ = param;
-    // RNN Mode
-    param_.lstm_q_ = false;
-    switch (param_.mode) {
-      case rnn_enum::kLstm:
-        param_.lstm_q_ = true;
-        break;
-      default:
-        LOG(FATAL) << "only LSTM is implmented on CPU";
+  explicit RNNOp(RNNParam p)
+    :param_(p), init_space_(false), reserve_space_size_(0)
+  {}
+
+  ~RNNOp() {
+    if (init_space_) {
+      Storage::Get()->Free(reserve_space_);
+      init_space_ = false;
     }
   }
 
@@ -178,189 +328,221 @@ class RNNOp<cpu, DType> : public Operator {
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
                        const std::vector<TBlob> &aux_args) {
-    // Layout TNC
-    CHECK(!ctx.is_train) << "only inference mode is available"
-      "for cpu at the moment.";
-    size_t in_expected = param_.lstm_q_ ? 4 : 3;
-    size_t out_expected = param_.lstm_q_ ? 3 : 2;
-
-    if (!param_.state_outputs)
-      LOG(FATAL) << "no state outputs is currently not supported for cpu.";
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(param_.mode, rnn_enum::kLstm) << "Only lstm mode is supported at the moment.";
+    CHECK_EQ(param_.p, 0) << "Dropout is not supported at the moment.";
 
-    CHECK_EQ(req[rnn_enum::kOut], kWriteTo);
+    size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3;
+    size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2;
+    if (!param_.state_outputs) {
+      out_expected = 1;
+    }
     CHECK_EQ(in_data.size(), in_expected);
     CHECK_EQ(out_data.size(), out_expected);
-
-    mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
-    // get input + output tensors
-    // w layout i2h_w, h2h_w, i2h_b, h2h_b
-    Tensor<cpu, 3, DType> x =
-        in_data[rnn_enum::kData].get<cpu, 3, DType>(s);  // TNC
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    // get input + output tensor
+    Tensor<cpu, 3, DType> x = in_data[rnn_enum::kData].get<cpu, 3, DType>(s);
     Tensor<cpu, 1, DType> w = in_data[rnn_enum::kParams].get<cpu, 1, DType>(s);
-    Tensor<cpu, 3, DType> hx =
-        in_data[rnn_enum::kState].get<cpu, 3, DType>(s);  // LNC
-    Tensor<cpu, 3, DType> y =
-        out_data[rnn_enum::kOut].get<cpu, 3, DType>(s);  // TNC
-    int64_t seq_len = x.shape_[0];
-    int64_t num_layers = hx.shape_[0];
-    int64_t batch_size = x.shape_[1];
-    int64_t h_channel = hx.shape_[2];
-    int64_t in_channel = x.shape_[2];
-    Tensor<cpu, 2, DType> x_flatten = in_data[rnn_enum::kData]
-      .get_with_shape<cpu, 2, DType>(
-          mshadow::Shape2(seq_len * batch_size, in_channel), s);  // (T*N)C
-    Tensor<cpu, 2, DType> y_flatten = out_data[rnn_enum::kOut]
-      .get_with_shape<cpu, 2, DType>(
-          mshadow::Shape2(
-              y.shape_[0] * y.shape_[1], y.shape_[2]), s);  // (T*N)C
-
+    Tensor<cpu, 3, DType> hx = in_data[rnn_enum::kState].get<cpu, 3, DType>(s);
+    Tensor<cpu, 3, DType> y = out_data[rnn_enum::kOut].get<cpu, 3, DType>(s);
     CHECK(x.CheckContiguous());
     CHECK(w.CheckContiguous());
     CHECK(hx.CheckContiguous());
     CHECK(y.CheckContiguous());
+    param_.seq_length_ = x.shape_[0];
+    param_.batch_size_ = x.shape_[1];
+    param_.input_size_ = x.shape_[2];
+
+    const int direction = param_.bidirectional ? 2 : 1;
+    const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode);
+    DType* b_ptr = w.dptr_ + w.shape_[0] - bsize;
+
+    DType* hy_ptr = NULL;
+    if (param_.state_outputs) {
+      hy_ptr = out_data[rnn_enum::kStateOut].dptr<DType>();
+    }
+    DType* cx_ptr = NULL;
+    DType* cy_ptr = NULL;
 
-    if (param_.lstm_q_) {
-      const size_t kNumMat = 4;
-      int64_t fused_h_ch = kNumMat * h_channel;
-      int64_t h_size = batch_size * fused_h_ch;
-      int64_t num_dir = 1 + param_.bidirectional;
-      int64_t h2h_w_size = h_channel * fused_h_ch;
-
-      Tensor<cpu, 3, DType> cx =
-          in_data[rnn_enum::kStateCell].get<cpu, 3, DType>(s);
-      CHECK(cx.CheckContiguous());
-
-      Tensor<cpu, 3, DType> cy =
-          out_data[rnn_enum::kStateCellOut].get<cpu, 3, DType>(s);
-      Tensor<cpu, 3, DType> hy =
-          out_data[rnn_enum::kStateOut].get<cpu, 3, DType>(s);
-      CHECK(cy.CheckContiguous());
-      CHECK(hy.CheckContiguous());
-
-      DType* workspace_addr =
-      static_cast<DType *>(ctx.requested[rnn_enum::kTempSpace]
-          .get_host_space_internal(sizeof(DType) *
-                                  (seq_len * h_size + h_size
-                                  + y.shape_[0] * y.shape_[1] * y.shape_[2])));
-      Tensor<cpu, 3, DType> i2h_y(
-          workspace_addr, mshadow::Shape3(seq_len, batch_size, fused_h_ch));
-      Tensor<cpu, 2, DType> i2h_y_flatten(
-          workspace_addr, mshadow::Shape2(seq_len * batch_size, fused_h_ch));
-      Tensor<cpu, 2, DType> h2h_y(workspace_addr
-          + seq_len * h_size, mshadow::Shape2(batch_size, fused_h_ch));
-      Tensor<cpu, 3, DType> y_tmp(workspace_addr
-          + (seq_len + 1) * h_size, y.shape_);
-      Tensor<cpu, 2, DType> y_flatten_tmp(workspace_addr
-          + (seq_len + 1) * h_size, y_flatten.shape_);
-      CHECK(i2h_y.CheckContiguous());
-      CHECK(h2h_y.CheckContiguous());
-      CHECK(y_tmp.CheckContiguous());
-
-      for (int64_t layer = 0; layer < num_layers; layer++) {
-        int reverse_dir = 0;
-        int out_tmp = 0;
-        if (param_.bidirectional && layer % 2)
-          reverse_dir = 1;
-        if (layer / num_dir % 2 == 0)
-          out_tmp = 1;
-        mshadow::Shape<2> i2h_w_shape = mshadow::Shape2(fused_h_ch,
-            (layer < num_dir) ? in_channel : num_dir * h_channel);
-        mshadow::Shape<2> h2h_w_shape = mshadow::Shape2(fused_h_ch, h_channel);
-        int64_t start = layer < num_dir ?
-            (layer * (in_channel * fused_h_ch + h2h_w_size)) :  // input layer
-              (num_dir * (in_channel * fused_h_ch + h2h_w_size)
-              + (layer - num_dir) * (h2h_w_size * num_dir + h2h_w_size));
-        Tensor<cpu, 2, DType> i2h_w(w.dptr_ + start, i2h_w_shape);
-        start += layer < num_dir ?
-            in_channel * fused_h_ch : h2h_w_size * num_dir;
-        Tensor<cpu, 2, DType> h2h_w(w.dptr_ + start, h2h_w_shape);
-        start = num_dir * (in_channel * fused_h_ch + h2h_w_size)
-            + (num_layers - num_dir) * (h2h_w_size * (num_dir + 1))
-              + layer * fused_h_ch * 2;
-        Tensor<cpu, 1, DType> i2h_b = w.Slice(start, start + fused_h_ch);
-        start += fused_h_ch;
-        Tensor<cpu, 1, DType> h2h_b = w.Slice(start, start + fused_h_ch);
-        if (out_tmp) {
-          linalg_gemm(layer < num_dir ? x_flatten:y_flatten, i2h_w,
-              i2h_y_flatten, false, true, s);
-        } else {
-          linalg_gemm(layer < num_dir ? x_flatten:y_flatten_tmp, i2h_w,
-              i2h_y_flatten, false, true, s);
-        }
-        i2h_y_flatten += repmat(i2h_b, seq_len * batch_size);
-        for (int64_t t = 0; t < seq_len; t++) {
-          int64_t timestep = t;
-          if (reverse_dir)
-            timestep = seq_len - 1 - t;
-          linalg_gemm(t == 0 ? hx[layer]:hy[layer], h2h_w, h2h_y,
-              false, true, s);
-          h2h_y += repmat(h2h_b, batch_size);
-          // fused element-wise ops
-          LSTMFusedElementWiseCPUOps(i2h_y[timestep], cx[layer], h2h_y,
-              y[timestep], out_tmp ? y_tmp[timestep]: y[timestep],
-                hy[layer], cy[layer], batch_size, h_channel, t,
-                reverse_dir, out_tmp && (layer == num_layers - 1));
-        }
+    if (param_.mode == rnn_enum::kLstm) {
+      cx_ptr = in_data[rnn_enum::kStateCell].dptr<DType>();
+      if (param_.state_outputs) {
+        cy_ptr = out_data[rnn_enum::kStateCellOut].dptr<DType>();
       }
+    }
+
+    // allocate temp space
+    const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
+                                                      param_.state_size, direction, param_.mode);
+    Tensor<cpu, 1, DType> workspace = ctx.requested[rnn_enum::kTempSpace]
+        .get_space_typed<cpu, 1, DType>(Shape1(workspace_size), s);
+
+    if (ctx.is_train) {
+      const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction,
+                                                   param_.seq_length_, param_.batch_size_,
+                                                   param_.state_size, param_.mode);
+      if (init_space_ && reserve_space_size_ < r_size) {
+        Storage::Get()->Free(reserve_space_);
+        init_space_ = false;
+      }
+
+      if (!init_space_) {
+        reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU());
+        reserve_space_size_ = r_size;
+        init_space_ = true;
+      }
+
+      DType* reserve_space_ptr = static_cast<DType*>(reserve_space_.dptr);
+      RNNForwardTraining<DType>(workspace.dptr_,
+                                reserve_space_ptr,
+                                param_.state_outputs,
+                                param_.num_layers,
+                                direction,
+                                param_.seq_length_,
+                                param_.batch_size_,
+                                param_.input_size_,
+                                param_.state_size,
+                                x.dptr_,
+                                hx.dptr_,
+                                cx_ptr,
+                                w.dptr_,
+                                b_ptr,
+                                y.dptr_,
+                                hy_ptr,
+                                cy_ptr,
+                                param_.mode);
     } else {
-      LOG(FATAL) << "only LSTM is available for cpu at the moment.";
+      RNNForwardInference<DType>(workspace.dptr_,
+                                 param_.state_outputs,
+                                 param_.num_layers,
+                                 direction,
+                                 param_.seq_length_,
+                                 param_.batch_size_,
+                                 param_.input_size_,
+                                 param_.state_size,
+                                 x.dptr_,
+                                 hx.dptr_,
+                                 cx_ptr,
+                                 w.dptr_,
+                                 b_ptr,
+                                 y.dptr_,
+                                 hy_ptr,
+                                 cy_ptr,
+                                 param_.mode);
     }
   }
 
   virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
-      const std::vector<TBlob> &out_data,
+                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
                         const std::vector<TBlob> &in_grad,
                         const std::vector<TBlob> &aux_args) {
-    LOG(FATAL) << "LSTM backward is not available for cpu at the moment.";
-  }
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(param_.mode, rnn_enum::kLstm) << "Only lstm mode is supported at the moment.";
+    CHECK_EQ(param_.p, 0) << "Dropout is not supported at the moment.";
+    size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3;
+    size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2;
+    if (!param_.state_outputs) {
+      out_expected = 1;
+    }
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    CHECK_EQ(in_grad.size(), in_expected);
+    CHECK_EQ(out_grad.size(), out_expected);
+    CHECK_EQ(req.size(), in_expected);
+    CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data";
+    CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state";
+    mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+    // get input + output tensors
+    Tensor<cpu, 3, DType> x = in_data[rnn_enum::kData].get<cpu, 3, DType>(s);
+    Tensor<cpu, 1, DType> w = in_data[rnn_enum::kParams].get<cpu, 1, DType>(s);
+    Tensor<cpu, 3, DType> hx = in_data[rnn_enum::kState].get<cpu, 3, DType>(s);
+    Tensor<cpu, 3, DType> y = out_data[rnn_enum::kOut].get<cpu, 3, DType>(s);
+    Tensor<cpu, 3, DType> dx = in_grad[rnn_enum::kData].get<cpu, 3, DType>(s);
+    Tensor<cpu, 1, DType> dw = in_grad[rnn_enum::kParams].get<cpu, 1, DType>(s);
+    Tensor<cpu, 3, DType> dhx = in_grad[rnn_enum::kState].get<cpu, 3, DType>(s);
+    Tensor<cpu, 3, DType> dy = out_grad[rnn_enum::kOut].get<cpu, 3, DType>(s);
+    CHECK(x.CheckContiguous());
+    CHECK(w.CheckContiguous());
+    CHECK(hx.CheckContiguous());
+    CHECK(y.CheckContiguous());
+    CHECK(dx.CheckContiguous());
+    CHECK(dw.CheckContiguous());
+    CHECK(dhx.CheckContiguous());
+    CHECK(dy.CheckContiguous());
+    param_.seq_length_ = x.shape_[0];
+    param_.batch_size_ = x.shape_[1];
+    param_.input_size_ = x.shape_[2];
+
+    const int direction = param_.bidirectional ? 2 : 1;
+    const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode);
+    DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize;
+
+    DType* dhy_ptr = NULL;
+    if (param_.state_outputs) {
+      dhy_ptr = out_grad[rnn_enum::kStateOut].dptr<DType>();
+    }
 
- private:
-  RNNParam param_;
+    DType* cx_ptr = NULL;
+    DType* dcx_ptr = NULL;
+    DType* dcy_ptr = NULL;
 
-  void LSTMFusedElementWiseCPUOps(const Tensor<cpu, 2, DType> &i2h_y,
-                                  const Tensor<cpu, 2, DType> &cx,
-                                  const Tensor<cpu, 2, DType> &h2h_y,
-                                  const Tensor<cpu, 2, DType> &y,
-                                  // holding intermediate layer output
-                                  const Tensor<cpu, 2, DType> &tmp,
-                                  const Tensor<cpu, 2, DType> &hy,
-                                  const Tensor<cpu, 2, DType> &cy,
-                                  const int64_t batch_size,
-                                  const int64_t h_channel,
-                                  const int64_t t,
-                                  const int reverse_dir,
-                                  const int copy_tmp2y) {
-    int64_t length = batch_size * h_channel;
-    #pragma omp parallel for
-    for (int64_t ji = 0; ji < length; ++ji) {
-      int64_t j = ji / h_channel;  // batch dim
-      int64_t i = ji % h_channel;
-      int64_t f = i + h_channel;
-      int64_t c = i + h_channel * 2;
-      int64_t o = i + h_channel * 3;
-      int64_t j_pos = j * h_channel * 4;
-      h2h_y.dptr_[j_pos + i] += i2h_y.dptr_[j_pos + i];
-      h2h_y.dptr_[j_pos + f] += i2h_y.dptr_[j_pos + f];
-      h2h_y.dptr_[j_pos + o] += i2h_y.dptr_[j_pos + o];
-      h2h_y.dptr_[j_pos + c] += i2h_y.dptr_[j_pos + c];
-      h2h_y.dptr_[j_pos + i] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + i]));
-      h2h_y.dptr_[j_pos + f] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + f]));
-      h2h_y.dptr_[j_pos + o] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + o]));
-      h2h_y.dptr_[j_pos + c] = tanh(h2h_y.dptr_[j_pos + c]);
-      cy[j][i] = h2h_y.dptr_[j_pos + f] * (t == 0 ? cx[j][i]:cy[j][i])
-          + h2h_y.dptr_[j_pos + i] * h2h_y.dptr_[j_pos + c];
-      hy[j][i] = h2h_y.dptr_[j_pos + o] * tanh(cy[j][i]);
-      tmp[j][i + h_channel * reverse_dir] = hy[j][i];
-      if (copy_tmp2y) {
-        y[j][i] = tmp[j][i];
-        if (reverse_dir)
-          y[j][i + h_channel] = tmp[j][i + h_channel];
+    if (param_.mode == rnn_enum::kLstm) {
+      CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell";
+      cx_ptr = in_data[rnn_enum::kStateCell].dptr<DType>();
+      dcx_ptr = in_grad[rnn_enum::kStateCell].dptr<DType>();
+      if (param_.state_outputs) {
+        dcy_ptr = out_grad[rnn_enum::kStateCellOut].dptr<DType>();
       }
     }
+
+    // allocate temp space
+    const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
+                                                      param_.state_size, direction, param_.mode);
+    Tensor<cpu, 1, DType> workspace = ctx.requested[rnn_enum::kTempSpace]
+        .get_space_typed<cpu, 1, DType>(Shape1(workspace_size), s);
+
+    size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction,
+                                           param_.seq_length_, param_.batch_size_,
+                                           param_.state_size, param_.mode);
+    if (!init_space_ || reserve_space_size_ != r_size) {
+      LOG(FATAL) << "Reserve space was not initialized by a prior Forward(is_train=true) call "
+                 << "or its size does not match; cannot run Backward.";
+    }
+
+    DType* reserve_space_ptr = static_cast<DType*>(reserve_space_.dptr);
+    RNNBackward<DType>(workspace.dptr_,
+                       reserve_space_ptr,
+                       param_.num_layers,
+                       direction,
+                       param_.seq_length_,
+                       param_.batch_size_,
+                       param_.input_size_,
+                       param_.state_size,
+                       x.dptr_,
+                       hx.dptr_,
+                       cx_ptr,
+                       w.dptr_,
+                       y.dptr_,
+                       dy.dptr_,
+                       dhy_ptr,
+                       dcy_ptr,
+                       dx.dptr_,
+                       dhx.dptr_,
+                       dcx_ptr,
+                       dw.dptr_,
+                       db_ptr,
+                       param_.mode);
   }
+
+ private:
+  RNNParam param_;
+  bool init_space_;
+  size_t reserve_space_size_;
+  Storage::Handle reserve_space_;
 };  // class RNNOp
 
 template<typename xpu>
@@ -429,10 +611,10 @@ class RNNProp : public OperatorProperty {
                         Shape3(total_layers, batch_size, param_.state_size));
 
     // calculate parameter vector length
-    int param_size = rnn_param_size(param_.num_layers,
+    int param_size = GetRnnParamSize(param_.num_layers,
                                     input_size,
                                     param_.state_size,
-                                    param_.bidirectional,
+                                    numDirections,
                                     param_.mode);
     SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size));
 
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index a60adbcd2fb..6da367d3b80 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -23,7 +23,6 @@
  * \brief
  * \author Sebastian Bodenstein
 */
-
 #include "./rnn-inl.h"
 
 namespace mxnet {
@@ -32,7 +31,7 @@ template<>
 Operator *CreateOp<cpu>(RNNParam param, int dtype) {
   Operator *op = NULL;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new RNNOp<cpu, DType>(param);
+    op = new RNNOp<DType>(param);
   });
   return op;
 }
@@ -46,7 +45,50 @@ Operator *RNNProp::CreateOperatorEx(Context ctx,
 DMLC_REGISTER_PARAMETER(RNNParam);
 
 MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp)
-.describe("Applies a recurrent layer to input.")
+.describe(R"code(Applies recurrent layers to input.
+Currently, vanilla RNN, LSTM and GRU are implemented, with
+ both multi-layer and bidirectional support.
+**Vanilla RNN**
+Applies a single-gate recurrent layer to input X. Two kinds of
+ activation function are supported: ReLU and tanh.
+
+ReLU activation function:
+
+.. math::
+    h_t = \mathrm{ReLU}(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})
+
+Tanh activation function:
+
+.. math::
+    h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})
+
+Reference paper: Finding Structure in Time - Elman, 1990.
+https://crl.ucsd.edu/~elman/Papers/fsit.pdf
+
+**LSTM**
+
+Long Short-Term Memory - Hochreiter & Schmidhuber, 1997.
+
+.. math::
+  \begin{array}{ll}
+            i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
+            f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
+            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
+            o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
+            c_t = f_t * c_{(t-1)} + i_t * g_t \\
+            h_t = o_t * \tanh(c_t)
+            \end{array}
+
+**GRU**
+
+Gated Recurrent Unit - Cho et al., 2014.
+http://arxiv.org/abs/1406.1078
+
+.. math::
+  \begin{array}{ll}
+            r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+            z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
+            h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\
+            \end{array})code")
 .add_argument("data", "NDArray-or-Symbol", "Input data to RNN")
 .add_argument("parameters", "NDArray-or-Symbol",
               "Vector of all RNN trainable parameters concatenated")
diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h
new file mode 100644
index 00000000000..2ee374bbf56
--- /dev/null
+++ b/src/operator/rnn_impl.h
@@ -0,0 +1,457 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file    rnn_impl.h
+ * \brief   CPU implementations of fused LSTM forward and backward passes.
+ * \author  Shu Zhang
+*/
+#ifndef MXNET_OPERATOR_RNN_IMPL_H_
+#define MXNET_OPERATOR_RNN_IMPL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./math.h"
+#include "./math_functions-inl.h"
+#include "./operator_common.h"
+#include "./mshadow_op.h"
+#include "./linalg.h"
+
+template<typename DType>
+inline DType sigmoid(DType x) {
+  return 1.0f / (1.0f + exp(-x));
+}
+
+template<typename DType>
+void LstmForwardTrainingSingleLayer(DType* ws,
+                                    DType* rs,
+                                    bool state_outputs,
+                                    bool bid,
+                                    const int T,
+                                    const int N,
+                                    const int I,
+                                    const int H,
+                                    const Tensor<cpu, 2, DType> &x,
+                                    const Tensor<cpu, 2, DType> &hx,
+                                    const Tensor<cpu, 2, DType> &cx,
+                                    const Tensor<cpu, 3, DType> &y,
+                                    DType* w_ptr,
+                                    DType* b_ptr,
+                                    DType* hy_ptr,
+                                    DType* cy_ptr) {
+  using namespace mshadow;
+  const Tensor<cpu, 2, DType> wx(w_ptr, Shape2(H * 4, I));
+  const Tensor<cpu, 2, DType> wh(w_ptr + I * H * 4, Shape2(H * 4, H));
+  const Tensor<cpu, 2, DType> bx(b_ptr, Shape2(4, H));
+  const Tensor<cpu, 2, DType> bh(b_ptr + H * 4, Shape2(4, H));
+  const Tensor<cpu, 2, DType> yx_flat(ws, Shape2(T * N, 4 * H));
+  const Tensor<cpu, 2, DType> yh_flat(ws + T * N * H * 4, Shape2(N, 4 * H));
+  const Tensor<cpu, 4, DType> yx(yx_flat.dptr_, Shape4(T, N, 4, H));
+  const Tensor<cpu, 3, DType> yh(yh_flat.dptr_, Shape3(N, 4, H));
+  Tensor<cpu, 2, DType> h(yh_flat.dptr_ + N * H * 4, Shape2(N, H));
+  DType *c_ptr = bid ? rs + T * N * H * 7 : rs;
+  Tensor<cpu, 3, DType> c(c_ptr, Shape3(T, N, H));
+  Tensor<cpu, 4, DType> ifgo(c_ptr + T * N * H, Shape4(T, N, H, 4));
+
+  const int offset = bid ? H : 0;
+  const DType alpha = 1.0;
+  const DType beta = 0.0;
+  const int cell_size = N * H;
+  linalg_gemm(x, wx, yx_flat, alpha, beta, false, true);
+
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  for (int i = 0; i < T; ++i) {
+    int t = bid ? T - 1 - i : i;
+    linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true);
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int jk = 0; jk < cell_size; ++jk) {
+      int j = jk / H;
+      int k = jk % H;
+      DType it = sigmoid<DType>(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]);
+      DType ft = sigmoid<DType>(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]);
+      DType gt =           tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]);
+      DType ot = sigmoid<DType>(yx[t][j][3][k] + yh[j][3][k] + bx[3][k] + bh[3][k]);
+      DType ct = (i ? c[i-1][j][k] : cx[j][k]) * ft + it * gt;
+      DType ht = ot * tanh(ct);
+      h[j][k] = ht;
+      // reserve
+      y[t][j][k + offset] = ht;
+      c[i][j][k] = ct;
+      ifgo[i][j][k][0] = it;
+      ifgo[i][j][k][1] = ft;
+      ifgo[i][j][k][2] = gt;
+      ifgo[i][j][k][3] = ot;
+      if (i == T - 1 && state_outputs) {
+        hy_ptr[jk] = ht;
+        cy_ptr[jk] = ct;
+      }
+    }
+  }
+}
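
The fused inner loop above evaluates the standard LSTM cell from the two
precomputed GEMM products (x*Wx^T and h*Wh^T). As a cross-check, here is a
minimal NumPy sketch of the same per-timestep update (names and shapes are
ours, for illustration only):

    import numpy as np

    def lstm_cell(x_t, h, c, wx, wh, bx, bh):
        # x_t: (N, I); h, c: (N, H); wx: (4H, I); wh: (4H, H); bx, bh: (4H,)
        gates = x_t @ wx.T + h @ wh.T + bx + bh       # (N, 4H)
        i, f, g, o = np.split(gates, 4, axis=1)       # gate order matches ifgo
        sigm = lambda a: 1.0 / (1.0 + np.exp(-a))
        i, f, o, g = sigm(i), sigm(f), sigm(o), np.tanh(g)
        c_new = f * c + i * g                         # cell state update
        h_new = o * np.tanh(c_new)                    # hidden state update
        return h_new, c_new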
+
+template <typename DType>
+void LstmForwardTraining(DType* ws,
+                         DType* rs,
+                         bool state_outputs,
+                         const int L,
+                         const int D,
+                         const int T,
+                         const int N,
+                         const int I,
+                         const int H,
+                         DType* x_ptr,
+                         DType* hx_ptr,
+                         DType* cx_ptr,
+                         DType* w_ptr,
+                         DType* b_ptr,
+                         DType* y_ptr,
+                         DType* hy_ptr,
+                         DType* cy_ptr) {
+  const int total_layers = D * L;
+  Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(total_layers, N, H));
+  Tensor<cpu, 3, DType> cx(cx_ptr, Shape3(total_layers, N, H));
+  const int b_size = 2 * H * 4;
+  const int r_size = D * T * N * H * 6;
+  const int y_offset = T * N * H * 5;
+  const int cell_size = N * H;
+  int idx = 0;  // index into the stacked hidden/cell state tensors
+  for (int i = 0; i < L; ++i) {
+    const int input_size = i ? H * D : I;
+    const int w_size = (input_size + H) * H * 4;
+    Tensor<cpu, 2, DType> x(x_ptr, Shape2(T * N, input_size));
+    Tensor<cpu, 3, DType> y(rs + y_offset, Shape3(T, N, H * D));
+    LstmForwardTrainingSingleLayer<DType>(ws, rs, state_outputs, false, T, N, input_size, H, x,
+                                          hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
+    if (D == 2) {
+      w_ptr += w_size;
+      b_ptr += b_size;
+      ++idx;
+      if (state_outputs) {
+        hy_ptr += cell_size;
+        cy_ptr += cell_size;
+      }
+      LstmForwardTrainingSingleLayer<DType>(ws, rs, state_outputs, true, T, N, input_size, H, x,
+                                            hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
+    }
+    if (i != L - 1) {
+      w_ptr += w_size;
+      b_ptr += b_size;
+      x_ptr = y.dptr_;
+      rs += r_size;
+      ++idx;
+      if (state_outputs) {
+        hy_ptr += cell_size;
+        cy_ptr += cell_size;
+      }
+    }
+  }
+  memcpy(y_ptr, rs + y_offset, T * N * H * D * sizeof(DType));
+}
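
The pointer bumps above walk one flat parameter blob: each layer/direction
owns (input_size + H) * 4H weight values (Wx then Wh) plus, in a separate
trailing region, 8H bias values. A small sketch of the weight offsets (a
hypothetical helper of ours, matching the w_size arithmetic in the loop):

    def lstm_weight_offsets(L, D, I, H):
        """Element offsets of each layer/direction's Wx block."""
        offsets, pos = [], 0
        for layer in range(L):
            input_size = I if layer == 0 else H * D
            for _ in range(D):
                offsets.append(pos)
                pos += (input_size + H) * 4 * H  # Wx (4H x input) + Wh (4H x H)
        return offsets

    print(lstm_weight_offsets(L=2, D=2, I=100, H=50))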
+
+template<typename DType>
+void LstmForwardInferenceSingleLayer(DType* ws,
+                                     bool state_outputs,
+                                     bool bid,
+                                     const int T,
+                                     const int N,
+                                     const int I,
+                                     const int H,
+                                     const Tensor<cpu, 2, DType> &x,
+                                     const Tensor<cpu, 2, DType> &hx,
+                                     const Tensor<cpu, 2, DType> &cx,
+                                     const Tensor<cpu, 3, DType> &y,
+                                     DType* w_ptr,
+                                     DType* b_ptr,
+                                     DType* hy_ptr,
+                                     DType* cy_ptr) {
+  using namespace mshadow;
+  const Tensor<cpu, 2, DType> wx(w_ptr, Shape2(H * 4, I));
+  const Tensor<cpu, 2, DType> wh(w_ptr + I * H * 4, Shape2(H * 4, H));
+  const Tensor<cpu, 2, DType> bx(b_ptr, Shape2(4, H));
+  const Tensor<cpu, 2, DType> bh(b_ptr + H * 4, Shape2(4, H));
+  Tensor<cpu, 2, DType> yx_flat(ws, Shape2(T * N, H * 4));
+  Tensor<cpu, 2, DType> yh_flat(ws + T * N * H * 4, Shape2(N, H * 4));
+  const Tensor<cpu, 4, DType> yx(yx_flat.dptr_, Shape4(T, N, 4, H));
+  const Tensor<cpu, 3, DType> yh(yh_flat.dptr_, Shape3(N, 4, H));
+  Tensor<cpu, 2, DType> h(yh_flat.dptr_ + N * H * 4, Shape2(N, H));
+  Tensor<cpu, 2, DType> c(h.dptr_ + N * H, Shape2(N, H));
+  const int offset = bid ? H : 0;
+  const DType alpha = 1.0;
+  const DType beta = 0.0;
+  const int cell_size = N * H;
+  linalg_gemm(x, wx, yx_flat, alpha, beta, false, true);
+
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  for (int i = 0; i < T; ++i) {
+    int t = bid ? T - 1 - i : i;
+    linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true);
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int jk = 0; jk < cell_size; ++jk) {
+      int j = jk / H;
+      int k = jk % H;
+      DType it = sigmoid<DType>(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]);
+      DType ft = sigmoid<DType>(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]);
+      DType gt =           tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]);
+      DType ot = sigmoid<DType>(yx[t][j][3][k] + yh[j][3][k] + bx[3][k] + bh[3][k]);
+      DType ct = (i ? c[j][k] : cx[j][k]) * ft + it * gt;
+      DType ht = ot * tanh(ct);
+      y[t][j][k + offset] = ht;
+      if (i == T - 1 && state_outputs) {
+        hy_ptr[jk] = ht;
+        cy_ptr[jk] = ct;
+      } else {
+        h[j][k] = ht;
+        c[j][k] = ct;
+      }
+    }
+  }
+}
+
+template <typename DType>
+void LstmForwardInference(DType* ws,
+                          bool state_outputs,
+                          const int L,
+                          const int D,
+                          const int T,
+                          const int N,
+                          const int I,
+                          const int H,
+                          DType* x_ptr,
+                          DType* hx_ptr,
+                          DType* cx_ptr,
+                          DType* w_ptr,
+                          DType* b_ptr,
+                          DType* y_ptr,
+                          DType* hy_ptr,
+                          DType* cy_ptr) {
+  const int total_layers = D * L;
+  Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(total_layers, N, H));
+  Tensor<cpu, 3, DType> cx(cx_ptr, Shape3(total_layers, N, H));
+  const int b_size = 2 * H * 4;
+  const int cell_size = N * H;
+  DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2;
+  DType* y_cur_ptr = y_ptr;
+  int idx = 0;  // index into the stacked hidden/cell state tensors
+  // Alternate output buffers so that the final layer always writes to y_ptr.
+  bool flag = (L % 2 == 0);
+  for (int i = 0; i < L; ++i) {
+    const int input_size = i ? H * D : I;
+    const int w_size = (input_size + H) * H * 4;
+    // If bidirectional, need space to save current layer output y.
+    if (D == 2) {
+      y_cur_ptr = flag ? y_tmp_ptr : y_ptr;
+      flag = !flag;
+    }
+    Tensor<cpu, 2, DType> x(x_ptr, Shape2(T * N, input_size));
+    Tensor<cpu, 3, DType> y(y_cur_ptr, Shape3(T, N, H * D));
+    LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, false, T, N, input_size, H,
+                                           x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
+    // If bidirectional, then calculate the reverse direction's forward result.
+    if (D == 2) {
+      w_ptr += w_size;
+      b_ptr += b_size;
+      ++idx;
+      if (state_outputs) {
+        hy_ptr += cell_size;
+        cy_ptr += cell_size;
+      }
+      LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, true, T, N, input_size, H,
+                                             x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
+    }
+    // Don't need to move pointer in the last layer.
+    if (i != L - 1) {
+      w_ptr += w_size;
+      b_ptr += b_size;
+      x_ptr = y_cur_ptr;
+      ++idx;
+      if (state_outputs) {
+        hy_ptr += cell_size;
+        cy_ptr += cell_size;
+      }
+    }
+  }
+}
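
In the bidirectional case the loop above ping-pongs between y_tmp and y so a
layer never overwrites its own input, and the initialization of flag
guarantees the final layer lands in y_ptr. (In the unidirectional case the
in-place write is safe because x is fully consumed by the initial GEMM before
y is written.) A quick check of that invariant (hypothetical helper, not part
of the PR):

    def output_buffers(L):
        """Which buffer each of L stacked layers writes to (D == 2 path)."""
        flag = (L % 2 == 0)           # mirrors the initialization above
        order = []
        for _ in range(L):
            order.append('y_tmp' if flag else 'y')
            flag = not flag
        return order

    # the last layer always writes to y, so no final copy is needed
    assert all(output_buffers(L)[-1] == 'y' for L in range(1, 8))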
+
+template <typename DType>
+void LstmBackwardSingleLayer(DType* ws,
+                             DType* rs,
+                             bool bid,
+                             const int T,
+                             const int N,
+                             const int I,
+                             const int H,
+                             const Tensor<cpu, 2, DType> &x,
+                             const Tensor<cpu, 2, DType> &hx,
+                             const Tensor<cpu, 2, DType> &cx,
+                             const Tensor<cpu, 3, DType> &y,
+                             const Tensor<cpu, 3, DType> &dy,
+                             const Tensor<cpu, 2, DType> &dx,
+                             const Tensor<cpu, 2, DType> &dhx,
+                             const Tensor<cpu, 2, DType> &dcx,
+                             DType* dhy_ptr,
+                             DType* dcy_ptr,
+                             DType* w_ptr,
+                             DType* dw_ptr,
+                             DType* db_ptr) {
+  using namespace mshadow;
+  const Tensor<cpu, 2, DType> wx(w_ptr, Shape2(H * 4, I));
+  const Tensor<cpu, 2, DType> wh(w_ptr + I * H * 4, Shape2(H * 4, H));
+  Tensor<cpu, 2, DType> dwx(dw_ptr, Shape2(H * 4, I));
+  Tensor<cpu, 2, DType> dwh(dw_ptr + I * H * 4, Shape2(H * 4, H));
+  Tensor<cpu, 1, DType> dbx(db_ptr, Shape1(H * 4));
+  Tensor<cpu, 1, DType> dbh(dbx.dptr_ + H * 4, Shape1(H * 4));
+  DType *c_ptr = bid ? rs + T * N * H * 7 : rs;
+  const Tensor<cpu, 3, DType> c(c_ptr, Shape3(T, N, H));
+  const Tensor<cpu, 4, DType> ifgo(c_ptr + T * N * H, Shape4(T, N, H, 4));
+  memset(dwh.dptr_, 0, H * H * 4 * sizeof(DType));
+  memset(dbx.dptr_, 0, H * 4 * sizeof(DType));
+  memset(dbh.dptr_, 0, H * 4 * sizeof(DType));
+  Tensor<cpu, 4, DType> difgo(ws, Shape4(T, N, 4, H));
+  Tensor<cpu, 2, DType> dh(ws + T * N * H * 4, Shape2(N, H));
+  Tensor<cpu, 2, DType> dc(dh.dptr_ + N * H, Shape2(N, H));
+  Tensor<cpu, 2, DType> htmp(dc.dptr_ + N * H, Shape2(N, H));
+  const int offset = bid ? H : 0;
+  const DType alpha = 1.0;
+  const DType beta0 = 0.0;
+  const DType beta1 = 1.0;
+  const int cell_size = N * H;
+  if (dhy_ptr != NULL) {
+    memcpy(dh.dptr_, dhy_ptr, cell_size * sizeof(DType));
+  }
+  if (dcy_ptr != NULL) {
+    memcpy(dc.dptr_, dcy_ptr, cell_size * sizeof(DType));
+  }
+
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  for (int i = T - 1; i >= 0; --i) {
+    int t = bid ? T - 1 - i : i;
+    int tnext = bid ? t + 1 : t - 1;
+    const Tensor<cpu, 2, DType>& dhnext = i ? dh : dhx;
+    const Tensor<cpu, 2, DType>& dcnext = i ? dc : dcx;
+    const Tensor<cpu, 2, DType>& hnext = i ? htmp : hx;
+    const Tensor<cpu, 2, DType>& cnext = i ? c[i - 1] : cx;
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int jk = 0; jk < cell_size; ++jk) {
+      int j = jk / H;
+      int k = jk % H;
+      DType tc = tanh(c[i][j][k]);
+      DType it = ifgo[i][j][k][0];
+      DType ft = ifgo[i][j][k][1];
+      DType gt = ifgo[i][j][k][2];
+      DType ot = ifgo[i][j][k][3];
+      dh[j][k] += dy[t][j][k + offset];
+      dc[j][k] += dh[j][k] * ot * (1 - tc * tc);
+      difgo[t][j][0][k] = dc[j][k] * gt * it * (1 - it);
+      difgo[t][j][1][k] = dc[j][k] * cnext[j][k] * ft * (1 - ft);
+      difgo[t][j][2][k] = dc[j][k] * it * (1 - gt * gt);
+      difgo[t][j][3][k] = dh[j][k] * tc * ot * (1 - ot);
+      dcnext[j][k] = dc[j][k] * ft;
+      if (i) {
+        htmp[j][k] = y[tnext][j][k + offset];
+      }
+    }
+    Tensor<cpu, 2, DType> dyh(difgo[t].dptr_, Shape2(N, H * 4));
+    linalg_gemm(dyh, wh, dhnext, alpha, beta0, false, false);
+    linalg_gemm(dyh, hnext, dwh, alpha, beta1, true, false);
+  }
+  Tensor<cpu, 2, DType> dyx(difgo.dptr_, Shape2(T * N, H * 4));
+  linalg_gemm(dyx, wx, dx, alpha, bid ? beta1 : beta0, false, false);
+  linalg_gemm(dyx, x, dwx, alpha, beta0, true, false);
+  const int row = T * N;
+  const int col = H * 4;
+  for (int i = 0; i < row; ++i) {
+    for (int j = 0; j < col; ++j) {
+      dbx[j] += dyx[i][j];
+    }
+  }
+  // The input and hidden bias terms receive identical gradients.
+  for (int j = 0; j < col; ++j) {
+    dbh[j] = dbx[j];
+  }
+}
+
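
For reference, the element-wise gradients computed by the fused backward loop
follow directly from the forward cell equations (notation ours):

.. math::
  \begin{array}{ll}
    dh_t \mathrel{+}= dy_t \\
    dc_t \mathrel{+}= dh_t \cdot o_t \cdot (1 - \tanh^2 c_t) \\
    di_t = dc_t \cdot g_t \cdot i_t (1 - i_t) \\
    df_t = dc_t \cdot c_{t-1} \cdot f_t (1 - f_t) \\
    dg_t = dc_t \cdot i_t \cdot (1 - g_t^2) \\
    do_t = dh_t \cdot \tanh(c_t) \cdot o_t (1 - o_t) \\
    dc_{t-1} = dc_t \cdot f_t
  \end{array}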

  (This diff was longer than 20,000 lines, and has been truncated...)


 
