You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/20 01:03:59 UTC

[GitHub] aaronmarkham closed pull request #8942: Cloud install instructions - adding DLAMI docs link

aaronmarkham closed pull request #8942: Cloud install instructions - adding DLAMI docs link
URL: https://github.com/apache/incubator-mxnet/pull/8942
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 468be298b8..193f5b02c4 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -5,9 +5,15 @@
 ### Essentials ###
 - [ ] Passed code style checking (`make lint`)
 - [ ] Changes are complete (i.e. I finished coding on this PR)
-- [ ] All changes have test coverage
-- [ ] For user-facing API changes, API doc string has been updated. For new C++ functions in header files, their functionalities and arguments are well-documented. 
-- [ ] To my best knowledge, examples are either not affected by this change, or have been fixed to be compatible with this change
+- [ ] All changes have test coverage:
+- Unit tests are added for small changes to verify correctness (e.g. adding a new operator)
+- Nightly tests are added for complicated/long-running ones (e.g. changing distributed kvstore)
+- Build tests will be added for build configuration changes (e.g. adding a new build option with NCCL)
+- [ ] Code is well-documented: 
+- For user-facing API changes, API doc string has been updated. 
+- For new C++ functions in header files, their functionalities and arguments are documented. 
+- For new examples, README.md is added to explain what the example does, the source of the dataset, expected performance on the test set, and a reference to the original paper if applicable
+- [ ] To my best knowledge, examples are either not affected by this change, or have been fixed to be compatible with this change
 
 ### Changes ###
 - [ ] Feature1, tests, (and when applicable, API doc)
diff --git a/.gitignore b/.gitignore
index fbd62c9ec5..9d2e8944f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,6 +100,10 @@ input.txt*
 # ctags
 tags
 
+# cscope
+cscope.out
+cscope.files
+
 # Scala package
 *.class
 scala-package/*/target/
diff --git a/.gitmodules b/.gitmodules
index 4ad3e407f9..170c105a6f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,3 +19,6 @@
 [submodule "3rdparty/openmp"]
 	path = 3rdparty/openmp
 	url = https://github.com/llvm-mirror/openmp
+[submodule "3rdparty/googletest"]
+	path = 3rdparty/googletest
+	url = https://github.com/google/googletest.git
diff --git a/3rdparty/googletest b/3rdparty/googletest
new file mode 160000
index 0000000000..ec44c6c167
--- /dev/null
+++ b/3rdparty/googletest
@@ -0,0 +1 @@
+Subproject commit ec44c6c1675c25b9827aacd08c02433cccde7780
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50f60089cf..6e6b17880e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,7 @@ mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and foun
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON AND NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
-mxnet_option(USE_PROFILER         "Build with Profiler support"   OFF)
+mxnet_option(USE_PROFILER         "Build with Profiler support"   ON)
 mxnet_option(USE_DIST_KVSTORE     "Build with DIST_KVSTORE support" OFF)
 mxnet_option(USE_PLUGINS_WARPCTC  "Use WARPCTC Plugins" OFF)
 mxnet_option(USE_PLUGIN_CAFFE     "Use Caffe Plugin" OFF)
@@ -47,6 +47,7 @@ mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
 mxnet_option(USE_GPROF            "Compile with gprof (profiling) flag" OFF)
 mxnet_option(USE_VTUNE            "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
 mxnet_option(INSTALL_EXAMPLES     "Install the example source files." OFF)
+mxnet_option(USE_SIGNAL_HANDLER   "Print stack traces on segfaults." OFF)
 
 
 
@@ -87,6 +88,9 @@ else(MSVC)
   check_cxx_compiler_flag("-std=c++0x"   SUPPORT_CXX0X)
   check_cxx_compiler_flag("-msse2"       SUPPORT_MSSE2)
   set(CMAKE_C_FLAGS "-Wall -Wno-unknown-pragmas -fPIC -Wno-sign-compare")
+  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$")
+    set(CMAKE_C_FLAGS "-Wno-braced-scalar-init")
+  endif()
   if(CMAKE_BUILD_TYPE STREQUAL "Debug")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -g")
   elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
@@ -311,6 +315,15 @@ if(USE_JEMALLOC)
 endif()
 
 include(CTest)
+set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/googletest/googletest")
+set(GTEST_INCLUDE_DIR ${GTEST_ROOT}/include)
+#set(GTEST_BOTH_LIBRARIES gtest gtest_main)
+set(GTEST_LIBRARIES gtest gtest_main)
+set(GTEST_MAIN_LIBRARY gtest_main)
+set(GTEST_LIBRARY gtest)
+
+add_subdirectory(${GTEST_ROOT})
+find_package(GTest REQUIRED)
 
 # cudnn detection
 if(USE_CUDNN AND USE_CUDA)
@@ -571,6 +584,10 @@ if (INSTALL_EXAMPLES)
   install(DIRECTORY example  DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME})
 endif()
 
+if (USE_SIGNAL_HANDLER)
+	add_definitions(-DMXNET_USE_SIGNAL_HANDLER=1)
+endif()
+
 # AUTO_INSTALL_DIR -> Optional: specify post-build install directory
 if(AUTO_INSTALL_DIR)
   # ---[ Install Includes
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 7209b7c6a5..9d8542ec2d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -150,3 +150,6 @@ List of Contributors
 * [Manu Seth](https://github.com/mseth10/)
 * [Calum Leslie](https://github.com/calumleslie)
 * [Andre Tamm](https://github.com/andretamm)
+* [Marco de Abreu](https://github.com/marcoabreu)
+ - Marco is the creator of the current MXNet CI.
+* [Julian Salazar](https://github.com/JulianSlzr)
diff --git a/Jenkinsfile b/Jenkinsfile
index cbe63758ac..731e288372 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -16,7 +16,9 @@ def init_git() {
   deleteDir()
   retry(5) {
     try {
-      timeout(time: 2, unit: 'MINUTES') {
+      // Make sure we wait long enough for the quota. Important: don't increase the number of
+      // retries, as this will increase the amount of requests and worsen the throttling
+      timeout(time: 15, unit: 'MINUTES') {
         checkout scm
         sh 'git submodule update --init'
         sh 'git clean -d -f'        
@@ -52,12 +54,12 @@ def init_git_win() {
 def make(docker_type, make_flag) {
   timeout(time: max_time, unit: 'MINUTES') {
     try {
-      sh "${docker_run} ${docker_type} make ${make_flag}"
+      sh "${docker_run} ${docker_type} --dockerbinary docker make ${make_flag}"
     } catch (exc) {
       echo 'Incremental compilation failed with ${exc}. Fall back to build from scratch'
-      sh "${docker_run} ${docker_type} sudo make clean"
-      sh "${docker_run} ${docker_type} sudo make -C amalgamation/ clean"
-      sh "${docker_run} ${docker_type} make ${make_flag}"
+      sh "${docker_run} ${docker_type} --dockerbinary docker sudo make clean"
+      sh "${docker_run} ${docker_type} --dockerbinary docker sudo make -C amalgamation/ clean"
+      sh "${docker_run} ${docker_type} --dockerbinary docker make ${make_flag}"
     }
   }
 }
@@ -85,17 +87,17 @@ echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
 // Python 2
 def python2_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/unittest"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/train"
+    sh "${docker_run} ${docker_type} --dockerbinary docker find . -name '*.pyc' -type f -delete"
+    sh "${docker_run} ${docker_type} --dockerbinary docker PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/unittest"
+    sh "${docker_run} ${docker_type} --dockerbinary docker PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/train"
   }
 }
 
 // Python 3
 def python3_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
-    sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest"
+    sh "${docker_run} ${docker_type} --dockerbinary docker find . -name '*.pyc' -type f -delete"
+    sh "${docker_run} ${docker_type} --dockerbinary docker PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest"
   }
 }
 
@@ -118,426 +120,445 @@ def python3_gpu_ut(docker_type) {
 }
 
 try {
-    stage("Sanity Check") {
-      timeout(time: max_time, unit: 'MINUTES') {
-        node('mxnetlinux') {
-          ws('workspace/sanity') {
-            init_git()
-            sh "python tools/license_header.py check"
-            make('lint', 'cpplint rcpplint jnilint')
-            make('lint', 'pylint')
-          }
+  stage("Sanity Check") {
+    timeout(time: max_time, unit: 'MINUTES') {
+      node('mxnetlinux-cpu') {
+        ws('workspace/sanity') {
+          init_git()
+          sh "python tools/license_header.py check"
+          make('lint', 'cpplint rcpplint jnilint')
+          make('lint', 'pylint')
         }
       }
     }
+  }
 
-    stage('Build') {
-      parallel 'CPU: Openblas': {
-        node('mxnetlinux') {
-          ws('workspace/build-cpu') {
-            init_git()
-            def flag = """ \
-    DEV=1                         \
-    USE_PROFILER=1                \
-    USE_CPP_PACKAGE=1             \
-    USE_BLAS=openblas             \
-    -j\$(nproc)
-    """
-            make("cpu", flag)
-            pack_lib('cpu')
-          }
+  stage('Build') {
+    parallel 'CPU: Openblas': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/build-cpu') {
+          init_git()
+          def flag = """ \
+            DEV=1                         \
+            USE_PROFILER=1                \
+            USE_CPP_PACKAGE=1             \
+            USE_BLAS=openblas             \
+            -j\$(nproc)
+            """
+          make("cpu", flag)
+          pack_lib('cpu')
         }
-      },
-      'GPU: CUDA7.5+cuDNN5': {
-        node('mxnetlinux') {
-          ws('workspace/build-gpu') {
-            init_git()
-            def flag = """ \
-    DEV=1                         \
-    USE_PROFILER=1                \
-    USE_BLAS=openblas             \
-    USE_CUDA=1                    \
-    USE_CUDA_PATH=/usr/local/cuda \
-    USE_CUDNN=1                   \
-    USE_CPP_PACKAGE=1             \
-    -j\$(nproc)
-    """
-            make('gpu', flag)
-            pack_lib('gpu')
-            stash includes: 'build/cpp-package/example/test_score', name: 'cpp_test_score'
-          }
+      }
+    },
+    'CPU: MKLML': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/build-mklml-cpu') {
+          init_git()
+          def flag = """ \
+            DEV=1                         \
+            USE_PROFILER=1                \
+            USE_CPP_PACKAGE=1             \
+            USE_BLAS=openblas             \
+            USE_MKL2017=1                 \
+            USE_MKL2017_EXPERIMENTAL=1    \
+            -j\$(nproc)
+            """
+          make("cpu_mklml", flag)
+          pack_lib('mklml_cpu')
         }
-      },
-      'Amalgamation MIN': {
-        node('mxnetlinux') {
-          ws('workspace/amalgamationmin') {
-            init_git()
-            make('cpu', '-C amalgamation/ clean')
-            make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1')
-          }
+      }
+    },
+    'GPU: MKLML': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/build-mklml-gpu') {
+          init_git()
+          def flag = """ \
+            DEV=1                         \
+            USE_PROFILER=1                \
+            USE_CPP_PACKAGE=1             \
+            USE_BLAS=openblas             \
+            USE_MKL2017=1                 \
+            USE_MKL2017_EXPERIMENTAL=1    \
+            USE_CUDA=1                    \
+            USE_CUDA_PATH=/usr/local/cuda \
+            USE_CUDNN=1                   \
+            -j\$(nproc)
+            """
+          make("build_cuda", flag)
+          pack_lib('mklml_gpu')
+        }
+      }
+    },
+    'GPU: CUDA8.0+cuDNN5': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/build-gpu') {
+          init_git()
+          def flag = """ \
+            DEV=1                         \
+            USE_PROFILER=1                \
+            USE_BLAS=openblas             \
+            USE_CUDA=1                    \
+            USE_CUDA_PATH=/usr/local/cuda \
+            USE_CUDNN=1                   \
+            USE_CPP_PACKAGE=1             \
+            -j\$(nproc)
+            """
+          make('build_cuda', flag)
+          pack_lib('gpu')
+          stash includes: 'build/cpp-package/example/test_score', name: 'cpp_test_score'
+        }
+      }
+    },
+    'Amalgamation MIN': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/amalgamationmin') {
+          init_git()
+          make('cpu', '-C amalgamation/ clean')
+          make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1')
         }
-      },
-      'Amalgamation': {
-        node('mxnetlinux') {
-          ws('workspace/amalgamation') {
-            init_git()
-            make('cpu', '-C amalgamation/ clean')
-            make('cpu', '-C amalgamation/ USE_BLAS=openblas')
+      }
+    },
+    'Amalgamation': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/amalgamation') {
+          init_git()
+          make('cpu', '-C amalgamation/ clean')
+          make('cpu', '-C amalgamation/ USE_BLAS=openblas')
+        }
+      }
+    },
+    'Build CPU windows':{
+      node('mxnetwindows-cpu') {
+        ws('workspace/build-cpu') {
+          withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
+            init_git_win()
+            bat """mkdir build_vc14_cpu
+              call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
+              cd build_vc14_cpu
+              cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
+            bat 'C:\\mxnet\\build_vc14_cpu.bat'
+
+            bat '''rmdir /s/q pkg_vc14_cpu
+              mkdir pkg_vc14_cpu\\lib
+              mkdir pkg_vc14_cpu\\python
+              mkdir pkg_vc14_cpu\\include
+              mkdir pkg_vc14_cpu\\build
+              copy build_vc14_cpu\\Release\\libmxnet.lib pkg_vc14_cpu\\lib
+              copy build_vc14_cpu\\Release\\libmxnet.dll pkg_vc14_cpu\\build
+              xcopy python pkg_vc14_cpu\\python /E /I /Y
+              xcopy include pkg_vc14_cpu\\include /E /I /Y
+              xcopy dmlc-core\\include pkg_vc14_cpu\\include /E /I /Y
+              xcopy mshadow\\mshadow pkg_vc14_cpu\\include\\mshadow /E /I /Y
+              xcopy nnvm\\include pkg_vc14_cpu\\nnvm\\include /E /I /Y
+              del /Q *.7z
+              7z.exe a vc14_cpu.7z pkg_vc14_cpu\\
+              '''
+            stash includes: 'vc14_cpu.7z', name: 'vc14_cpu'
           }
         }
-      },
-      'GPU: MKLML': {
-        node('mxnetlinux') {
-          ws('workspace/build-mklml') {
-            init_git()
-            def flag = """ \
-    DEV=1                         \
-    USE_PROFILER=1                \
-    USE_BLAS=openblas             \
-    USE_MKL2017=1                 \
-    USE_MKL2017_EXPERIMENTAL=1    \
-    USE_CUDA=1                    \
-    USE_CUDA_PATH=/usr/local/cuda \
-    USE_CUDNN=1                   \
-    USE_CPP_PACKAGE=1             \
-    -j\$(nproc)
-    """
-            make('mklml_gpu', flag)
-            pack_lib('mklml')
+      }
+    },
+    // TODO: Set a specific CUDA_ARCH for Windows builds in cmake
+    'Build GPU windows':{
+      node('mxnetwindows-cpu') {
+        ws('workspace/build-gpu') {
+          withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
+           init_git_win()
+           bat """mkdir build_vc14_gpu
+             call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
+             cd build_vc14_gpu
+             cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
+           bat 'C:\\mxnet\\build_vc14_gpu.bat'
+           bat '''rmdir /s/q pkg_vc14_gpu
+             mkdir pkg_vc14_gpu\\lib
+             mkdir pkg_vc14_gpu\\python
+             mkdir pkg_vc14_gpu\\include
+             mkdir pkg_vc14_gpu\\build
+             copy build_vc14_gpu\\libmxnet.lib pkg_vc14_gpu\\lib
+             copy build_vc14_gpu\\libmxnet.dll pkg_vc14_gpu\\build
+             xcopy python pkg_vc14_gpu\\python /E /I /Y
+             xcopy include pkg_vc14_gpu\\include /E /I /Y
+             xcopy dmlc-core\\include pkg_vc14_gpu\\include /E /I /Y
+             xcopy mshadow\\mshadow pkg_vc14_gpu\\include\\mshadow /E /I /Y
+             xcopy nnvm\\include pkg_vc14_gpu\\nnvm\\include /E /I /Y
+             del /Q *.7z
+             7z.exe a vc14_gpu.7z pkg_vc14_gpu\\
+             '''
+           stash includes: 'vc14_gpu.7z', name: 'vc14_gpu'
           }
         }
-      },
-      'CPU windows':{
-        node('mxnetwindows') {
-          ws('workspace/build-cpu') {
-            withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
-              init_git_win()
-              bat """mkdir build_vc14_cpu
-    call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
-    cd build_vc14_cpu
-    cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
-              bat 'C:\\mxnet\\build_vc14_cpu.bat'
-
-              bat '''rmdir /s/q pkg_vc14_cpu
-    mkdir pkg_vc14_cpu\\lib
-    mkdir pkg_vc14_cpu\\python
-    mkdir pkg_vc14_cpu\\include
-    mkdir pkg_vc14_cpu\\build
-    copy build_vc14_cpu\\Release\\libmxnet.lib pkg_vc14_cpu\\lib
-    copy build_vc14_cpu\\Release\\libmxnet.dll pkg_vc14_cpu\\build
-    xcopy python pkg_vc14_cpu\\python /E /I /Y
-    xcopy include pkg_vc14_cpu\\include /E /I /Y
-    xcopy dmlc-core\\include pkg_vc14_cpu\\include /E /I /Y
-    xcopy mshadow\\mshadow pkg_vc14_cpu\\include\\mshadow /E /I /Y
-    xcopy nnvm\\include pkg_vc14_cpu\\nnvm\\include /E /I /Y
-    del /Q *.7z
-    7z.exe a vc14_cpu.7z pkg_vc14_cpu\\
-    '''
-              stash includes: 'vc14_cpu.7z', name: 'vc14_cpu'
-             }
-            }
-           }
-         },
-         'GPU windows':{
-           node('mxnetwindows') {
-             ws('workspace/build-gpu') {
-               withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
-                 init_git_win()
-                 bat """mkdir build_vc14_gpu
-    call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
-    cd build_vc14_gpu
-    cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
-                 bat 'C:\\mxnet\\build_vc14_gpu.bat'
-                 bat '''rmdir /s/q pkg_vc14_gpu
-    mkdir pkg_vc14_gpu\\lib
-    mkdir pkg_vc14_gpu\\python
-    mkdir pkg_vc14_gpu\\include
-    mkdir pkg_vc14_gpu\\build
-    copy build_vc14_gpu\\libmxnet.lib pkg_vc14_gpu\\lib
-    copy build_vc14_gpu\\libmxnet.dll pkg_vc14_gpu\\build
-    xcopy python pkg_vc14_gpu\\python /E /I /Y
-    xcopy include pkg_vc14_gpu\\include /E /I /Y
-    xcopy dmlc-core\\include pkg_vc14_gpu\\include /E /I /Y
-    xcopy mshadow\\mshadow pkg_vc14_gpu\\include\\mshadow /E /I /Y
-    xcopy nnvm\\include pkg_vc14_gpu\\nnvm\\include /E /I /Y
-    del /Q *.7z
-    7z.exe a vc14_gpu.7z pkg_vc14_gpu\\
-    '''
-                 stash includes: 'vc14_gpu.7z', name: 'vc14_gpu'
-               }
-             }
-           }
       }
     }
+  } // End of stage('Build')
 
-    stage('Unit Test') {
-      parallel 'Python2: CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python2-cpu') {
-            init_git()
-            unpack_lib('cpu')
-            python2_ut('cpu')
-          }
+  stage('Unit Test') {
+    parallel 'Python2: CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-python2-cpu') {
+          init_git()
+          unpack_lib('cpu')
+          python2_ut('cpu')
         }
-      },
-      'Python3: CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python3-cpu') {
-            init_git()
-            unpack_lib('cpu')
-            python3_ut('cpu')
-          }
+      }
+    },
+    'Python3: CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-python3-cpu') {
+          init_git()
+          unpack_lib('cpu')
+          python3_ut('cpu')
         }
-      },
-      'Python2: GPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python2-gpu') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            python2_gpu_ut('gpu')
-          }
+      }
+    },
+    'Python2: GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-python2-gpu') {
+          init_git()
+          unpack_lib('gpu', mx_lib)
+          python2_gpu_ut('gpu')
         }
-      },
-      'Python3: GPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python3-gpu') {
-            init_git()
-            unpack_lib('gpu', mx_lib)
-            python3_gpu_ut('gpu')
-          }
+      }
+    },
+    'Python3: GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-python3-gpu') {
+          init_git()
+          unpack_lib('gpu', mx_lib)
+          python3_gpu_ut('gpu')
         }
-      },
-      'Python2: MKLML-CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python2-mklml-cpu') {
-            init_git()
-            unpack_lib('mklml')
-            python2_ut('mklml_gpu')
-          }
+      }
+    },
+    'Python2: MKLML-CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-python2-mklml-cpu') {
+          init_git()
+          unpack_lib('mklml_cpu')
+          python2_ut('cpu_mklml')
         }
-      },
-      'Python2: MKLML-GPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python2-mklml-gpu') {
-            init_git()
-            unpack_lib('mklml')
-            python2_gpu_ut('mklml_gpu')
-          }
+      }
+    },
+    'Python2: MKLML-GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-python2-mklml-gpu') {
+          init_git()
+          unpack_lib('mklml_gpu')
+          python2_gpu_ut('gpu_mklml')
         }
-      },
-      'Python3: MKLML-CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python3-mklml-cpu') {
-            init_git()
-            unpack_lib('mklml')
-            python3_ut('mklml_gpu')
-          }
+      }
+    },
+    'Python3: MKLML-CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-python3-mklml-cpu') {
+          init_git()
+          unpack_lib('mklml_cpu')
+          python3_ut('cpu_mklml')
         }
-      },
-      'Python3: MKLML-GPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-python3-mklml-gpu') {
-            init_git()
-            unpack_lib('mklml')
-            python3_gpu_ut('mklml_gpu')
+      }
+    },
+    'Python3: MKLML-GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-python3-mklml-gpu') {
+          init_git()
+          unpack_lib('mklml_gpu')
+          python3_gpu_ut('gpu_mklml')
+        }
+      }
+    },
+    'Scala: CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-scala-cpu') {
+          init_git()
+          unpack_lib('cpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} cpu make scalapkg USE_BLAS=openblas"
+            sh "${docker_run} cpu make scalatest USE_BLAS=openblas"
           }
         }
-      },
-      'Scala: CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-scala-cpu') {
-            init_git()
-            unpack_lib('cpu')
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} cpu make scalapkg USE_BLAS=openblas"
-              sh "${docker_run} cpu make scalatest USE_BLAS=openblas"
-            }
+      }
+    },
+    'Perl: CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-perl-cpu') {
+          init_git()
+          unpack_lib('cpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+              sh "${docker_run} cpu ./perl-package/test.sh"
           }
         }
-      },
-      'Perl: CPU': {
-            node('mxnetlinux') {
-                ws('workspace/ut-perl-cpu') {
-                    init_git()
-                    unpack_lib('cpu')
-                    timeout(time: max_time, unit: 'MINUTES') {
-                        sh "${docker_run} cpu ./perl-package/test.sh"
-                    }
-                }
-            }
-      },
-      'Perl: GPU': {
-            node('mxnetlinux') {
-                ws('workspace/ut-perl-gpu') {
-                    init_git()
-                    unpack_lib('gpu')
-                    timeout(time: max_time, unit: 'MINUTES') {
-                        sh "${docker_run} gpu ./perl-package/test.sh"
-                    }
-                }
-            }
-      },
-      'R: CPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-r-cpu') {
-            init_git()
-            unpack_lib('cpu')
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} cpu rm -rf .Renviron"
-              sh "${docker_run} cpu mkdir -p /workspace/ut-r-cpu/site-library"
-              sh "${docker_run} cpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-cpu/site-library"
-              sh "${docker_run} cpu R CMD INSTALL --library=/workspace/ut-r-cpu/site-library R-package"
-              sh "${docker_run} cpu make rpkgtest R_LIBS=/workspace/ut-r-cpu/site-library"
-            }
+      }
+    },
+    'Perl: GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-perl-gpu') {
+          init_git()
+          unpack_lib('gpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+              sh "${docker_run} gpu ./perl-package/test.sh"
           }
         }
-      },
-      'R: GPU': {
-        node('mxnetlinux') {
-          ws('workspace/ut-r-gpu') {
-            init_git()
-            unpack_lib('gpu')
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} gpu rm -rf .Renviron"
-              sh "${docker_run} gpu mkdir -p /workspace/ut-r-gpu/site-library"
-              sh "${docker_run} gpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-gpu/site-library"
-              sh "${docker_run} gpu R CMD INSTALL --library=/workspace/ut-r-gpu/site-library R-package"
-              sh "${docker_run} gpu make rpkgtest R_LIBS=/workspace/ut-r-gpu/site-library R_GPU_ENABLE=1"
-            }
+      }
+    },
+    'R: CPU': {
+      node('mxnetlinux-cpu') {
+        ws('workspace/ut-r-cpu') {
+          init_git()
+          unpack_lib('cpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} cpu rm -rf .Renviron"
+            sh "${docker_run} cpu mkdir -p /workspace/ut-r-cpu/site-library"
+            sh "${docker_run} cpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-cpu/site-library"
+            sh "${docker_run} cpu R CMD INSTALL --library=/workspace/ut-r-cpu/site-library R-package"
+            sh "${docker_run} cpu make rpkgtest R_LIBS=/workspace/ut-r-cpu/site-library"
           }
         }
-      },
-      'Python 2: CPU Win':{
-        node('mxnetwindows') {
-          ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-    7z x -y vc14_cpu.7z'''
-            bat """xcopy C:\\mxnet\\data data /E /I /Y
-    xcopy C:\\mxnet\\model model /E /I /Y
-    call activate py2
-    set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-    del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-    C:\\mxnet\\test_cpu.bat"""
+      }
+    },
+    'R: GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-r-gpu') {
+          init_git()
+          unpack_lib('gpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} gpu rm -rf .Renviron"
+            sh "${docker_run} gpu mkdir -p /workspace/ut-r-gpu/site-library"
+            sh "${docker_run} gpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-gpu/site-library"
+            sh "${docker_run} gpu R CMD INSTALL --library=/workspace/ut-r-gpu/site-library R-package"
+            sh "${docker_run} gpu make rpkgtest R_LIBS=/workspace/ut-r-gpu/site-library R_GPU_ENABLE=1"
           }
-         }
-       },
-       'Python 3: CPU Win': {
-          node('mxnetwindows') {
-          ws('workspace/ut-python-cpu') {
-            init_git_win()
-            unstash 'vc14_cpu'
-            bat '''rmdir /s/q pkg_vc14_cpu
-    7z x -y vc14_cpu.7z'''
+        }
+      }
+    },
+    'Python 2: CPU Win':{
+      node('mxnetwindows-cpu') {
+        ws('workspace/ut-python-cpu') {
+          init_git_win()
+          unstash 'vc14_cpu'
+          bat '''rmdir /s/q pkg_vc14_cpu
+            7z x -y vc14_cpu.7z'''
           bat """xcopy C:\\mxnet\\data data /E /I /Y
-    xcopy C:\\mxnet\\model model /E /I /Y
-    call activate py3
-    set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
-    del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
-    C:\\mxnet\\test_cpu.bat"""
-          }
-         }
-       },
-       'Python 2: GPU Win':{
-         node('mxnetwindows') {
-           ws('workspace/ut-python-gpu') {
-             init_git_win()
-             unstash 'vc14_gpu'
-             bat '''rmdir /s/q pkg_vc14_gpu
-    7z x -y vc14_gpu.7z'''
-             bat """xcopy C:\\mxnet\\data data /E /I /Y
-    xcopy C:\\mxnet\\model model /E /I /Y
-    call activate py2
-    set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-    del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-    C:\\mxnet\\test_gpu.bat"""
-           }
-         }
-       },
-       'Python 3: GPU Win':{
-         node('mxnetwindows') {
-           ws('workspace/ut-python-gpu') {
-             init_git_win()
-             unstash 'vc14_gpu'
-             bat '''rmdir /s/q pkg_vc14_gpu
-    7z x -y vc14_gpu.7z'''
-             bat """xcopy C:\\mxnet\\data data /E /I /Y
-    xcopy C:\\mxnet\\model model /E /I /Y
-    call activate py3
-    set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
-    del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
-    C:\\mxnet\\test_gpu.bat"""
-           }
+            xcopy C:\\mxnet\\model model /E /I /Y
+            call activate py2
+            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+            del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+            C:\\mxnet\\test_cpu.bat"""
+        }
+       }
+    },
+    'Python 3: CPU Win': {
+       node('mxnetwindows-cpu') {
+       ws('workspace/ut-python-cpu') {
+         init_git_win()
+         unstash 'vc14_cpu'
+         bat '''rmdir /s/q pkg_vc14_cpu
+           7z x -y vc14_cpu.7z'''
+         bat """xcopy C:\\mxnet\\data data /E /I /Y
+           xcopy C:\\mxnet\\model model /E /I /Y
+           call activate py3
+           set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+           del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc
+           C:\\mxnet\\test_cpu.bat"""
          }
+      }
+    },
+    'Python 2: GPU Win':{
+      node('mxnetwindows-gpu') {
+        ws('workspace/ut-python-gpu') {
+          init_git_win()
+          unstash 'vc14_gpu'
+          bat '''rmdir /s/q pkg_vc14_gpu
+            7z x -y vc14_gpu.7z'''
+          bat """xcopy C:\\mxnet\\data data /E /I /Y
+            xcopy C:\\mxnet\\model model /E /I /Y
+            call activate py2
+            set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+            del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+            C:\\mxnet\\test_gpu.bat"""
         }
+      }
+    },
+    'Python 3: GPU Win':{
+      node('mxnetwindows-gpu') {
+        ws('workspace/ut-python-gpu') {
+         init_git_win()
+         unstash 'vc14_gpu'
+         bat '''rmdir /s/q pkg_vc14_gpu
+           7z x -y vc14_gpu.7z'''
+         bat """xcopy C:\\mxnet\\data data /E /I /Y
+           xcopy C:\\mxnet\\model model /E /I /Y
+           call activate py3
+           set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+           del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc
+           C:\\mxnet\\test_gpu.bat"""
+        }
+      }
     }
+  }
 
-    stage('Integration Test') {
-      parallel 'Python': {
-        node('mxnetlinux') {
-          ws('workspace/it-python-gpu') {
-            init_git()
-            unpack_lib('gpu')
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} gpu PYTHONPATH=./python/ python example/image-classification/test_score.py"
-            }
+  stage('Integration Test') {
+    parallel 'Python GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/it-python-gpu') {
+          init_git()
+          unpack_lib('gpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} gpu --dockerbinary nvidia-docker PYTHONPATH=./python/ python example/image-classification/test_score.py"
           }
         }
-      },
-      'Caffe': {
-        node('mxnetlinux') {
-          ws('workspace/it-caffe') {
-            init_git()
-            unpack_lib('gpu')
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} caffe_gpu PYTHONPATH=/caffe/python:./python python tools/caffe_converter/test_converter.py"
-            }
+      }
+    },
+    'Caffe GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/it-caffe') {
+          init_git()
+          unpack_lib('gpu')
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} caffe_gpu --dockerbinary nvidia-docker PYTHONPATH=/caffe/python:./python python tools/caffe_converter/test_converter.py"
           }
         }
-      },
-      'cpp-package': {
-        node('mxnetlinux') {
-          ws('workspace/it-cpp-package') {
-            init_git()
-            unpack_lib('gpu')
-            unstash 'cpp_test_score'
-            timeout(time: max_time, unit: 'MINUTES') {
-              sh "${docker_run} gpu cpp-package/tests/ci_test.sh"
-            }
+      }
+    },
+    'cpp-package GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/it-cpp-package') {
+          init_git()
+          unpack_lib('gpu')
+          unstash 'cpp_test_score'
+          timeout(time: max_time, unit: 'MINUTES') {
+            sh "${docker_run} gpu --dockerbinary nvidia-docker cpp-package/tests/ci_test.sh"
           }
         }
       }
     }
+  }
 
-    stage('Deploy') {
-      node('mxnetlinux') {
-        ws('workspace/docs') {
-          if (env.BRANCH_NAME == "master") {
-            init_git()
-            sh "make clean"
-            sh "make docs"
-          }
+  stage('Deploy') {
+    node('mxnetlinux-cpu') {
+      ws('workspace/docs') {
+        if (env.BRANCH_NAME == "master") {
+          init_git()
+          sh "make clean"
+          sh "make docs"
         }
       }
     }
+  }
   // set build status to success at the end
   currentBuild.result = "SUCCESS"
 } catch (caughtError) {
-    node("mxnetlinux") {
-        sh "echo caught ${caughtError}"
-        err = caughtError
-        currentBuild.result = "FAILURE"
-    }
+  node("mxnetlinux-cpu") {
+    sh "echo caught ${caughtError}"
+    err = caughtError
+    currentBuild.result = "FAILURE"
+  }
 } finally {
-    node("mxnetlinux") {
-        // Only send email if master failed
-        if (currentBuild.result == "FAILURE" && env.BRANCH_NAME == "master") {
-            emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}'
-        }
-        // Remember to rethrow so the build is marked as failing
-        if (err) {
-            throw err
-        }
+  node("mxnetlinux-cpu") {
+    // Only send email if master failed
+    if (currentBuild.result == "FAILURE" && env.BRANCH_NAME == "master") {
+      emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}'
     }
+    // Remember to rethrow so the build is marked as failing
+    if (err) {
+      throw err
+    }
+  }
 }
diff --git a/Makefile b/Makefile
index 72dd26e0e4..8584ab658e 100644
--- a/Makefile
+++ b/Makefile
@@ -145,6 +145,7 @@ ifeq ($(USE_LAPACK), 1)
 ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
+ifeq (,$(wildcard /usr/lib64/liblapack.a))
 ifeq (,$(wildcard $(USE_LAPACK_PATH)/liblapack.a))
 	USE_LAPACK = 0
 endif
@@ -152,6 +153,7 @@ endif
 endif
 endif
 endif
+endif
 
 # lapack settings.
 ifeq ($(USE_LAPACK), 1)
@@ -267,6 +269,7 @@ ifeq ($(CUDA_ARCH),)
 	CUDA_ARCH += $(shell $(NVCC) -cuda $(COMPRESS) --x cu /dev/null -o /dev/null >/dev/null 2>&1 && \
 						 echo $(COMPRESS))
 endif
+$(info Running CUDA_ARCH: $(CUDA_ARCH))
 endif
 
 # ps-lite
@@ -330,6 +333,9 @@ ifeq ($(USE_CUDA), 1)
 	CFLAGS += -I$(ROOTDIR)/3rdparty/cub
 	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ)
 	LDFLAGS += -lcuda -lcufft -lnvrtc
+	# Make sure to add stubs as fallback in order to be able to build 
+	# without full CUDA install (especially if run without nvidia-docker)
+	LDFLAGS += -L/usr/local/cuda/lib64/stubs
 	SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-gpu
 	ifeq ($(USE_NCCL), 1)
 		ifneq ($(USE_NCCL_PATH), NONE)
diff --git a/NEWS.md b/NEWS.md
index 666b5d88e6..740621038d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,18 @@
 MXNet Change Log
 ================
+## 0.12.1
+### Bug-fixes
+  - Added GPU support for the `syevd` operator which ensures that there is GPU support for all linalg-operators.
+  - Bugfix for `syevd` on CPU such that it works for `float32`.
+  - Fixed API call when `OMP_NUM_THREADS` environment variable is set. 
+  - Fixed `MakeNonlossGradNode` bug.
+  - Fixed bug related to passing `dtype` to `array()`. 
+  - Fixed some minor bugs for sparse distributed training.
+  - Fixed a bug on `Slice` accessing uninitialized memory in `param.begin` in the file `matrix_op-inl.h`. 
+  - Fixed `gluon.data.RecordFileDataset`.
+  - Fixed a bug that caused `autograd` to crash on some networks.
+  
+  
 ## 0.12.0
 ### Performance
   - Added full support for NVIDIA Volta GPU Architecture and CUDA 9. Training CNNs is up to 3.5x faster than Pascal when using float16 precision.
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 3d57ea876f..6e0f93294b 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: mxnet
 Type: Package
 Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems
-Version: 0.12.1
+Version: 1.0.0
 Date: 2017-06-27
 Author: Tianqi Chen, Qiang Kou, Tong He
 Maintainer: Qiang Kou <qk...@qkou.info>
diff --git a/R-package/R/callback.R b/R-package/R/callback.R
index a0562386eb..bc96d770b2 100644
--- a/R-package/R/callback.R
+++ b/R-package/R/callback.R
@@ -11,8 +11,8 @@ mx.callback.log.train.metric <- function(period, logger=NULL) {
   function(iteration, nbatch, env, verbose=TRUE) {
     if (nbatch %% period == 0 && !is.null(env$metric)) {
       result <- env$metric$get(env$train.metric)
-      if (nbatch != 0 & verbose)
-        message(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value))
+      if (nbatch != 0 && verbose)
+        message("Batch [", nbatch, "] Train-", result$name, "=", result$value)
       if (!is.null(logger)) {
         if (class(logger) != "mx.metric.logger") {
           stop("Invalid mx.metric.logger.")
@@ -20,8 +20,8 @@ mx.callback.log.train.metric <- function(period, logger=NULL) {
         logger$train <- c(logger$train, result$value)
         if (!is.null(env$eval.metric)) {
           result <- env$metric$get(env$eval.metric)
-          if (nbatch != 0 & verbose)
-            message(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value))
+          if (nbatch != 0 && verbose)
+            message("Batch [", nbatch, "] Validation-", result$name, "=", result$value)
           logger$eval <- c(logger$eval, result$value)
         }
       }
@@ -48,9 +48,9 @@ mx.callback.log.speedometer <- function(batch.size, frequency=50){
         time <- as.double(difftime(Sys.time(), env$tic, units = "secs"))
         speed <- frequency*batch.size/time
         result <- env$metric$get(env$train.metric)
-        if (nbatch != 0 & verbose)
-          message(paste0("Batch [", nbatch, "] Speed: ", speed, " samples/sec Train-",
-                     result$name, "=", result$value))
+        if (nbatch != 0 && verbose)
+          message("Batch [", nbatch, "] Speed: ", speed, " samples/sec Train-",
+                     result$name, "=", result$value)
         env$tic = Sys.time()
       }      
     } else {
@@ -95,7 +95,7 @@ mx.callback.early.stop <- function(train.metric = NULL, eval.metric = NULL, bad.
     if (!is.null(env$metric)) {
       if (!is.null(train.metric)) {
         result <- env$metric$get(env$train.metric)
-        if ((maximize == F & result$value < train.metric) | (maximize == TRUE & result$value > train.metric)) {
+        if ((! maximize && result$value < train.metric) || (maximize && result$value > train.metric)) {
           return(FALSE)
         }
       }
@@ -104,7 +104,7 @@ mx.callback.early.stop <- function(train.metric = NULL, eval.metric = NULL, bad.
       if (!is.null(eval.metric)) {
         if (!is.null(env$eval.metric)) {
           result <- env$metric$get(env$eval.metric)
-          if ((maximize == F & result$value < eval.metric) | (maximize == TRUE & result$value > eval.metric)) {
+          if ((!maximize && result$value < eval.metric) || (maximize && result$value > eval.metric)) {
             return(FALSE)
           }
         }
@@ -135,11 +135,11 @@ mx.callback.early.stop <- function(train.metric = NULL, eval.metric = NULL, bad.
         
         result <- env$metric$get(env$eval.metric)
         
-        if ((maximize == F & result$value > mx.best.score) | (maximize == TRUE & result$value < mx.best.score)) {
+        if ((! maximize && result$value > mx.best.score) || (maximize && result$value < mx.best.score)) {
           
           if (mx.best.iter == bad.steps) {
             if (verbose) {
-              message(paste0("Best score=", mx.best.score, ", iteration [", iteration - bad.steps, "]"))
+              message("Best score=", mx.best.score, ", iteration [", iteration - bad.steps, "]")
             }
             return(FALSE)
           } else {
diff --git a/R-package/R/initializer.R b/R-package/R/initializer.R
index 9f5e75be91..40712432d8 100644
--- a/R-package/R/initializer.R
+++ b/R-package/R/initializer.R
@@ -61,17 +61,12 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
     
     fan_out = shape[length(shape)]
     fan_in  = prod(shape[-length(shape)])
-    factor_val  = 1
-    if (factor_type == "avg") {
-      factor_val = (fan_in + fan_out) / 2
-    } else if (factor_type == "in"){
-      factor_val = fan_in
-    } else if (factor_type == "out"){
-      factor_val = fan_out
-    } else {
-      stop("Not supported factor type. See usage of function mx.init.Xavier")
-    }
-    
+    factor_val <- switch(factor_type,
+                         "avg" = (fan_in + fan_out) / 2,
+                         "in" = fan_in,
+                         "out" = fan_out,
+                         stop("Not supported factor type. See usage of function mx.init.Xavier"))
+
     scale = sqrt(magnitude / factor_val)
     
     if (rnd_type == "uniform"){
@@ -95,9 +90,7 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
 mx.init.create <- function(initializer, shape.array, ctx=NULL, skip.unknown=TRUE) {
   if (length(shape.array) == 0) return(list())
   names = names(shape.array)
-  ret <- lapply(1 : length(names), function(i) {
-    initializer(names[[i]], shape.array[[i]], ctx, allow.unknown=skip.unknown)
-  })
+  ret <- lapply(seq_along(names), function(i) initializer(names[[i]], shape.array[[i]], ctx, allow.unknown=skip.unknown))
   names(ret) <- names
   if (skip.unknown) {
     ret <- mx.util.filter.null(ret)
diff --git a/R-package/R/lr_scheduler.R b/R-package/R/lr_scheduler.R
index bc89721094..8b032cdefe 100644
--- a/R-package/R/lr_scheduler.R
+++ b/R-package/R/lr_scheduler.R
@@ -19,12 +19,10 @@ mx.lr_scheduler.FactorScheduler <- function(step, factor_val, stop_factor_lr=1e-
       lr    <- lr * factor_val
       if(lr < stop_factor_lr){
         lr <- stop_factor_lr
-        if(verbose) message(paste0("Update[", num_update, 
-                               "]: now learning rate arrived at ", lr, 
-                               "will not change in the future"))
+        if(verbose) message("Update[", num_update, "]: now learning rate arrived at ",
+                            lr, "will not change in the future")
       } else{
-        if(verbose) message(paste0("Update[", num_update, 
-                               "]: learning rate is changed to ", lr))
+        if(verbose) message("Update[", num_update, "]: learning rate is changed to ", lr)
       }
       optimizerEnv$lr    <- lr
       optimizerEnv$count <- count      
@@ -62,12 +60,10 @@ mx.lr_scheduler.MultiFactorScheduler <- function(step, factor_val, stop_factor_l
         lr <-  lr * factor_val
         if(lr < stop_factor_lr){
           lr <- stop_factor_lr
-          if(verbose) message(paste0("Update[", num_update, 
-                                 "]: now learning rate arrived at ", lr, 
-                                 "will not change in the future"))
+          if(verbose) message("Update[", num_update, "]: now learning rate arrived at ",
+                              lr, "will not change in the future")
         } else{
-          if(verbose) message(paste0("Update[", num_update, 
-                                 "]: learning rate is changed to ", lr))
+          if(verbose) message("Update[", num_update, "]: learning rate is changed to ", lr)
           
         }
         optimizerEnv$lr           <- lr
diff --git a/R-package/R/metric.R b/R-package/R/metric.R
index 02572f4acd..f8d9c33a72 100644
--- a/R-package/R/metric.R
+++ b/R-package/R/metric.R
@@ -39,7 +39,7 @@ mx.metric.top_k_accuracy <- mx.metric.custom("top_k_accuracy", function(label, p
   if(top_k == 1){
     return(mx.metric.accuracy(label,pred))
   } else{
-    ypred <- apply(pred,2,function(x) order(x, decreasing=TRUE)[1:top_k])
+    ypred <- apply(pred,2,function(x) order(x, decreasing=TRUE)[seq_len(top_k)])
     ans <- apply(ypred, 2, is.num.in.vect, num = as.array(label + 1))
     acc <- sum(ans)/length(label)  
     return(acc)
diff --git a/R-package/R/mlp.R b/R-package/R/mlp.R
index 33134ffbf4..ecc30999d1 100644
--- a/R-package/R/mlp.R
+++ b/R-package/R/mlp.R
@@ -47,7 +47,7 @@ mx.mlp <- function(data, label, hidden_node = 1, out_node, dropout = NULL,
       stop(paste("Length of activation should be",m))
     }
   }
-  for (i in 1:m) {
+  for (i in seq_len(m)) {
     fc <- mx.symbol.FullyConnected(act, num_hidden=hidden_node[i])
     act <- mx.symbol.Activation(fc, act_type=activation[i])
     if (i == m && !is.null(dropout)) {
@@ -55,15 +55,11 @@ mx.mlp <- function(data, label, hidden_node = 1, out_node, dropout = NULL,
     }
   }
   fc <- mx.symbol.FullyConnected(act, num_hidden=out_node)
-  if (out_activation == "rmse") {
-    out <- mx.symbol.LinearRegressionOutput(fc)
-  } else if (out_activation == "softmax") {
-    out <- mx.symbol.SoftmaxOutput(fc)
-  } else if (out_activation == "logistic") {
-    out <- mx.symbol.LogisticRegressionOutput(fc)
-  } else {
-    stop("Not supported yet.")
-  }
+  out <- switch(out_activation,
+                "rmse" = mx.symbol.LinearRegressionOutput(fc),
+                "softmax" = mx.symbol.SoftmaxOutput(fc),
+                "logistic" = mx.symbol.LogisticRegressionOutput(fc),
+                stop("Not supported yet."))
   model <- mx.model.FeedForward.create(out, X=data, y=label, ctx = ctx, ...)
   return(model)
 }
diff --git a/R-package/R/model.R b/R-package/R/model.R
index f607ebb4f2..01b5ed7283 100644
--- a/R-package/R/model.R
+++ b/R-package/R/model.R
@@ -4,7 +4,7 @@ mx.model.slice.shape <- function(shape, nsplit) {
     ndim <- length(shape)
     batchsize <- shape[[ndim]]
     step <- as.integer((batchsize + nsplit - 1) / nsplit)
-    lapply(0:(nsplit - 1), function(k) {
+    lapply(seq_len(nsplit) - 1, function(k) {
       begin = min(k * step, batchsize)
       end = min((k + 1) * step, batchsize)
       s <- shape
@@ -16,7 +16,7 @@ mx.model.slice.shape <- function(shape, nsplit) {
     ndim <- length(shape[[1]])
     batchsize <- shape[[1]][[ndim]]
     step <- as.integer((batchsize + nsplit - 1) / nsplit)
-    lapply(0:(nsplit - 1), function(k) {
+    lapply(seq_len(nsplit) - 1, function(k) {
       begin = min(k * step, batchsize)
       end = min((k + 1) * step, batchsize)
       s <- lapply(shape, function(s) {
@@ -58,7 +58,7 @@ mx.model.extract.model <- function(symbol, train.execs) {
   # Get the parameters
   ndevice <- length(train.execs)
   narg <- length(train.execs[[1]]$ref.arg.arrays)
-  arg.params <- lapply(1:narg, function(k) {
+  arg.params <- lapply(seq_len(narg), function(k) {
     if (is.null(train.execs[[1]]$ref.grad.arrays[[k]])) {
       result <- NULL
     } else {
@@ -73,7 +73,7 @@ mx.model.extract.model <- function(symbol, train.execs) {
   # Get the auxiliary
   naux <- length(train.execs[[1]]$ref.aux.arrays)
   if (naux != 0) {
-    aux.params <- lapply(1:naux, function(k) {
+    aux.params <- lapply(seq_len(naux), function(k) {
       reduce.sum(lapply(train.execs, function(texec) {
         mx.nd.copyto(texec$ref.aux.arrays[[k]], mx.cpu())
       })) / ndevice
@@ -95,13 +95,13 @@ mx.model.create.kvstore <- function(kvstore, arg.params, ndevice, verbose=TRUE)
   }
   if (ndevice == 1) return (NULL)
   if (kvstore == "local") {
-    max.size <- max(as.integer(lapply(arg.params, length)))
+    max.size <- max(lengths(arg.params))
     if (max.size < 1024 * 1024 * 16) {
       kvstore <- 'local_update_cpu'
     } else {
       kvstore <- 'local_allreduce_cpu'
     }
-    if(verbose) message(paste0("Auto-select kvstore type = ", kvstore))
+    if(verbose) message("Auto-select kvstore type = ", kvstore)
   }
   return(mx.kv.create(kvstore))
 }
@@ -114,7 +114,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
                            epoch.end.callback, batch.end.callback,
                            kvstore, fixed.param = NULL, verbose = TRUE) {
   ndevice <- length(ctx)
-  if(verbose) message(paste0("Start training with ", ndevice, " devices"))
+  if(verbose) message("Start training with ", ndevice, " devices")
   # create the executors
   input_slice <- mx.model.slice.shape(input.shape, ndevice)
   output_slice <- mx.model.slice.shape(output.shape, ndevice)
@@ -122,7 +122,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
   arg_names <- arguments(symbol)
   output.names <- names(output.shape)
   #label_name <- arg_names[endsWith(arg_names, "label")]
-  train.execs <- lapply(1:ndevice, function(i) {
+  train.execs <- lapply(seq_len(ndevice), function(i) {
     arg_lst <- list(symbol = symbol, ctx = ctx[[i]], grad.req = "write")
     arg_lst <- append(arg_lst, input_slice[[i]]$shape)
     arg_lst <- append(arg_lst, output_slice[[i]]$shape)
@@ -137,7 +137,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
   # KVStore related stuffs
   params.index <-
     as.integer(mx.util.filter.null(
-      lapply(1:length(train.execs[[1]]$ref.grad.arrays), function(k) {
+      lapply(seq_along(train.execs[[1]]$ref.grad.arrays), function(k) {
         if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL
       })))
   update.on.kvstore <- FALSE
@@ -145,7 +145,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
     update.on.kvstore <- TRUE
     kvstore$set.optimizer(optimizer)
   } else {
-    updaters <- lapply(1:ndevice, function(i) {
+    updaters <- lapply(seq_len(ndevice), function(i) {
       mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
     })
   }
@@ -162,13 +162,13 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
     while (train.data$iter.next()) {
       # Get input data slice
       dlist <- train.data$value()
-      slices <- lapply(1:ndevice, function(i) {
+      slices <- lapply(seq_len(ndevice), function(i) {
         s <- input_slice[[i]]
         ret <- sapply(names(dlist), function(n) {mx.nd.slice(dlist[[n]], s$begin, s$end)})
         return(ret)
       })
       # copy data to executor
-      for (i in 1:ndevice) {
+      for (i in seq_len(ndevice)) {
         s <- slices[[i]]
         if (endsWith(output.names, "label")) {
           names(s)[endsWith(names(s), "label")] = output.names 
@@ -205,16 +205,16 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
             texec$ref.grad.arrays[params.index]
           }), -params.index)
         }
-        arg.blocks <- lapply(1:ndevice, function(i) {
+        arg.blocks <- lapply(seq_len(ndevice), function(i) {
           updaters[[i]](train.execs[[i]]$ref.arg.arrays, train.execs[[i]]$ref.grad.arrays)
         })
-        for (i in 1:ndevice) {
+        for (i in seq_len(ndevice)) {
           mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null=TRUE)
         }
       }
       # Update the evaluation metrics
       if (!is.null(metric)) {
-        for (i in 1 : ndevice) {
+        for (i in seq_len(ndevice)) {
           train.metric <- metric$update(slices[[i]][[length(slices[[i]])]], out.preds[[i]], train.metric)
         }
       }
@@ -227,7 +227,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
     train.data$reset()
     if (!is.null(metric)) {
       result <- metric$get(train.metric)
-      if(verbose) message(paste0("[", iteration, "] Train-", result$name, "=", result$value))
+      if(verbose) message("[", iteration, "] Train-", result$name, "=", result$value)
     }
     if (!is.null(eval.data)) {
       if (!is.null(metric)) {
@@ -235,12 +235,12 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
       }
       while (eval.data$iter.next()) {
         dlist <- eval.data$value()
-        slices <- lapply(1:ndevice, function(i) {
+        slices <- lapply(seq_len(ndevice), function(i) {
           s <- input_slice[[i]]
           ret <- sapply(names(dlist), function(n) {mx.nd.slice(dlist[[n]], s$begin, s$end)})
           return(ret)
         })
-        for (i in 1:ndevice) {
+        for (i in seq_len(ndevice)) {
           s <- slices[[i]]
           if (endsWith(output.names, "label")) {
             names(s)[endsWith(names(s), "label")] = output.names 
@@ -254,7 +254,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
           mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
         })
         if (!is.null(metric)) {
-          for (i in 1 : ndevice) {
+          for (i in seq_len(ndevice)) {
             eval.metric <- metric$update(slices[[i]][[length(slices[[i]])]] , out.preds[[i]], eval.metric)
           }
         }
@@ -262,7 +262,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
       eval.data$reset()
       if (!is.null(metric)) {
         result <- metric$get(eval.metric)
-        if(verbose) message(paste0("[", iteration, "] Validation-", result$name, "=", result$value))
+        if(verbose) message("[", iteration, "] Validation-", result$name, "=", result$value)
       }
     } else {
       eval.metric <- NULL
@@ -290,7 +290,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape,
 #' @param ctx mx.context. The devices used to perform initialization.
 #' @export
 mx.model.init.params <- function(symbol, input.shape, output.shape, initializer, ctx) {
-  if (!is.MXSymbol(symbol)) stop("symbol need to be MXSymbol")
+  if (!is.MXSymbol(symbol)) stop("symbol needs to be MXSymbol")
 
   arg_lst <- list(symbol = symbol)
   arg_lst <- append(arg_lst, input.shape)
@@ -310,7 +310,7 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) {
     if (is.train) stop("Need to provide parameter y for training with R arrays.")
     shape <- dim(X)
     ndim <- length(shape)
-    y <- c(1:shape[[ndim]]) * 0
+    y <- rep.int(0, times = shape[[ndim]])
   }
   batch.size <- min(length(y), batch.size)
   return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train))
@@ -349,21 +349,16 @@ mx.model.select.layout.predict <- function(X, model) {
   ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[2]], 1))
   if (!is.null(ret)) {
     names = names(model$arg.params)
-    for (i in 1:length(names)) {
-      if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) {
-        rowmajor <- 0
-      }
-    }
+    if (any(vapply(seq_along(names),
+                   function(i) any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]])),
+                   logical(1)))) rowmajor <- 0
   }
   # try col major
   ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[1]], 1))
   if (!is.null(ret)) {
-    names = names(model$arg.params)
-    for (i in 1:length(names)) {
-      if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) {
-        colmajor <- 0
-      }
-    }
+    if (any(vapply(seq_along(names),
+                   function(i) any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]])),
+                   logical(1)))) colmajor <- 0
   }
   if (rowmajor + colmajor != 1) {
     stop("Cannot auto select array.layout, please specify this parameter")
@@ -589,27 +584,20 @@ predict.MXFeedForwardModel <- function(model, X, ctx = NULL, array.batch.size =
 mx.model.load <- function(prefix, iteration) {
   symbol <- mx.symbol.load(path.expand(paste0(prefix, "-symbol.json")))
   save.dict <- mx.nd.load(path.expand(sprintf("%s-%04d.params", prefix, iteration)))
-  names <- names(save.dict)
-  arg.index <- as.integer(mx.util.filter.null(lapply(1:length(names), function(i) {
-    if (startsWith(names[[i]], "arg:")) i else NULL
-  })))
-  aux.index <- as.integer(mx.util.filter.null(lapply(1:length(names), function(i) {
-    if (startsWith(names[[i]], "aux:")) i else NULL
-  })))
+  nms <- names(save.dict)
+  
+  arg.index <- startsWith(nms, "arg:")
+  aux.index <- startsWith(nms, "aux:")
 
-  if (length(arg.index) != 0) {
+  if (any(arg.index)) {
     arg.params <- save.dict[arg.index]
-    names(arg.params) <- as.character(lapply(names[arg.index], function(nm) {
-      substr(nm, 5, nchar(nm))
-    }))
+    names(arg.params) <- substr(nms[arg.index], 5, nchar(nms[arg.index]))
   } else {
     arg.params <- list()
   }
-  if (length(aux.index) != 0) {
+  if (any(aux.index)) {
     aux.params <- save.dict[aux.index]
-    names(aux.params) <- as.character(lapply(names[aux.index], function(nm) {
-      substr(nm, 5, nchar(nm))
-    }))
+    names(aux.params) <- substr(nms[aux.index], 5, nchar(nms[aux.index]))
   } else {
     aux.params <- list()
   }
diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R
index 8f3ab8c258..78a125ed51 100644
--- a/R-package/R/model.rnn.R
+++ b/R-package/R/model.rnn.R
@@ -7,18 +7,18 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
   
   ndevice <- length(ctx)
   if (verbose) 
-    message(paste0("Start training with ", ndevice, " devices"))
+    message("Start training with ", ndevice, " devices")
   
   input.names <- names(dlist)
   arg.params.names <- names(arg.params)
   
   if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else sym_ini <- symbol
   
-  slices <- lapply(1:ndevice, function(i) {
-    sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F))
+  slices <- lapply(seq_len(ndevice), function(i) {
+    sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = FALSE))
   })
   
-  train.execs <- lapply(1:ndevice, function(i) {
+  train.execs <- lapply(seq_len(ndevice), function(i) {
     s <- slices[[i]]
     mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s, arg.params)[arg.update.idx], 
                            aux.arrays = aux.params, ctx = ctx[[i]], grad.req = grad.req)
@@ -27,7 +27,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
   # KVStore related stuffs
   params.index <- as.integer(
     mx.util.filter.null(
-      lapply(1:length(train.execs[[1]]$ref.grad.arrays), function(k) {
+      lapply(seq_along(train.execs[[1]]$ref.grad.arrays), function(k) {
         if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL}
       )))
   
@@ -36,7 +36,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
     update.on.kvstore <- TRUE
     kvstore$set.optimizer(optimizer)
   } else {
-    updaters <- lapply(1:ndevice, function(i) {
+    updaters <- lapply(seq_len(ndevice), function(i) {
       mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
     })
   }
@@ -58,20 +58,20 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
       dlist <- train.data$value()[input.names]
       
       # Slice inputs for multi-devices
-      slices <- lapply(1:ndevice, function(i) {
+      slices <- lapply(seq_len(ndevice), function(i) {
         sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F))
       })
       
       # Assign input to each executor - bug on inference if using BatchNorm
       if (is.list(symbol)) {
-        train.execs <- lapply(1:ndevice, function(i) {
+        train.execs <- lapply(seq_len(ndevice), function(i) {
           s <- slices[[i]]
           mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]], 
                                  arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
                                  aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
         })
       } else {
-        for (i in 1:ndevice) {
+        for (i in seq_len(ndevice)) {
           s <- slices[[i]]
           mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
         }
@@ -107,17 +107,17 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
             texec$ref.grad.arrays[params.index]
           }), -params.index)
         }
-        arg.blocks <- lapply(1:ndevice, function(i) {
+        arg.blocks <- lapply(seq_len(ndevice), function(i) {
           updaters[[i]](train.execs[[i]]$ref.arg.arrays, train.execs[[i]]$ref.grad.arrays)
         })
-        for (i in 1:ndevice) {
+        for (i in seq_len(ndevice)) {
           mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null = TRUE)
         }
       }
       
       # Update the evaluation metrics
       if (!is.null(metric)) {
-        for (i in 1:ndevice) {
+        for (i in seq_len(ndevice)) {
           train.metric <- metric$update(label = slices[[i]][[length(slices[[i]])]], 
                                         pred = out.preds[[i]], state = train.metric)
         }
@@ -133,7 +133,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
     if (!is.null(metric)) {
       result <- metric$get(train.metric)
       if (verbose) 
-        message(paste0("[", iteration, "] Train-", result$name, "=", result$value))
+        message("[", iteration, "] Train-", result$name, "=", result$value)
     }
     
     if (!is.null(eval.data)) {
@@ -147,20 +147,20 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
         dlist <- eval.data$value()[input.names]
         
         # Slice input to multiple devices
-        slices <- lapply(1:ndevice, function(i) {
-          sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F))
+        slices <- lapply(seq_len(ndevice), function(i) {
+          sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = FALSE))
         })
         
         # Assign input to each executor - bug on inference if using BatchNorm
         if (is.list(symbol)) {
-          train.execs <- lapply(1:ndevice, function(i) {
+          train.execs <- lapply(seq_len(ndevice), function(i) {
             s <- slices[[i]]
             mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]], 
                                    arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
                                    aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
           })
         } else {
-          for (i in 1:ndevice) {
+          for (i in seq_len(ndevice)) {
             s <- slices[[i]]
             mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
           }
@@ -176,7 +176,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
         })
         
         if (!is.null(metric)) {
-          for (i in 1:ndevice) {
+          for (i in seq_len(ndevice)) {
             eval.metric <- metric$update(slices[[i]][[length(slices[[i]])]], 
                                          out.preds[[i]], eval.metric)
           }
@@ -186,8 +186,8 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
       if (!is.null(metric)) {
         result <- metric$get(eval.metric)
         if (verbose) {
-          message(paste0("[", iteration, "] Validation-", result$name, "=", 
-                         result$value))
+          message("[", iteration, "] Validation-", result$name, "=", 
+                         result$value)
         }
       }
     } else {
@@ -266,7 +266,7 @@ mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric = NULL
     optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize), ...)
   }
   
-  if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else sym_ini <- symbol
+  sym_ini <- if (is.list(symbol)) symbol[[names(train.data$bucketID)]] else symbol
   
   arguments <- sym_ini$arguments
   input.names <- intersect(names(train.data$value()), arguments)
diff --git a/R-package/R/mx.io.bucket.iter.R b/R-package/R/mx.io.bucket.iter.R
index 8e5ab59eaa..22ac1fae6e 100644
--- a/R-package/R/mx.io.bucket.iter.R
+++ b/R-package/R/mx.io.bucket.iter.R
@@ -20,7 +20,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                               .self
                             }, reset = function() {
                               buckets_nb <- length(bucket.names)
-                              buckets_id <- 1:buckets_nb
+                              buckets_id <- seq_len(buckets_nb)
                               buckets.size <- sapply(.self$buckets, function(x) {
                                 dim(x$data)[length(dim(x$data)) - 1]
                               })
@@ -36,7 +36,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                               
                               if (.self$shuffle) {
                                 set.seed(.self$seed)
-                                bucket_plan_names <- sample(rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
+                                bucket_plan_names <- sample(rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
                                 .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
                                                          FUN = cumsum)
                                 names(.self$bucket.plan) <- bucket_plan_names
@@ -44,7 +44,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                                 .self$bucketID <- .self$bucket.plan[1]
                                 
                                 .self$buckets <- lapply(.self$buckets, function(x) {
-                                  shuffle_id <- sample(dim(x$data)[length(dim(x$data)) - 1])
+                                  shuffle_id <- sample.int(dim(x$data)[length(dim(x$data)) - 1])
                                   if (length(dim(x$label)) == 0) {
                                     list(data = x$data[shuffle_id, ], label = x$label[shuffle_id])
                                   } else {
@@ -52,7 +52,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                                   }
                                 })
                               } else {
-                                bucket_plan_names <- rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
+                                bucket_plan_names <- rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
                                 .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
                                                          FUN = cumsum)
                                 names(.self$bucket.plan) <- bucket_plan_names
@@ -60,29 +60,25 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                             }, iter.next = function() {
                               .self$batch <- .self$batch + 1
                               .self$bucketID <- .self$bucket.plan[batch]
-                              if (.self$batch > .self$batch.per.epoch) {
-                                return(FALSE)
-                              } else {
-                                return(TRUE)
-                              }
+                              return(.self$batch < .self$batch.per.epoch)
                             }, value = function() {
                               # bucketID is a named integer: the integer indicates the batch id for the given
                               # bucket (used to fetch appropriate samples within the bucket) the name is the a
                               # character containing the sequence length of the bucket (used to unroll the rnn
                               # to appropriate sequence length)
-                              idx <- (.self$bucketID - 1) * (.self$batch.size) + (1:batch.size)
+                              idx <- (.self$bucketID - 1) * (.self$batch.size) + seq_len(batch.size)
                               
                               ### reuse first idx for padding
                               if (bucketID == .self$batch.per.bucket[names(.self$bucketID)] & !.self$last.batch.pad[names(.self$bucketID)] == 0) {
-                                idx <- c(idx[1:(.self$batch.size - .self$last.batch.pad[names(.self$bucketID)])], 1:(.self$last.batch.pad[names(.self$bucketID)]))
+                                idx <- c(idx[seq_len(.self$batch.size - .self$last.batch.pad[names(.self$bucketID)])], seq_len(.self$last.batch.pad[names(.self$bucketID)]))
                               }
                               
-                              data <- .self$buckets[[names(.self$bucketID)]]$data[idx, , drop = F]
+                              data <- .self$buckets[[names(.self$bucketID)]]$data[idx, , drop = FALSE]
                               seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 1, sum)
                               if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) {
                                 label <- .self$buckets[[names(.self$bucketID)]]$label[idx]
                               } else {
-                                label <- .self$buckets[[names(.self$bucketID)]]$label[idx, , drop = F]
+                                label <- .self$buckets[[names(.self$bucketID)]]$label[idx, , drop = FALSE]
                               }
                               return(list(data = mx.nd.array(data), seq.mask = mx.nd.array(seq.mask), 
                                           label = mx.nd.array(label)))
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 52fc1f24e5..253f031ba4 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -396,22 +396,13 @@ mx.opt.adadelta <- function(rho=0.90,
 #'
 #' @export
 mx.opt.create <- function(name, ...) {
-  if (name == "sgd") {
-    return(mx.opt.sgd(...))
-  }
-  else if (name == "rmsprop") {
-    return (mx.opt.rmsprop(...))
-  }
-  else if (name == "adam") {
-    return (mx.opt.adam(...))
-  }
-  else if (name == "adagrad") {
-    return (mx.opt.adagrad(...))
-  }
-  else if (name == "adadelta") {
-    return (mx.opt.adadelta(...))
-  }
-  stop(paste("Unknown optimizer ", name))
+  switch(name,
+         "sgd" = mx.opt.sgd(...),
+         "rmsprop" = mx.opt.rmsprop(...),
+         "adam" = mx.opt.adam(...),
+         "adagrad" = mx.opt.adagrad(...),
+         "adadelta" = mx.opt.adadelta(...),
+         stop("Unknown optimizer ", name))
 }
 
 #' Get an updater closure that can take list of weight and gradient
@@ -422,16 +413,15 @@ mx.opt.create <- function(name, ...) {
 #'
 #' @export
 mx.opt.get.updater <- function(optimizer, weights) {
-  n <- length(weights)
   # This is the list to keep track of internal states of optimzer
-  state.list <- lapply(1:n, function(i) {
+  state.list <- lapply(seq_along(weights), function(i) {
     if (is.null(weights[[i]])) return(NULL)
     optimizer$create.state(i, weights[[i]])
   })
   update <- optimizer$update
 
   update.closure <- function(weight, grad) {
-    ulist <- lapply(1:n, function(i) {
+    ulist <- lapply(seq_along(weights), function(i) {
       if (!is.null(grad[[i]])) {
         update(i, weight[[i]], grad[[i]], state.list[[i]])
       } else {
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
index 2c099f0802..5197882000 100644
--- a/R-package/R/rnn.graph.R
+++ b/R-package/R/rnn.graph.R
@@ -21,8 +21,8 @@ rnn.graph <- function(num.rnn.layer,
                       loss_output = NULL, 
                       config,
                       cell.type,
-                      masking = F,
-                      output_last_state = F) {
+                      masking = FALSE,
+                      output_last_state = FALSE) {
   
   # define input arguments
   data <- mx.symbol.Variable("data")
@@ -48,17 +48,17 @@ rnn.graph <- function(num.rnn.layer,
   
   # RNN cells
   if (cell.type == "lstm") {
-    rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
     
   } else {
-    rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
   }
   
   # Decode
   if (config=="seq-to-one") {
     
-    if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
-      mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = F, name = "mask")
+    if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, name = "mask") else
+      mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = FALSE, name = "mask")
     
     decode <- mx.symbol.FullyConnected(data=mask,
                                        weight=cls.weight,
@@ -77,7 +77,7 @@ rnn.graph <- function(num.rnn.layer,
     
   } else if (config=="one-to-one"){
     
-    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
+    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, value = 0, name = "mask") else
       mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
     
     mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
@@ -120,7 +120,7 @@ lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, d
     gates <- i2h
   }
   
-  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, 
+  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = FALSE, 
                                  name = paste0("t", seqidx, ".l", layeridx, ".slice"))
   
   in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
@@ -157,7 +157,7 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
     gates <- i2h
   }
   
-  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, 
+  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = FALSE, 
                                  name = paste0("t", seqidx, ".l", layeridx, ".split"))
   
   update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
@@ -166,15 +166,11 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
   htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, 
                                          bias = param$trans.i2h.bias, num.hidden = num.hidden, 
                                          name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
-  
-  if (is.null(prev.state)) {
-    h.after.reset <- reset.gate * 0
-  } else {
-    h.after.reset <- prev.state$h * reset.gate
-  }
-  
-  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, 
-                                         bias = param$trans.h2h.bias, num.hidden = num.hidden, 
+
+  h.after.reset <- reset.gate * (if (is.null(prev.state)) 0 else prev.state$h)
+
+  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight,
+                                         bias = param$trans.h2h.bias, num.hidden = num.hidden,
                                          name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
   
   h.trans <- htrans.i2h + htrans.h2h
@@ -205,8 +201,8 @@ rnn.graph.unroll <- function(num.rnn.layer,
                              init.state = NULL,
                              config,
                              cell.type = "lstm", 
-                             masking = F, 
-                             output_last_state = F) {
+                             masking = FALSE, 
+                             output_last_state = FALSE) {
   
   
   if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
@@ -214,7 +210,7 @@ rnn.graph.unroll <- function(num.rnn.layer,
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
   
-  param.cells <- lapply(1:num.rnn.layer, function(i) {
+  param.cells <- lapply(seq_len(num.rnn.layer), function(i) {
     
     if (cell.type=="lstm"){
       cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
@@ -244,15 +240,15 @@ rnn.graph.unroll <- function(num.rnn.layer,
                                 weight=embed.weight, output_dim = num.embed, name = "embed")
   }
   
-  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = T)
+  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = TRUE)
   
   last.hidden <- list()
   last.states <- list()
   
-  for (seqidx in 1:seq.len) {
+  for (seqidx in seq_len(seq.len)) {
     hidden <- data[[seqidx]]
     
-    for (i in 1:num.rnn.layer) {
+    for (i in seq_len(num.rnn.layer)) {
       
       if (seqidx==1) prev.state<- init.state[[i]] else prev.state <- last.states[[i]]
       
diff --git a/R-package/R/rnn.infer.R b/R-package/R/rnn.infer.R
index c9ccecbddb..a22bae0eb3 100644
--- a/R-package/R/rnn.infer.R
+++ b/R-package/R/rnn.infer.R
@@ -52,7 +52,7 @@ mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
   arg.params.fix <- arguments.ini[arg.params.fix.names]
   
   # Grad request
-  grad.req <- rep("null", length(arguments))
+  grad.req <- rep.int("null", length(arguments))
   
   # Arg array order
   update_names <- c(input.names, arg.params.fix.names, arg.params.names)
@@ -138,7 +138,7 @@ mx.infer.buckets.one <- function(infer.data,
   aux.params <- aux.params
   
   # Grad request
-  grad.req <- rep("null", length(arguments))
+  grad.req <- rep.int("null", length(arguments))
   
   # Arg array order
   update_names <- c(input.names, arg.params.fix.names, arg.params.names)
diff --git a/R-package/R/util.R b/R-package/R/util.R
index acc9510ccf..8eddb5da6b 100644
--- a/R-package/R/util.R
+++ b/R-package/R/util.R
@@ -1,6 +1,6 @@
 # filter out null, keep the names
 mx.util.filter.null <- function(lst) {
-  lst[!sapply(lst, is.null)]
+  Filter(Negate(is.null), lst)
 }
 
 #' Internal function to generate mxnet_generated.R
diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R
index 6d13de0af1..49f978a2cd 100644
--- a/R-package/R/viz.graph.R
+++ b/R-package/R/viz.graph.R
@@ -65,12 +65,14 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
   
   model_list<- fromJSON(symbol$as.json())
   model_nodes<- model_list$nodes
-  model_nodes$id<- 1:nrow(model_nodes)-1
+  model_nodes$id<- seq_len(nrow(model_nodes))-1
   model_nodes$level<- model_nodes$ID
   
   # extract IDs from string list
-  tuple_str <- function(str) sapply(str_extract_all(str, "\\d+"), function(x) paste0(x, collapse="X"))
-  
+  tuple_str <- function(str) vapply(str_extract_all(str, "\\d+"),
+                                    function(x) paste0(x, collapse="X"),
+                                    character(1))
+
   ### substitute op for heads
   op_id<- sort(unique(model_list$heads[1,]+1))
   op_null<- which(model_nodes$op=="null")
@@ -104,23 +106,23 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
   
   ### remapping for DiagrammeR convention
   nodes_df$id<- nodes_df$id
-  nodes_df$id_graph<- 1:nrow(nodes_df)
+  nodes_df$id_graph<- seq_len(nrow(nodes_df))
   id_dic<- nodes_df$id_graph
   names(id_dic)<- as.character(nodes_df$id)
   
-  edges_id<- model_nodes$id[!sapply(model_nodes$inputs, length)==0 & !model_nodes$op=="null"]
+  edges_id<- model_nodes$id[lengths(model_nodes$inputs)!=0 & model_nodes$op!="null"]
   edges_id<- id_dic[as.character(edges_id)]
-  edges<- model_nodes$inputs[!sapply(model_nodes$inputs, length)==0 & !model_nodes$op=="null"]
-  edges<- sapply(edges, function(x)intersect(as.numeric(x[, 1]), id.to.keep), simplify = F)
+  edges<- model_nodes$inputs[lengths(model_nodes$inputs)!=0 & model_nodes$op!="null"]
+  edges<- sapply(edges, function(x)intersect(as.numeric(x[, 1]), id.to.keep), simplify = FALSE)
   names(edges)<- edges_id
   
   edges_df<- data.frame(
     from=unlist(edges),
-    to=rep(names(edges), time=sapply(edges, length)),
+    to=rep(names(edges), time=lengths(edges)),
     arrows = "to",
     color="black",
     from_name_output=paste0(model_nodes$name[unlist(edges)+1], "_output"), 
-    stringsAsFactors=F)
+    stringsAsFactors=FALSE)
   edges_df$from<- id_dic[as.character(edges_df$from)]
   
   nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", 
@@ -133,14 +135,14 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
     } else edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes
     if (!is.null(edges_labels_raw)){
       edge_label_str <- function(x) paste0(x, collapse="X")
-      edges_labels_raw<- sapply(edges_labels_raw, edge_label_str)
+      edges_labels_raw<- vapply(edges_labels_raw, edge_label_str, character(1))
       names(edges_labels_raw)[names(edges_labels_raw)=="data"]<- "data_output"
       edge_df_new$label<- edges_labels_raw[edges_df$from_name_output]
       edge_df_new$rel<- edge_df_new$label
     }
   }
   
-  graph<- create_graph(nodes_df = nodes_df_new, edges_df = edge_df_new, directed = T) %>% 
+  graph<- create_graph(nodes_df = nodes_df_new, edges_df = edge_df_new, directed = TRUE) %>% 
     set_global_graph_attrs("layout", value = "dot", attr_type = "graph") %>% 
     add_global_graph_attrs("rankdir", value = direction, attr_type = "graph")
   
diff --git a/R-package/README.md b/R-package/README.md
index c39b2b101d..e21d6b17dc 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -2,10 +2,10 @@
 ==========================
 
 You have found MXNet R Package! The MXNet R packages brings flexible and efficient GPU
-computing and state-of-art deep learning to R.
+computing and state-of-the-art deep learning to R.
 
 - It enables you to write seamless tensor/matrix computation with multiple GPUs in R.
-- It also enables you to construct and customize state-of-art deep learning models in R,
+- It also enables you to construct and customize state-of-the-art deep learning models in R,
   and apply them to tasks such as image classification and data science challenges.
 
 Sounds exciting? This page contains links to all the related documentation of the R package.
diff --git a/README.md b/README.md
index fc252a7a72..0326412541 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers.
 
 What's New
 ----------
+* [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release.
 * [Version 0.12.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.0) - MXNet 0.12.0 Release.
 * [Version 0.11.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.11.0) - MXNet 0.11.0 Release.
 * [Apache Incubator](http://incubator.apache.org/projects/mxnet.html) - We are now an Apache Incubator project.
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 2aba8f4bdc..9419898135 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -17,6 +17,7 @@
 
 import sys
 import os.path, re, StringIO
+import platform
 
 blacklist = [
     'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh',
@@ -26,7 +27,8 @@
     'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h',
     'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h',
     'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
-    'cusolverDn.h'
+    'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp',
+    'relacy_shims.h'
     ]
 
 minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0
@@ -36,6 +38,13 @@
 if minimum != 0:
     blacklist.append('linalg.h')
 
+if platform.system() != 'Darwin':
+  blacklist.append('TargetConditionals.h')
+
+if platform.system() != 'Windows':
+  blacklist.append('windows.h')
+  blacklist.append('process.h')
+
 def pprint(lst):
     for item in lst:
         print item
diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h
index 4aebb55c50..f3763bbd6e 100644
--- a/cpp-package/include/mxnet-cpp/optimizer.h
+++ b/cpp-package/include/mxnet-cpp/optimizer.h
@@ -130,10 +130,8 @@ class OptimizerRegistry {
   OptimizerRegistry() = delete;
   ~OptimizerRegistry() = delete;
 };
-
-#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType)          \
-  static int __make_ ## OptimizerType ## _ ## Name ## __ = \
-       OptimizerRegistry::__REGISTER__(#Name, [](){return new OptimizerType();})
+#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType)\
+  OptimizerRegistry::__REGISTER__(#Name, [](){return new OptimizerType();})
 
 class SGDOptimizer : public Optimizer {
  public:
diff --git a/dmlc-core b/dmlc-core
index 87b7ffa59e..2a61609363 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 87b7ffa59eb78f753073ac56f5f60e46d930b93c
+Subproject commit 2a61609363b07a82c9f128d5fc73c0fd3a2bad54
diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh
index 2d33bd72c4..3162acd85a 100755
--- a/docs/build_version_doc/build_all_version.sh
+++ b/docs/build_version_doc/build_all_version.sh
@@ -21,7 +21,7 @@
 # Built files are stored in $built
 # Version numbers are stored in $tag_list.
 # Version numbers are ordered from latest to old and final one is master.
-tag_list="0.12.0 0.11.0 master"
+tag_list="0.12.1 0.12.0 0.11.0 master"
 
 mxnet_url="https://github.com/apache/incubator-mxnet.git"
 mxnet_folder="apache_mxnet"
diff --git a/docs/faq/gradient_compression.md b/docs/faq/gradient_compression.md
new file mode 100644
index 0000000000..4cd58f05d5
--- /dev/null
+++ b/docs/faq/gradient_compression.md
@@ -0,0 +1,107 @@
+# Gradient Compression
+
+Gradient Compression reduces communication bandwidth, and in some scenarios, it can make training more scalable and efficient without significant loss in convergence rate or accuracy. Example implementations with GPUs, CPUs, and distributed training are provided in this document. 
+
+
+## Benefits
+
+**Increased Speed**
+
+For architectures with fully connected layers, the gradient compression capability is observed to speedup training by about 2x, depending on the size of the model and the network bandwidth of the instance. Bigger models see larger speedup with gradient compression.
+
+**Minimal Accuracy Loss**
+
+Gradient compression uses the approach of delaying the synchronization of weight updates which are small. Although small weight updates might not be sent for that batch, this information is not discarded. Once the weight updates for this location accumulate to become a larger value, they will be propagated. Since there is no information loss, but only delayed updates, it does not lead to a significant loss in accuracy or convergence rate. In distributed training experiments[1], the accuracy loss observed due to gradient compression was as low as 1%.
+
+
+## When to Use Gradient Compression
+
+When training models whose architectures include large fully connected components, it can be helpful to use gradient compression. For larger models, as well as recurrent neural networks, the communication cost becomes a major factor. Such models stand to benefit greatly with gradient compression.
+
+
+### GPU versus CPU
+
+The greatest benefits from gradient compression are realized when using multi-node (single or multi-GPU) distributed training. Training on CPU would provide a lower compute density per compute node as compared to the massive compute density per compute node on a GPU. Due to this, the required communication bandwidth for CPU-based nodes during training is not as high as for GPU-based nodes. Hence, the benefits of gradient compression are lower for CPU-based nodes as compared to GPU-based nodes.
+
+
+### Network Latency
+
+Benefits of gradient compression can be found when using distributed training with network connected nodes. Depending on the network latency between nodes and the model's size, these can contribute to slow performance such that gradient compression may provide speed improvements.
+
+You may not want to use gradient compression if you have low latency network communication.
+
+
+### Model Size
+
+Distributed training involves synchronization of weights after each batch. Larger models have much higher communication costs during training, hence such models stand to benefit much more from gradient compression.
+When running distributed training with gradient compression, the quantize and dequantize operations happen on CPU parallelized with OpenMP. For smaller models, when training on GPUs, it helps to set `OMP_NUM_THREADS=1` on each node, so that the overhead of launching OMP threads doesn't cause the compression and decompression to be slow.
+
+### Model Architecture
+
+The communication bandwidth requirements during training vary across various neural network architectures and hence the benefits of gradient compression vary accordingly.
+
+In networks which have significant fully connected components, since such layers have low compute cost on GPUs, communication becomes a bottleneck limiting the speed of distributed training. Gradient compression can help reduce the communication cost, and thus speed up training in such cases. We have observed speedup of about 2x on large fully connected neural networks. Models like AlexNet and VGG have large fully connected components as part of the network, hence stand to benefit from gradient compression. As with these models, Long Short-Term Memory architectures require more communication bandwidth, so they also exhibit speed improvements with gradient compression.
+
+Architectures like Convolutional Neural Networks on the other hand have a higher compute cost, in which case some communication can be parallelized with computation. Since communication is not the bottleneck in such networks, gradient compression doesn't help much.
+
+
+### Single Node Gradient Compression
+
+When the training is configured to use device to device communication on a single node with multiple GPUs, gradient compression can be used to reduce the cost of communication. This can provide about 20% speedup for large models using older generation architectures. However, speed benefits may be negligible on a machine with a newer generation architecture where GPUs can communicate at low latency.
+
+
+## Approach
+
+The idea behind gradient compression comes from two observations:
+
+First, when training large neural networks, the gradients of weights computed for a small mini-batch of training data are typically sparse. Only a small fraction of the weights have significant updates after each mini-batch. The synchronization of updates that are near zero can be safely delayed longer than the typical mini-batch size. This essentially means that the rate of weight-update can vary depending on the value of an individual weight.
+
+Secondly, gradients can be compressed significantly by considering only those gradient elements whose absolute values exceed a threshold, and then quantizing them to use lower bits per gradient value. By compressing the gradients, we can reduce communication bandwidth. The delayed gradient values, in the form of quantization error and values that don't meet the threshold, are aggregated into a gradient residual which is communicated when it reaches the threshold.
+
+## Technical Implementation
+
+### Two Bit Quantization
+
+Currently the supported type of quantization uses two bits for each gradient value. Any positive value greater than or equal to the threshold sets two bits as `11`, any negative value whose absolute value is greater or equal to the threshold sets two bits as `10`, and others are set to `00`. This enables us to store 16 quantized gradients as one float. The error in quantization, which is `original_value - quantized_value`, is stored in the form of a gradient residual.
+
+### Types of Kvstore
+
+Supported types of `kvstore` are `device` and all distributed kvstores such as `dist_sync`, `dist_async`, and `dist_sync_device`. When `kvstore` is `device`, the communication between GPUs is compressed. Please note that this increases the memory usage of GPUs because of the additional residual stored. When using a distributed kvstore, worker-to-server communication is compressed. In this case, compression and decompression happen on the CPU, and gradient residuals will be stored on the CPU. Server-to-worker communication and device-to-device communication are not compressed to avoid multiple levels of compression.
+
+## Enabling the Gradient Compression in MXNet
+
+Gradient compression is a run-time configuration parameter to be enabled during training. Here are the MXNet APIs to enable gradient compression:
+
+**Gluon API**:
+
+```
+trainer = gluon.Trainer(..., compression_params={'type':'2bit', 'threshold':0.5})
+```
+A reference `gluon` implementation with a gradient compression option can be found in the [train.py script from a word-level language modeling RNN example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/word_language_model/train.py).
+
+**Module API**:
+
+```
+mod = mx.mod.Module(..., compression_params={'type':'2bit', 'threshold':0.5})
+```
+
+A `module` example is provided with [this guide for setting up MXNet with distributed training](https://mxnet.incubator.apache.org/versions/master/how_to/multi_devices.html#distributed-training-with-multiple-machines). It comes with the option of turning on gradient compression as an argument to the [train_mnist.py script](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/train_mnist.py).
+
+### Configuration Details
+
+**Threshold**
+
+A default `threshold` value of `0.5` is good for most use cases, but to get the most benefit from gradient compression for a particular scenario, it can be beneficial to experiment. If the threshold is set to a very large value, say `10.0`, then the updates become too infrequent and the training will converge slower. Setting the threshold automatically is expected in a future release.
+
+**Quantization**
+
+This release supports 2-bit quantization for encoding of gradients to reduce the communication bandwidth during training. Future releases will support 1-bit quantization and other approaches for encoding of gradients based on experimental evidence of benefits and user demand.
+
+**Sparse Format**
+
+We believe that the density of data will need to be very low (i.e. more than 90% zeros) to reap benefits of the sparse format. However, this is an area of experimentation that will be explored in a future release.
+
+
+## References
+
+1. [Nikko Storm, Amazon.com, Scalable Distributed Training using commodity GPU cloud computing.](https://s3-us-west-2.amazonaws.com/amazon.jobs-public-documents/strom_interspeech2015.pdf)
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 883d8e6188..e5807f42fc 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -14,12 +14,15 @@ and full working examples, visit the [tutorials section](../tutorials/index.md).
 * [How do I visualize neural networks as computation graphs?](http://mxnet.io/how_to/visualize_graph.html)
 
 
-## Speed
-
+## Scale
 * [How can I train with multiple CPU/GPUs with data parallelism?](http://mxnet.io/how_to/multi_devices.html)
 
 * [How can I train with multiple GPUs with model parallelism?](http://mxnet.io/how_to/model_parallel_lstm.html)
 
+
+## Speed
+* [How do I use gradient compression with distributed training?](http://mxnet.io/how_to/gradient_compression.html)
+
 * [Can I use nnpack to improve the CPU performance of MXNet?](http://mxnet.io/how_to/nnpack.html)
 
 * [What are the best setup and data-handling tips and tricks for improving speed?](http://mxnet.io/how_to/perf.html)
@@ -55,8 +58,6 @@ and full working examples, visit the [tutorials section](../tutorials/index.md).
 
 * [How do I set MXNet's environmental variables?](http://mxnet.io/how_to/env_var.html)
 
-* [How do I use MXNet as a front end for Torch?](http://mxnet.io/how_to/torch.html)
-
 ## Questions about Using MXNet
 If you need help with using MXNet, have questions about applying it to a particular kind of problem, or have a discussion topic, please use our [forum](https://discuss.mxnet.io).
 
diff --git a/docs/faq/multi_devices.md b/docs/faq/multi_devices.md
index 3272062243..c79d1f80be 100644
--- a/docs/faq/multi_devices.md
+++ b/docs/faq/multi_devices.md
@@ -167,6 +167,19 @@ python ../../tools/launch.py -n 2 -H hosts --sync-dst-dir /tmp/mxnet \
    python train_mnist.py --network lenet --kv-store dist_sync
 ```
 
+
+### Gradient compression
+
+If your model has fully connected components or recurrent neural networks, you may achieve increased training speed using gradient compression with potentially slight loss of accuracy. Please see [Gradient Compression](https://mxnet.incubator.apache.org/versions/master/faq/gradient_compression.html) for more details on when and how to use it. For the above example, gradient compression can be enabled by running the following:
+
+```bash
+python ../../tools/launch.py -n 2 --launcher ssh -H hosts python train_mnist.py --network lenet \
+    --kv-store dist_sync --gc-type 2bit
+```
+
+In this example, `gc-type` has been set to `2bit`, to enable two bit gradient compression.
+
+
 ### Use a Particular Network Interface
 
 _MXNet_ often chooses the first available network interface.
diff --git a/docs/faq/torch.md b/docs/faq/torch.md
deleted file mode 100644
index 26def878c2..0000000000
--- a/docs/faq/torch.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# How to Use MXNet As an (Almost) Full-function Torch Front End
-
-This topic demonstrates how to use MXNet as a front end to two of Torch's major functionalities:
-
-* Call Torch's tensor mathematical functions with MXNet.NDArray 
-
-* Embed Torch's neural network modules (layers) into MXNet's symbolic graph 
-## Compile with Torch
-
-
-* Install Torch using the [official guide](http://torch.ch/docs/getting-started.html).
-	* If you haven't already done so, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into the MXNet root folder as `config.mk`. In `config.mk` uncomment the lines `TORCH_PATH = $(HOME)/torch` and `MXNET_PLUGINS += plugin/torch/torch.mk`.
-    * By default, Torch should be installed in your home folder (so `TORCH_PATH = $(HOME)/torch`). Modify TORCH_PATH to point to your torch installation, if necessary. 
-* Run `make clean && make` to build MXNet with Torch support.
-
-## Tensor Mathematics
-The mxnet.th module supports calling Torch's tensor mathematical functions with mxnet.nd.NDArray. See [complete code](https://github.com/dmlc/mxnet/blob/master/example/torch/torch_function.py):
-
- ```Python
-    import mxnet as mx
-    x = mx.th.randn(2, 2, ctx=mx.cpu(0))
-    print x.asnumpy()
-    y = mx.th.abs(x)
-    print y.asnumpy()
-
-    x = mx.th.randn(2, 2, ctx=mx.cpu(0))
-    print x.asnumpy()
-    mx.th.abs(x, x) # in-place
-    print x.asnumpy()
- ```
-For help, use the `help(mx.th)` command. 
-
-We've added support for most common functions listed on [Torch's documentation page](https://github.com/torch/torch7/blob/master/doc/maths.md). 
-If you find that the function you need is not supported, you can easily register it in `mxnet_root/plugin/torch/torch_function.cc` by using the existing registrations as examples.
-
-## Torch Modules (Layers)
-MXNet supports Torch's neural network modules through  the`mxnet.symbol.TorchModule` symbol.
-For example, the following code defines a three-layer DNN for classifying MNIST digits ([full code](https://github.com/dmlc/mxnet/blob/master/example/torch/torch_module.py)):
-
- ```Python
-    data = mx.symbol.Variable('data')
-    fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')
-    act1 = mx.symbol.TorchModule(data_0=fc1, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu1')
-    fc2 = mx.symbol.TorchModule(data_0=act1, lua_string='nn.Linear(128, 64)', num_data=1, num_params=2, num_outputs=1, name='fc2')
-    act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
-    fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
-    mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
- ```
-Let's break it down. First `data = mx.symbol.Variable('data')` defines a Variable as a placeholder for input.
-Then, it's fed through Torch's nn modules with:
-     `fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')`.
-To use Torch's criterion as loss functions, you can replace the last line with:
- ```Python
-    logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
-    # Torch's label starts from 1
-    label = mx.symbol.Variable('softmax_label') + 1
-    mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
- ```
-The input to the nn module is named data_i for i = 0 ... num_data-1. `lua_string` is a single Lua statement that creates the module object.
-For Torch's built-in module, this is simply `nn.module_name(arguments)`.
-If you are using a custom module, place it in a .lua script file and load it with `require 'module_file.lua'` if your script returns a torch.nn object, or `(require 'module_file.lua')()` if your script returns a torch.nn class.
-
diff --git a/docs/install/build_from_source.md b/docs/install/build_from_source.md
index 82baa1bb02..4f7083a824 100644
--- a/docs/install/build_from_source.md
+++ b/docs/install/build_from_source.md
@@ -317,6 +317,43 @@ Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build
 In Visual Studio, open the solution file,```.sln```, and compile it.
 These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
 
+</div>
+
+<div class="linux ubuntu">
+
+## Build MXNet using NCCL
+- Download and install the latest NCCL library from NVIDIA.
+- Note the directory path in which NCCL libraries and header files are installed.
+- Ensure that the installation directory contains ```lib``` and ```include``` folders.
+- Ensure that the prerequisites for using NCCL such as Cuda libraries are met. 
+- Append the ```config.mk``` file with following, in addition to the CUDA related options.
+- USE_NCCL=1
+- USE_NCCL_PATH=path-to-nccl-installation-folder
+``` bash
+echo "USE_NCCL=1" >> make/config.mk
+echo "USE_NCCL_PATH=path-to-nccl-installation-folder" >> make/config.mk
+cp make/config.mk .
+```
+- Run make command
+``` bash
+make -j"$(nproc)"
+```
+
+## Validation
+- Follow the steps to install MXNet Python binding.
+- Comment the following line in ```test_nccl.py``` file at ```incubator-mxnet/tests/python/gpu/test_nccl.py```
+``` bash
+@unittest.skip("Test requires NCCL library installed and enabled during build")
+```
+- Run test_nccl.py script as follows. The test should complete. It does not produce any output.
+``` bash
+nosetests --verbose tests/python/gpu/test_nccl.py
+```
+
+## Recommendation for best performance
+It is recommended to set environment variable NCCL_LAUNCH_MODE to PARALLEL when using NCCL version 2.1 or newer.
+
+
 </div>
 
 ## Build the C++ package
diff --git a/docs/install/index.md b/docs/install/index.md
index 24d6aeeea1..52dfaa84a4 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -229,7 +229,7 @@ $ sudo apt-get install -y libopencv-dev
 **Step 4** Download MXNet sources and build MXNet core shared library.
 
 ```bash
-$ git clone --recursive https://github.com/apache/incubator-mxnet 
+$ git clone --recursive https://github.com/apache/incubator-mxnet
 $ cd incubator-mxnet
 $ make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas
 ```
@@ -284,8 +284,8 @@ The following installation instructions have been tested on Ubuntu 14.04 and 16.
 
 Install the following NVIDIA libraries to setup *MXNet* with GPU support:
 
-1. Install CUDA 8.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
-2. Install cuDNN 5 for CUDA 8.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
+1. Install CUDA 9.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+2. Install cuDNN 7 for CUDA 9.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
 
 **Note:** Make sure to add CUDA install path to `LD_LIBRARY_PATH`.
 
@@ -304,10 +304,10 @@ $ sudo apt-get install -y wget python
 $ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
 ```
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 8.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
 
 ```bash
-$ pip install mxnet-cu80
+$ pip install mxnet-cu90
 ```
 
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -320,7 +320,7 @@ pip install graphviz
 
 **Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
 ```bash
-$ pip install mxnet-cu80mkl
+$ pip install mxnet-cu90mkl
 ```
 
 </div>
@@ -364,10 +364,10 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 (mxnet)$ pip install --upgrade pip
 ```
 
-Install *MXNet* with GPU support using CUDA 8.0.
+Install *MXNet* with GPU support using CUDA 9.0.
 
 ```bash
-(mxnet)$ pip install mxnet-cu80
+(mxnet)$ pip install mxnet-cu90
 ```
 
 **Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
@@ -692,7 +692,7 @@ $ bash install-mxnet-osx-python.sh
 More details and verified installation instructions for macOS, with GPUs, coming soon.
 
 
-*MXNet* is expected to be compatible on macOS with NVIDIA GPUs. Please install CUDA 8.0 and cuDNN 5.0, prior to installing GPU version of *MXNet*.
+*MXNet* is expected to be compatible on macOS with NVIDIA GPUs. Please install CUDA 9.0 and cuDNN 7, prior to installing GPU version of *MXNet*.
 
 </div>
 </div>
@@ -704,11 +704,9 @@ More details and verified installation instructions for macOS, with GPUs, coming
 
 <div class="cloud">
 
-AWS Marketplace distributes AMIs (Amazon Machine Image) with MXNet pre-installed. You can launch an Amazon EC2 instance with one of the below AMIs:
-1. Deep Learning AMI (Amazon Machine Image) for [Ubuntu](https://aws.amazon.com/marketplace/pp/B06VSPXKDX)
-2. Deep Learning AMI for [Amazon Linux](https://aws.amazon.com/marketplace/pp/B01M0AXXQB)
+AWS Marketplace distributes Deep Learning AMIs (Amazon Machine Image) with MXNet pre-installed. You can launch one of these Deep Learning AMIs by following instructions in the [AWS Deep Learning AMI Developer Guide](http://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html).
 
-You could also run distributed deeplearning with *MXNet* on AWS using [Cloudformation Template](https://github.com/awslabs/deeplearning-cfn/blob/master/README.md).
+You can also run distributed deep learning with *MXNet* on AWS using [Cloudformation Template](https://github.com/awslabs/deeplearning-cfn/blob/master/README.md).
 
 </div>
 
@@ -814,8 +812,8 @@ The following installation instructions have been tested on Ubuntu 14.04 and 16.
 
 Install the following NVIDIA libraries to setup *MXNet* with GPU support:
 
-1. Install CUDA 8.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
-2. Install cuDNN 5 for CUDA 8.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
+1. Install CUDA 9.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+2. Install cuDNN 7 for CUDA 9.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
 
 **Note:** Make sure to add CUDA install path to `LD_LIBRARY_PATH`.
 
@@ -1077,7 +1075,7 @@ Clone the MXNet source code repository using the following ```git``` command in
 Edit the Makefile to install the MXNet with CUDA bindings to leverage the GPU on the Jetson:
 ```bash
     cp make/config.mk .
-    echo "USE_CUDA=1" >> config.mk    
+    echo "USE_CUDA=1" >> config.mk
     echo "USE_CUDA_PATH=/usr/local/cuda" >> config.mk
     echo "USE_CUDNN=1" >> config.mk
 ```
@@ -1110,7 +1108,7 @@ Add the mxnet folder to the path:
 
 ```bash
     cd ..
-    export MXNET_HOME=$(pwd)                       
+    export MXNET_HOME=$(pwd)
     echo "export PYTHONPATH=$MXNET_HOME/python:$PYTHONPATH" >> ~/.bashrc
     source ~/.bashrc
 ```
@@ -1458,15 +1456,13 @@ Will be available soon.
   </div>
     <div class="gpu">
 
-The following installation instructions have been tested on Ubuntu 14.04 and 16.04.
-
 
 **Prerequisites**
 
 Install the following NVIDIA libraries to setup *MXNet* with GPU support:
 
-1. Install CUDA 8.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows).
-2. Install cuDNN 7 for CUDA 8.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
+1. Install CUDA 9.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows).
+2. Install cuDNN 7 for CUDA 9.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
 
 **Note:** Make sure to add CUDA install path to `PATH`.
 
@@ -1477,10 +1473,10 @@ Install the following NVIDIA libraries to setup *MXNet* with GPU support:
 
 Recommend install ```Anaconda3``` [here](https://www.anaconda.com/download/)
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 8.0
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
 
 ```bash
-$ pip install mxnet-cu80
+$ pip install mxnet-cu90
 ```
 
 </div>
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md
index bf1673ac87..e5e92a7303 100755
--- a/docs/install/windows_setup.md
+++ b/docs/install/windows_setup.md
@@ -25,16 +25,28 @@ To build and install MXNet yourself, you need the following dependencies. Instal
 2. Download and Install [CMake](https://cmake.org/) if it is not already installed.
 3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
 4. Unzip the OpenCV package.
-5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory```.
-6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](http://sourceforge.net/projects/openblas/files/v0.2.14/).
-7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```.
-8. Download and install [CuDNN](https://developer.nvidia.com/cudnn). To get access to the download link, register as an NVIDIA community user.
+5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```c:\utils\opencv\build``` for example).
+6. If you have Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. Typically, you can find the directory in
+```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```.
+7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](http://sourceforge.net/projects/openblas/files/v0.2.14/).
+8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```.
+9. Download and install [CuDNN](https://developer.nvidia.com/cudnn). To get access to the download link, register as an NVIDIA community user.
 
 After you have installed all of the required dependencies, build the MXNet source code:
 
-1. Download the MXNet source code from [GitHub](https://github.com/dmlc/mxnet).
-2. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build```.
-3. In Visual Studio, open the solution file,```.sln```, and compile it.
+1. Download the MXNet source code from [GitHub](https://github.com/dmlc/mxnet). Don't forget to pull the submodules:
+```
+    git clone https://github.com/apache/incubator-mxnet.git ~/mxnet --recursive
+```
+2. Start a Visual Studio command prompt.
+3. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the 
+[CMake](https://cmake.org/) command:
+```
+    mkdir build
+    cd build
+    cmake -G "Visual Studio 14 Win64" ..
+```
+4. In Visual Studio, open the solution file,```.sln```, and compile it.
 These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
 
 
diff --git a/docs/tutorials/basic/image_io.md b/docs/tutorials/basic/image_io.md
index b017c9fb14..e6434257b7 100644
--- a/docs/tutorials/basic/image_io.md
+++ b/docs/tutorials/basic/image_io.md
@@ -7,9 +7,9 @@ iterators to process image data.
 
 There are mainly three ways of loading image data in MXNet:
 
-- [NEW] `mx.img.ImageIter`: implemented in python, easily customizable, can load
+- [NEW] [mx.img.ImageIter](https://mxnet.incubator.apache.org/versions/master/api/python/image/image.html#mxnet.image.ImageIter): implemented in python, easily customizable, can load
   from both .rec files and raw image files.
-- [OLD] `mx.io.ImageRecordIter`: implemented in backend (C++), less customizable
+- [OLD] [mx.io.ImageRecordIter](https://mxnet.incubator.apache.org/versions/master/api/python/io.html#mxnet.io.ImageRecordIter): implemented in backend (C++), less customizable
   but can be used in all language bindings, load from .rec files
 - Custom iterator by inheriting mx.io.DataIter
 
@@ -17,7 +17,7 @@ First, we explain the record io file format used by mxnet:
 
 ## RecordIO
 
-Record IO is the main file format used by MXNet for data IO. It supports reading
+[Record IO](https://mxnet.incubator.apache.org/architecture/note_data_loading.html#data-format) is the main file format used by MXNet for data IO. It supports reading
 and writing on various file systems including distributed file systems like
 Hadoop HDFS and AWS S3.  First, we download the Caltech 101 dataset that
 contains 101 classes of objects and convert them into record io format:
@@ -34,7 +34,7 @@ import matplotlib.pyplot as plt
 MXNET_HOME = '/scratch/mxnet'
 ```
 
-Download and unzip:
+Download and unzip the dataset. The dataset is about ~126MB and may take some time:
 
 ```python
 os.system('wget http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz -P data/')
@@ -43,15 +43,18 @@ os.system('tar -xf 101_ObjectCategories.tar.gz')
 os.chdir('../')
 ```
 
-Let's take a look at the data. As you can see, under the
-[root folder](./data/101_ObjectCategories) every category has a
-[subfolder](./data/101_ObjectCategories/yin_yang).
+Let's take a look at the data. 
+
+As you can see, under the
+root folder (data/101_ObjectCategories) every category has a
+subfolder (e.g. data/101_ObjectCategories/yin_yang).
 
 Now let's convert them into record io format. First we need to make a list that
 contains all the image files and their categories:
 
 
 ```python
+assert(MXNET_HOME != '/scratch/mxnet'), "Please update your MXNet location"
 os.system('python %s/tools/im2rec.py --list=1 --recursive=1 --shuffle=1 --test-ratio=0.2 data/caltech data/101_ObjectCategories'%MXNET_HOME)
 ```
 
@@ -66,7 +69,7 @@ Then we can use this list to create our record io file:
 os.system("python %s/tools/im2rec.py --num-thread=4 --pass-through=1 data/caltech data/101_ObjectCategories"%MXNET_HOME)
 ```
 
-The record io files are now saved at [here](./data)
+The record io files are now saved in the "data" directory.
 
 ## ImageRecordIter
 
diff --git a/docs/tutorials/basic/ndarray_indexing.md b/docs/tutorials/basic/ndarray_indexing.md
new file mode 100644
index 0000000000..37168b3401
--- /dev/null
+++ b/docs/tutorials/basic/ndarray_indexing.md
@@ -0,0 +1,377 @@
+
+# NDArray Indexing - Array indexing features
+
+MXNet's advanced indexing features are modeled after [NumPy's implementation and documentation](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#combining-advanced-and-basic-indexing). You will see direct adaptations of many NumPy indexing features and examples which are close, if not identical, so we borrow much from their documentation.
+
+`NDArray`s can be indexed using the standard Python `x[obj]` syntax, where _x_ is the array and _obj_ the selection.
+
+There are two kinds of indexing available:
+
+1. basic slicing
+1. advanced indexing
+
+In MXNet, we support both basic and advanced indexing following the convention of indexing NumPy's `ndarray`.
+
+
+## Basic Slicing and Indexing
+
+Basic slicing extends Python's basic concept of slicing to N dimensions. For a quick review:
+
+```
+a[start:end] # items start through end-1
+a[start:]    # items start through the rest of the array
+a[:end]      # items from the beginning through end-1
+a[:]         # a copy of the whole array
+```
+
+
+```python
+from mxnet import nd
+```
+
+For some working examples of basic slicing we'll start simple.
+
+
+```python
+x = nd.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')
+x[5:]
+```
+
+
+
+
+
+    [5 6 7 8 9]
+    <NDArray 5 @cpu(0)>
+
+
+
+
+```python
+x = nd.array([0, 1, 2, 3])
+print('1D complete array, x=', x)
+s = x[1:3]
+print('slicing the 2nd and 3rd elements, s=', s)
+```
+
+    1D complete array, x=
+    [ 0.  1.  2.  3.]
+    <NDArray 4 @cpu(0)>
+    slicing the 2nd and 3rd elements, s=
+    [ 1.  2.]
+    <NDArray 2 @cpu(0)>
+
+
+Now let's try slicing the 2nd and 3rd elements of a multi-dimensional array.
+
+
+```python
+x = nd.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+print('multi-D complete array, x=', x)
+s = x[1:3]
+print('slicing the 2nd and 3rd elements, s=', s)
+```
+
+    multi-D complete array, x=
+    [[  1.   2.   3.   4.]
+     [  5.   6.   7.   8.]
+     [  9.  10.  11.  12.]]
+    <NDArray 3x4 @cpu(0)>
+    slicing the 2nd and 3rd elements, s=
+    [[  5.   6.   7.   8.]
+     [  9.  10.  11.  12.]]
+    <NDArray 2x4 @cpu(0)>
+
+
+Now let's try writing to a specific element. We'll write `9` to element `2` using `x[2] = 9.0`, which will update the whole row.
+
+
+```python
+print('original x, x=', x)
+x[2] = 9.0
+print('replaced entire row with x[2] = 9.0, x=', x)
+```
+
+    original x, x=
+    [[  1.   2.   3.   4.]
+     [  5.   6.   7.   8.]
+     [  9.  10.  11.  12.]]
+    <NDArray 3x4 @cpu(0)>
+    replaced entire row with x[2] = 9.0, x=
+    [[ 1.  2.  3.  4.]
+     [ 5.  6.  7.  8.]
+     [ 9.  9.  9.  9.]]
+    <NDArray 3x4 @cpu(0)>
+
+
+We can target specific elements too. Let's replace the number `3` in the first row with the number `9` using `x[0, 2] = 9.0`.
+
+
+```python
+print('original x, x=', x)
+x[0, 2] = 9.0
+print('replaced specific element with x[0, 2] = 9.0, x=', x)
+```
+
+    original x, x=
+    [[ 1.  2.  3.  4.]
+     [ 5.  6.  7.  8.]
+     [ 9.  9.  9.  9.]]
+    <NDArray 3x4 @cpu(0)>
+    replaced specific element with x[0, 2] = 9.0, x=
+    [[ 1.  2.  9.  4.]
+     [ 5.  6.  7.  8.]
+     [ 9.  9.  9.  9.]]
+    <NDArray 3x4 @cpu(0)>
+
+
+Now lets target even more by selecting a couple of targets at the same time. We'll replace the `6` and the `7` with `x[1:2, 1:3] = 5.0`.
+
+
+```python
+print('original x, x=', x)
+x[1:2, 1:3] = 5.0
+print('replaced range of elements with x[1:2, 1:3] = 5.0, x=', x)
+```
+
+    original x, x=
+    [[ 1.  2.  9.  4.]
+     [ 5.  6.  7.  8.]
+     [ 9.  9.  9.  9.]]
+    <NDArray 3x4 @cpu(0)>
+    replaced range of elements with x[1:2, 1:3] = 5.0, x=
+    [[ 1.  2.  9.  4.]
+     [ 5.  5.  5.  8.]
+     [ 9.  9.  9.  9.]]
+    <NDArray 3x4 @cpu(0)>
+
+
+## New Indexing Features in v1.0
+
+### Step
+
+The basic slice syntax is `i:j:k` where _i_ is the starting index, _j_ is the stopping index, and _k_ is the step (_k_ must be nonzero).
+
+**Note**: Previously, MXNet supported basic slicing and indexing only with `step=1`. From release 1.0, arbitrary values of `step` are supported.
+
+
+```python
+x = nd.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')
+# Select elements 1 through 7, and use a step of 2
+x[1:7:2]
+```
+
+
+
+
+
+    [1 3 5]
+    <NDArray 3 @cpu(0)>
+
+
+
+## Negative Indices
+Negative _i_ and _j_ are interpreted as _n + i_ and _n + j_ where _n_ is the number of elements in the corresponding dimension. Negative _k_ makes stepping go towards smaller indices.
+
+
+```python
+x[-2:10]
+```
+
+
+
+
+
+    [8 9]
+    <NDArray 2 @cpu(0)>
+
+
+
+If the number of objects in the selection tuple is less than N , then : is assumed for any subsequent dimensions.
+
+
+```python
+x = nd.array([[[1],[2],[3]],
+              [[4],[5],[6]]], dtype='int32')
+x[1:2]
+```
+
+
+
+
+
+    [[[4]
+      [5]
+      [6]]]
+    <NDArray 1x3x1 @cpu(0)>
+
+
+
+You may use slicing to set values in the array, but (unlike lists) you can never grow the array. The size of the value to be set in `x[obj] = value` must be able to broadcast to the same shape as `x[obj]`.
+
+
+```python
+x = nd.arange(16, dtype='int32').reshape((4, 4))
+print(x)
+```
+
+
+    [[ 0  1  2  3]
+     [ 4  5  6  7]
+     [ 8  9 10 11]
+     [12 13 14 15]]
+    <NDArray 4x4 @cpu(0)>
+
+
+
+```python
+print(x[1:4:2, 3:0:-1])
+```
+
+
+    [[ 7  6  5]
+     [15 14 13]]
+    <NDArray 2x3 @cpu(0)>
+
+
+
+```python
+x[1:4:2, 3:0:-1] = [[16], [17]]
+print(x)
+```
+
+
+    [[ 0  1  2  3]
+     [ 4 16 16 16]
+     [ 8  9 10 11]
+     [12 17 17 17]]
+    <NDArray 4x4 @cpu(0)>
+
+
+## New Advanced Indexing Features in v1.0
+
+Advanced indexing is triggered when the selection object, obj, is a non-tuple sequence object (e.g. a Python list), a NumPy `ndarray` (of data type integer), an MXNet `NDArray`, or a tuple with at least one sequence object.
+
+Advanced indexing always returns a __copy__ of the data.
+
+**Note**:
+- When the selection object is a Python list, it must be a list of integers. MXNet does not support the selection object being a nested list. That is, `x[[1, 2]]` is supported, while `x[[1], [2]]` is not.
+- When the selection object is a NumPy `ndarray` or an MXNet `NDArray`, there are no dimension restrictions on the object.
+- When the selection object is a tuple containing Python list(s), both integer lists and nested lists are supported. That is, both `x[1:4, [1, 2]]` and `x[1:4, [[1], [2]]]` are supported.
+
+### Purely Integer Array Indexing
+When the index consists of as many integer arrays as the array being indexed has dimensions, the indexing is straightforward, but different from slicing.
+
+Advanced indexes always are [broadcast](https://docs.scipy.org/doc/numpy-1.13.0/reference/ufuncs.html#ufuncs-broadcasting) and iterated as one:
+```python
+result[i_1, ..., i_M] == x[ind_1[i_1, ..., i_M], ind_2[i_1, ..., i_M],
+                           ..., ind_N[i_1, ..., i_M]]
+```
+Note that the result shape is identical to the (broadcast) indexing array shapes `ind_1, ..., ind_N`.
+
+**Example:**
+From each row, a specific element should be selected. The row index is just [0, 1, 2] and the column index specifies the element to choose for the corresponding row, here [0, 1, 0]. Using both together the task can be solved using advanced indexing:
+
+
+```python
+x = nd.array([[1, 2],
+              [3, 4],
+              [5, 6]], dtype='int32')
+x[[0, 1, 2], [0, 1, 0]]
+```
+
+
+
+
+
+    [1 4 5]
+    <NDArray 3 @cpu(0)>
+
+
+
+To achieve a behavior similar to the basic slicing above, broadcasting can be used. This is best understood with an example.
+
+Example:
+From a 4x3 array the corner elements should be selected using advanced indexing. Thus all elements for which the column is one of `[0, 2]` and the row is one of `[0, 3]` need to be selected. To use advanced indexing one needs to select all elements explicitly. Using the method explained previously one could write:
+
+
+```python
+x = nd.array([[ 0,  1,  2],
+              [ 3,  4,  5],
+              [ 6,  7,  8],
+              [ 9, 10, 11]], dtype='int32')
+x[[[0, 0], [3, 3]],
+  [[0, 2], [0, 2]]]
+```
+
+
+
+
+
+    [[ 0  2]
+     [ 9 11]]
+    <NDArray 2x2 @cpu(0)>
+
+
+
+However, since the indexing arrays above just repeat themselves, broadcasting can be used.
+
+
+```python
+x[[[0], [3]],
+  [[0, 2]]]
+```
+
+
+
+
+
+    [[ 0  2]
+     [ 9 11]]
+    <NDArray 2x2 @cpu(0)>
+
+
+
+### Combining Advanced and Basic Indexing
+There are three situations we need to consider when mixing advanced and basic indices in a single selection object. Let's look at examples to understand each one's behavior.
+
+- There is only one advanced index in the selection object. For example, `x` is an `NDArray` with `shape=(10, 20, 30, 40, 50)` and `result=x[:, :, ind]` has one advanced index `ind` with `shape=(2, 3, 4)` on the third axis. The `result` will have `shape=(10, 20, 2, 3, 4, 40, 50)` because the subspace of `x` in the third dimension is replaced by the subspace of `shape=(2, 3, 4)`. If we let _i_, _j_, _k_ loop over the (2, 3, 4)-shaped subspace, it is equivalent to `result[:, :, i, j, k, :, :] = x[:, :, ind[i, j, k], :, :]`.
+
+
+```python
+import numpy as np
+shape = (10, 20, 30, 40, 50)
+x = nd.arange(np.prod(shape), dtype='int32').reshape(shape)
+ind = nd.arange(24).reshape((2, 3, 4))
+print(x[:, :, ind].shape)
+```
+
+    (10, 20, 2, 3, 4, 40, 50)
+
+
+- There are at least two advanced indices in the selection object, and all the advanced indices are adjacent to each other. For example, `x` is an `NDArray` with `shape=(10, 20, 30, 40, 50)` and `result=x[:, :, ind1, ind2, :]` has two advanced indices with shapes that are broadcastable to `shape=(2, 3, 4)`. Then the `result` has `shape=(10, 20, 2, 3, 4, 50)` because `(30, 40)`-shaped subspace has been replaced with `(2, 3, 4)`-shaped subspace from the indices.
+
+
+```python
+ind1 = [0, 1, 2, 3]
+ind2 = [[[0], [1], [2]], [[3], [4], [5]]]
+print(x[:, :, ind1, ind2, :].shape)
+```
+
+    (10, 20, 2, 3, 4, 50)
+
+
+- There are at least two advanced indices in the selection object, and there is at least one advanced index separated from the others by basic indices. For example,  `x` is an `NDArray` with `shape=(10, 20, 30, 40, 50)` and `result=x[:, :, ind1, :, ind2]` has two advanced indices with shapes that are broadcastable to `shape=(2, 3, 4)`. Then the `result` has `shape=(2, 3, 4, 10, 20, 40)` because there is no unambiguous place to place the indexing subspace, hence it is prepended to the beginning.
+
+
+```python
+print(x[:, :, ind1, :, ind2].shape)
+```
+
+    (2, 3, 4, 10, 20, 40)
+
+## References
+
+[NumPy documentation](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#combining-advanced-and-basic-indexing)
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/c++/basics.md b/docs/tutorials/c++/basics.md
index cdf1a28ecd..d3231e7a1f 100644
--- a/docs/tutorials/c++/basics.md
+++ b/docs/tutorials/c++/basics.md
@@ -16,8 +16,8 @@ Except linking the MXNet shared library, the C++ package itself is a header-only
 which means all you need to do is to include the header files. Among the header files,
 `op.h` is special since it is generated dynamically. The generation should be done when
 [building the C++ package](http://mxnet.io/get_started/build_from_source.html#build-the-c++-package).
-After that, you also need to copy the shared library (`libmxnet.so` in linux,
-`libmxnet.dll` in windows) from `/path/to/mxnet/lib` to the working directory.
+It is important to note that you need to **copy the shared library** (`libmxnet.so` in Linux and MacOS,
+`libmxnet.dll` in Windows) from `/path/to/mxnet/lib` to the working directory.
 We do not recommend you to use pre-built binaries because MXNet is under heavy development,
 the operator definitions in `op.h` may be incompatible with the pre-built version.
 
@@ -49,7 +49,7 @@ auto val_iter = MXDataIter("MNISTIter")
     .CreateDataIter();
 ```
 
-The data have been successfully loaded, we can now easily construct various models to identify
+The data have been successfully loaded. We can now easily construct various models to identify
 the digits with the help of C++ package.
 
 
@@ -159,7 +159,12 @@ while (val_iter.Next()) {
 ```
 
 You can find the complete code in `mlp_cpu.cpp`. Use `make mlp_cpu` to compile it,
- and `./mlp_cpu` to run it.
+ and `./mlp_cpu` to run it. If it complains that the shared library `libmxnet.so` is not found
+ after typing `./mlp_cpu`, you will need to specify the path to the shared library in
+ the environment variable `LD_LIBRARY_PATH` in Linux and `DYLD_LIBRARY_PATH`
+ in MacOS. For example, if you are using MacOS, typing
+ `DYLD_LIBRARY_PATH+=. ./mlp_cpu` would solve the problem. It basically tells the system
+ to find the shared library under the current directory since we have just copied it here.
 
 GPU Support
 -----------
@@ -186,4 +191,6 @@ data_batch.label.CopyTo(&args["label"]);
 NDArray::WaitAll();
 ```
 
-By replacing the former code to the latter one, we successfully port the code to GPU. You can find the complete code in `mlp_gpu.cpp`. Compilation is similar to the cpu version. (Note: The shared library should be built with GPU support on)
+By replacing the former code with the latter one, we successfully port the code to GPU.
+You can find the complete code in `mlp_gpu.cpp`. Compilation is similar to the cpu version.
+Note that the shared library must be built with GPU support enabled.
diff --git a/docs/tutorials/general_ml/recommendation_systems.md b/docs/tutorials/general_ml/recommendation_systems.md
deleted file mode 100644
index 81dc587101..0000000000
--- a/docs/tutorials/general_ml/recommendation_systems.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Recommendation Systems
-
-Get the source code for an example of a recommendation system based on MXNet on [GitHub](https://github.com/dmlc/mxnet-notebooks/tree/master/python/recommendation_systems).
-
-## Next Steps
-* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
\ No newline at end of file
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 643261573f..d691ecc427 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -5,7 +5,7 @@
 Gluon is the high-level interface for MXNet. It is more intuitive and easier to use than the lower level interface.
 Gluon supports dynamic (define-by-run) graphs with JIT-compilation to achieve both flexibility and efficiency.
 
-This is a selected subset of Gluon tutorials that explains basic usage of Gluon and fundamental concepts in deep learning. For the comprehensive tutorial on Gluon that covers topics from basic statistics and probability theory to reinforcement learning and recommender systems, please see [gluon.mxnet.io](http://gluon.mxnet.io). 
+This is a selected subset of Gluon tutorials that explain basic usage of Gluon and fundamental concepts in deep learning. For a comprehensive tutorial on Gluon that covers topics from basic statistics and probability theory to reinforcement learning and recommender systems, please see [gluon.mxnet.io](http://gluon.mxnet.io).
 
 ### Basics
 
@@ -38,6 +38,7 @@ These tutorials introduce a few fundamental concepts in deep learning and how to
    :maxdepth: 1
 
    basic/ndarray
+   basic/ndarray_indexing
    basic/symbol
    basic/module
    basic/data
@@ -66,6 +67,15 @@ These tutorials introduce a few fundamental concepts in deep learning and how to
    sparse/train
 ```
 
+### Advanced Neural networks
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   unsupervised_learning/gan
+```
+
 <br>
 More tutorials and examples are available in the GitHub [repository](https://github.com/dmlc/mxnet/tree/master/example).
 
diff --git a/docs/tutorials/nlp/cnn.md b/docs/tutorials/nlp/cnn.md
index 23f74c4efb..7f56b76531 100644
--- a/docs/tutorials/nlp/cnn.md
+++ b/docs/tutorials/nlp/cnn.md
@@ -1,6 +1,6 @@
 # Text Classification Using a Convolutional Neural Network on MXNet
 
-This tutorial is based of Yoon Kim's [paper](https://arxiv.org/abs/1408.5882) on using convolutional neural networks for sentence sentiment classification.
+This tutorial is based on Yoon Kim's [paper](https://arxiv.org/abs/1408.5882) on using convolutional neural networks for sentence sentiment classification. The tutorial has been tested on MXNet 1.0 running under Python 2.7 and Python 3.6.
 
 For this tutorial, we will train a convolutional deep network model on movie review sentences from Rotten Tomatoes labeled with their sentiment. The result will be a model that can classify a sentence based on its sentiment (with 1 being a purely positive sentiment, 0 being a purely negative sentiment and 0.5 being neutral).
 
@@ -8,16 +8,24 @@ Our first step will be to fetch the labeled training data of positive and negati
 
 
 ```python
-import urllib2
+from __future__ import print_function
+
+from collections import Counter
+import itertools
 import numpy as np
 import re
-import itertools
-from collections import Counter
 
+try:
+    # For Python 3.0 and later
+    from urllib.request import urlopen
+except ImportError:
+    # Fall back to Python 2's urllib2
+    from urllib2 import urlopen
+    
 def clean_str(string):
     """
-    Tokenization/string cleaning for all datasets except for SST.
-    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+    Tokenization/string cleaning.
+    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
     """
     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
     string = re.sub(r"\'s", " \'s", string)
@@ -32,38 +40,42 @@ def clean_str(string):
     string = re.sub(r"\)", " \) ", string)
     string = re.sub(r"\?", " \? ", string)
     string = re.sub(r"\s{2,}", " ", string)
+    
     return string.strip().lower()
 
+def download_sentences(url):
+    """
+    Download sentences from specified URL. 
+    
+    Strip trailing newline, convert to Unicode.
+    """
+    
+    remote_file = urlopen(url)
+    return [line.decode('Latin1').strip() for line in remote_file.readlines()]
+    
 def load_data_and_labels():
     """
-    Loads MR polarity data from files, splits the data into words and generates labels.
+    Loads polarity data from files, splits the data into words and generates labels.
     Returns split sentences and labels.
     """
-    # Pull sentences with positive sentiment
-    pos_file = urllib2.urlopen('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
-
-    # Pull sentences with negative sentiment
-    neg_file = urllib2.urlopen('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')
-
-    # Load data from files
-    positive_examples = list(pos_file.readlines())
-    positive_examples = [s.strip() for s in positive_examples]
-    negative_examples = list(neg_file.readlines())
-    negative_examples = [s.strip() for s in negative_examples]
-    # Split by words
+
+    positive_examples = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
+    negative_examples = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')
+    
+    # Tokenize
     x_text = positive_examples + negative_examples
-    x_text = [clean_str(sent) for sent in x_text]
-    x_text = [s.split(" ") for s in x_text]
+    x_text = [clean_str(sent).split(" ") for sent in x_text]
+
     # Generate labels
     positive_labels = [1 for _ in positive_examples]
     negative_labels = [0 for _ in negative_examples]
     y = np.concatenate([positive_labels, negative_labels], 0)
-    return [x_text, y]
+    return x_text, y
 
 
 def pad_sentences(sentences, padding_word="</s>"):
     """
-    Pads all sentences to the same length. The length is defined by the longest sentence.
+    Pads all sentences to be the length of the longest sentence.
     Returns padded sentences.
     """
     sequence_length = max(len(x) for x in sentences)
@@ -73,33 +85,40 @@ def pad_sentences(sentences, padding_word="</s>"):
         num_padding = sequence_length - len(sentence)
         new_sentence = sentence + [padding_word] * num_padding
         padded_sentences.append(new_sentence)
+        
     return padded_sentences
 
 
 def build_vocab(sentences):
     """
-    Builds a vocabulary mapping from word to index based on the sentences.
+    Builds a vocabulary mapping from token to index based on the sentences.
     Returns vocabulary mapping and inverse vocabulary mapping.
     """
     # Build vocabulary
     word_counts = Counter(itertools.chain(*sentences))
+    
     # Mapping from index to word
     vocabulary_inv = [x[0] for x in word_counts.most_common()]
+    
     # Mapping from word to index
     vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
-    return [vocabulary, vocabulary_inv]
+    
+    return vocabulary, vocabulary_inv
 
 
 def build_input_data(sentences, labels, vocabulary):
     """
     Maps sentences and labels to vectors based on a vocabulary.
     """
-    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
+    x = np.array([
+            [vocabulary[word] for word in sentence]
+            for sentence in sentences])
     y = np.array(labels)
-    return [x, y]
+    
+    return x, y
 
 """
-Loads and preprocessed data for the MR dataset.
+Loads and preprocesses data for the MR dataset.
 Returns input vectors, labels, vocabulary, and inverse vocabulary.
 """
 # Load and preprocess data
@@ -123,11 +142,11 @@ y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
 
 sentence_size = x_train.shape[1]
 
-print 'Train/Dev split: %d/%d' % (len(y_train), len(y_dev))
-print 'train shape:', x_train.shape
-print 'dev shape:', x_dev.shape
-print 'vocab_size', vocab_size
-print 'sentence max words', sentence_size
+print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
+print('train shape:', x_train.shape)
+print('dev shape:', x_dev.shape)
+print('vocab_size', vocab_size)
+print('sentence max words', sentence_size)
 ```
 
     Train/Dev split: 9662/1000
@@ -150,8 +169,8 @@ import sys,os
 Define batch size and the place holders for network inputs and outputs
 '''
 
-batch_size = 50 # the size of batches to train network with
-print 'batch size', batch_size
+batch_size = 50
+print('batch size', batch_size)
 
 input_x = mx.sym.Variable('data') # placeholder for input data
 input_y = mx.sym.Variable('softmax_label') # placeholder for output label
@@ -163,7 +182,7 @@ Define the first network layer (embedding)
 
 # create embedding layer to learn representation of words in a lower dimensional subspace (much like word2vec)
 num_embed = 300 # dimensions to embed words into
-print 'embedding dimensions', num_embed
+print('embedding dimensions', num_embed)
 
 embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')
 
@@ -185,14 +204,14 @@ Because each convolution+pool filter produces tensors of different shapes we nee
 ```python
 # create convolution + (max) pooling layer for each filter operation
 filter_list=[3, 4, 5] # the size of filters to use
-print 'convolution filters', filter_list
+print('convolution filters', filter_list)
 
 num_filter=100
 pooled_outputs = []
-for i, filter_size in enumerate(filter_list):
+for filter_size in filter_list:
     convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
     relui = mx.sym.Activation(data=convi, act_type='relu')
-    pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1,1))
+    pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
     pooled_outputs.append(pooli)
 
 # combine all pooled outputs
@@ -206,14 +225,14 @@ h_pool = mx.sym.Reshape(data=concat, target_shape=(batch_size, total_filters))
     convolution filters [3, 4, 5]
 
 
-Next, we add dropout regularization, which will randomly disable a fraction of neurons in the layer (set to 50% here) to ensure that that model does not overfit. This works by preventing neurons from co-adapting and forcing them to learn individually useful features.
+Next, we add dropout regularization, which will randomly disable a fraction of neurons in the layer (set to 50% here) to ensure that the model does not overfit. This prevents neurons from co-adapting and forces them to learn individually useful features.
 
 This is necessary for our model because the dataset has a vocabulary of size around 20k and only around 10k examples so since this data set is pretty small we're likely to overfit with a powerful model (like this neural net).
 
 
 ```python
 # dropout layer
-dropout=0.5
+dropout = 0.5
 print 'dropout probability', dropout
 
 if dropout > 0.0:
@@ -231,7 +250,7 @@ Finally, we add a fully connected layer to add non-linearity to the model. We th
 
 ```python
 # fully connected layer
-num_label=2
+num_label = 2
 
 cls_weight = mx.sym.Variable('cls_weight')
 cls_bias = mx.sym.Variable('cls_bias')
@@ -252,16 +271,16 @@ Now that we have defined our CNN model we will define the device on our machine
 
 ```python
 from collections import namedtuple
-import time
 import math
+import time
 
 # Define the structure of our CNN Model (as a named tuple)
 CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])
 
 # Define what device to train/test on
-ctx=mx.gpu(0)
+ctx = mx.gpu(0)
 # If you have no GPU on your machine change this to
-# ctx=mx.cpu(0)
+# ctx = mx.cpu(0)
 
 arg_names = cnn.list_arguments()
 
@@ -280,16 +299,14 @@ cnn_exec = cnn.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add
 
 param_blocks = []
 arg_dict = dict(zip(arg_names, cnn_exec.arg_arrays))
-initializer=mx.initializer.Uniform(0.1)
+initializer = mx.initializer.Uniform(0.1)
 for i, name in enumerate(arg_names):
     if name in ['softmax_label', 'data']: # input, output
         continue
-    initializer(name, arg_dict[name])
+    initializer(mx.init.InitDesc(name), arg_dict[name])
 
     param_blocks.append( (i, arg_dict[name], args_grad[name], name) )
 
-out_dict = dict(zip(cnn.list_outputs(), cnn_exec.outputs))
-
 data = cnn_exec.arg_dict['data']
 label = cnn_exec.arg_dict['softmax_label']
 
@@ -304,15 +321,15 @@ We can now execute the training and testing of our network, which in-part mxnet
 Train the cnn_model using back prop
 '''
 
-optimizer='rmsprop'
-max_grad_norm=5.0
-learning_rate=0.0005
-epoch=50
+optimizer = 'rmsprop'
+max_grad_norm = 5.0
+learning_rate = 0.0005
+epoch = 50
 
-print 'optimizer', optimizer
-print 'maximum gradient', max_grad_norm
-print 'learning rate (step size)', learning_rate
-print 'epochs to train for', epoch
+print('optimizer', optimizer)
+print('maximum gradient', max_grad_norm)
+print('learning rate (step size)', learning_rate)
+print('epochs to train for', epoch)
 
 # create optimizer
 opt = mx.optimizer.create(optimizer)
@@ -320,9 +337,6 @@ opt.lr = learning_rate
 
 updater = mx.optimizer.get_updater(opt)
 
-# create logging output
-logs = sys.stderr
-
 # For each training epoch
 for iteration in range(epoch):
     tic = time.time()
@@ -369,7 +383,7 @@ for iteration in range(epoch):
     # Decay learning rate for this epoch to ensure we are not "overshooting" optima
     if iteration % 50 == 0 and iteration > 0:
         opt.lr *= 0.5
-        print >> logs, 'reset learning rate to %g' % opt.lr
+        print('reset learning rate to %g' % opt.lr)
 
     # End of training loop for this epoch
     toc = time.time()
@@ -380,11 +394,11 @@ for iteration in range(epoch):
     if (iteration + 1) % 10 == 0:
         prefix = 'cnn'
         cnn_model.symbol.save('./%s-symbol.json' % prefix)
-        save_dict = {('arg:%s' % k) :v  for k, v in cnn_model.cnn_exec.arg_dict.items()}
+        save_dict = {('arg:%s' % k) : v  for k, v in cnn_model.cnn_exec.arg_dict.items()}
         save_dict.update({('aux:%s' % k) : v for k, v in cnn_model.cnn_exec.aux_dict.items()})
         param_name = './%s-%04d.params' % (prefix, iteration)
         mx.nd.save(param_name, save_dict)
-        print >> logs, 'Saved checkpoint to %s' % param_name
+        print('Saved checkpoint to %s' % param_name)
 
 
     # Evaluate model after this epoch on dev (test) set
@@ -406,10 +420,28 @@ for iteration in range(epoch):
         num_total += len(batchY)
 
     dev_acc = num_correct * 100 / float(num_total)
-    print >> logs, 'Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \
-            --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc)
+    print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \
+            --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc))
 ```
 
+
+    optimizer rmsprop
+    maximum gradient 5.0
+    learning rate (step size) 0.0005
+    epochs to train for 50
+    Iter [0] Train: Time: 3.903s, Training Accuracy: 56.290             --- Dev Accuracy thus far: 63.300
+    Iter [1] Train: Time: 3.142s, Training Accuracy: 71.917             --- Dev Accuracy thus far: 69.400
+    Iter [2] Train: Time: 3.146s, Training Accuracy: 80.508             --- Dev Accuracy thus far: 73.900
+    Iter [3] Train: Time: 3.142s, Training Accuracy: 87.233             --- Dev Accuracy thus far: 76.300
+    Iter [4] Train: Time: 3.145s, Training Accuracy: 91.057             --- Dev Accuracy thus far: 77.100
+    Iter [5] Train: Time: 3.145s, Training Accuracy: 94.073             --- Dev Accuracy thus far: 77.700
+    Iter [6] Train: Time: 3.147s, Training Accuracy: 96.000             --- Dev Accuracy thus far: 77.400
+    Iter [7] Train: Time: 3.150s, Training Accuracy: 97.399             --- Dev Accuracy thus far: 77.100
+    Iter [8] Train: Time: 3.144s, Training Accuracy: 98.425             --- Dev Accuracy thus far: 78.000
+    Saved checkpoint to ./cnn-0009.params
+    Iter [9] Train: Time: 3.151s, Training Accuracy: 99.192             --- Dev Accuracy thus far: 77.100
+    ...
+
 Now that we have gone through the trouble of training the model, we have stored the learned parameters in the .params file in our local directory. We can now load this file whenever we want and predict the sentiment of new sentences by running them through a forward pass of the trained model.
 
 ## References
diff --git a/docs/tutorials/nlp/nce_loss.md b/docs/tutorials/nlp/nce_loss.md
deleted file mode 100644
index 564b9e8f59..0000000000
--- a/docs/tutorials/nlp/nce_loss.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# NCE Loss
-This tutorial shows how to use nce-loss to speed up multi-class classification when the number of classes is huge.
-
-You can get the source code for this example on [GitHub](https://github.com/dmlc/mxnet/tree/master/example/nce-loss).
-
-## Toy Examples
-
-* toy_softmax.py. A multi class example using softmax output
-* toy_nce.py. A multi-class example using nce loss
-
-### Word2Vec
-
-* word2vec.py. A CBOW word2vec example using nce loss
-
-Run word2vec.py with the following command:
-
-```
-    ./get_text8.sh
-    python word2vec.py
-```
-
-### LSTM
-
-* lstm_word.py. An LSTM example using nce loss
-
-Run lstm_word.py with the  following command:
-
-```
-    ./get_text8.sh
-    python lstm_word.py
-```
-
-## References
-
-For more details, see [http://www.jianshu.com/p/e439b43ea464](http://www.jianshu.com/p/e439b43ea464) (in Chinese).
-
-## Next Steps
-* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
\ No newline at end of file
diff --git a/docs/tutorials/nlp/rnn.md b/docs/tutorials/nlp/rnn.md
deleted file mode 100644
index e2d2265ece..0000000000
--- a/docs/tutorials/nlp/rnn.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Recurrent Neural Networks
-This folder contains RNN examples using a low-level symbol interface. You can get the source code for this example on [GitHub](https://github.com/dmlc/mxnet/tree/master/example/rnn).
-
-## Python
-
-- [https://github.com/dmlc/mxnet/blob/master/example/rnn/lstm_bucketing.py](lstm_bucketing.py). A PennTreeBank language model using LSTM
-- [https://github.com/dmlc/mxnet/blob/master/example/rnn/cudnn_lstm_bucketing.py](cudnn_lstm_bucketing.py). A PennTreeBank language model using LSTM and CUDNN
-
-Performance Note:
-
-Using more ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For information on setting ```MXNET_GPU_WORKER_NTHREADS```, refer to [Environment Variables](http://mxnet.io/how_to/env_var.html).
-
-## Next Steps
-* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
diff --git a/docs/tutorials/python/linear-regression.md b/docs/tutorials/python/linear-regression.md
index c26435dec6..fc3e7136c1 100644
--- a/docs/tutorials/python/linear-regression.md
+++ b/docs/tutorials/python/linear-regression.md
@@ -156,9 +156,9 @@ parameters of the model to fit the training data. This is accomplished using the
 ```python
 model.fit(train_iter, eval_iter,
             optimizer_params={'learning_rate':0.005, 'momentum': 0.9},
-            num_epoch=50,
+            num_epoch=20,
             eval_metric='mse',
-            batch_end_callback = mx.callback.Speedometer(batch_size, 2))
+            batch_end_callback = mx.callback.Speedometer(batch_size, 2))	    
 ```
 
 ## Using a trained model: (Testing and Inference)
@@ -176,6 +176,7 @@ evaluating our model's mean squared error (MSE) on the evaluation data.
 ```python
 metric = mx.metric.MSE()
 model.score(eval_iter, metric)
+assert model.score(eval_iter, metric)[0][1] < 0.01001, "Achieved MSE (%f) is larger than expected (0.01001)" % model.score(eval_iter, metric)[0][1]
 ```
 
 Let us try and add some noise to the evaluation data and see how the MSE changes:
diff --git a/docs/tutorials/sparse/train.md b/docs/tutorials/sparse/train.md
index 22ce039ee7..d75f741cc7 100644
--- a/docs/tutorials/sparse/train.md
+++ b/docs/tutorials/sparse/train.md
@@ -244,6 +244,7 @@ for epoch in range(10):
         mod.backward()                          # compute gradients
         mod.update()                            # update parameters
     print('Epoch %d, Metric = %s' % (epoch, metric.get()))
+assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" % metric.get()[1]    
 ```
 
 
diff --git a/docs/tutorials/unsupervised_learning/gan.md b/docs/tutorials/unsupervised_learning/gan.md
index 709e1323c6..71774bc989 100644
--- a/docs/tutorials/unsupervised_learning/gan.md
+++ b/docs/tutorials/unsupervised_learning/gan.md
@@ -1,43 +1,43 @@
-# Generative Adversarial Networks
+# Generative Adversarial Network (GAN)
 
-GANs are an application of unsupervised learning - you don't need labels for your dataset in order to train a GAN.
- 
-The GAN framework composes of two neural networks: a generator network and a discriminator network.
+Generative Adversarial Networks (GANs) are a class of algorithms used in unsupervised learning - you don't need labels for your dataset in order to train a GAN.
 
-The generator's job is to take a set of random numbers and produce data (such as images or text).
+The GAN framework is composed of two neural networks: a Generator network and a Discriminator network.
 
-The discriminator then takes in that data as well as samples of that data from a dataset and tries to determine if is "fake" (created by the generator network) or "real" (from the original dataset).
+The Generator's job is to take a set of random numbers and produce the data (such as images or text).
 
-During training, the two networks play a game against each other. The generator tries to create realistic data, so that it can fool the discriminator into thinking that the data it generated is from the original dataset. At the same time, the discriminator tries to not be fooled - it learns to become better at determining if data is real or fake.
+The Discriminator then takes in that data as well as samples of that data from a dataset and tries to determine if it is "fake" (created by the Generator network) or "real" (from the original dataset).
 
-Since the two networks are fighting in this game, they can be seen as as adversaries, which is where the term "Generative Adverserial Network" comes from.
+During training, the two networks play a game against each other. The Generator tries to create realistic data, so that it can fool the Discriminator into thinking that the data it generated is from the original dataset. At the same time, the Discriminator tries to not be fooled - it learns to become better at determining if data is real or fake.
+
+Since the two networks are fighting in this game, they can be seen as adversaries, which is where the term "Generative Adversarial Network" comes from.
 
 ## Deep Convolutional Generative Adversarial Networks
 
 This tutorial takes a look at Deep Convolutional Generative Adversarial Networks (DCGAN), which combines Convolutional Neural Networks (CNNs) and GANs.
 
-We will create a DCGAN that is able to create images of handwritten digits from random numbers.The tutorial uses the neural net architecture and guidelines outlined in [this paper](https://arxiv.org/abs/1511.06434), and the MNIST dataset.
+We will create a DCGAN that is able to create images of handwritten digits from random numbers. The tutorial uses the neural net architecture and guidelines outlined in [this paper](https://arxiv.org/abs/1511.06434), and the MNIST dataset.
 
-##How to Use This Tutorial
+## How to Use This Tutorial
 You can use this tutorial by executing each snippet of python code in order as it appears in the tutorial.
 
 
-1. The first net is the "generator" and creates images of handwritten digits from random numbers.
-2. The second net is the "discriminator" and determines if the image created by the generator is real (a realistic looking image of handwritten digits) or fake (an image that doesn't look like it came from the original dataset).
-    
+1. The first net is the "Generator" and creates images of handwritten digits from random numbers.
+2. The second net is the "Discriminator" and determines if the image created by the Generator is real (a realistic looking image of handwritten digits) or fake (an image that does not look like it is from the original dataset).
+
 Apart from creating a DCGAN, you'll also learn:
 
-- How to manipulate and iterate through batches images that you can feed into your neural network.
+- How to manipulate and iterate through batches of image data that you can feed into your neural network.
 
 - How to create a custom MXNet data iterator that generates random numbers from a normal distribution.
 
-- How to create a custom training process in MXNet, using lower level functions from the MXNet Module API such as .bind() .forward() and .backward(). The training process for a DCGAN is more complex than many other neural net's, so we need to use these functions instead of using the higher level .fit() function.
+- How to create a custom training process in MXNet, using lower level functions from the MXNet Module API such as .bind() .forward() and .backward(). The training process for a DCGAN is more complex than many other neural networks, so we need to use these functions instead of using the higher level .fit() function.
 
 - How to visualize images as they are going through the training process
 
 ## Prerequisites
 
-This tutorial assumes you're familiar with the concept of CNN's and have implemented one in MXNet. You should also be familiar with the concept of logistic regression. Having a basic understanding for MXNet data iterators helps, since we'll create a custom Data Iterator to iterate though random numbers as inputs to our generator network. 
+This tutorial assumes you are familiar with the concepts of CNNs and have implemented one in MXNet. You should also be familiar with the concept of logistic regression. Having a basic understanding of MXNet data iterators helps, since we will create a custom data iterator to iterate through random numbers as inputs to the Generator network.
 
 This example is designed to be trained on a single GPU. Training this network on CPU can be slow, so it's recommended that you use a GPU for training.
 
@@ -47,17 +47,17 @@ To complete this tutorial, you need:
 - Python 2.7, and the following libraries for Python:
     - Numpy - for matrix math
     - OpenCV - for image manipulation
-    - Scikit-learn - to easily get our dataset
-    - Matplotlib - to visualize our output
+    - Scikit-learn - to easily get the MNIST dataset
+    - Matplotlib - to visualize the output
 
 ## The Data
-We need two pieces of data to train our DCGAN:
+We need two pieces of data to train the DCGAN:
     1. Images of handwritten digits from the MNIST dataset
     2. Random numbers from a normal distribution
 
-Our generator network will use the random numbers as the input to produce images of handwritten digits, and out discriminator network will use images of handwritten digits from the MNIST dataset to determine if images produced by our generator are realistic.
+The Generator network will use the random numbers as the input to produce the images of handwritten digits, and the Discriminator network will use images of handwritten digits from the MNIST dataset to determine if images produced by the Generator are realistic.
 
-We are going to use the python library, scikit-learn, to get the MNIST dataset. Scikit-learn comes with a function that gets the dataset for us, which we will then manipulate to create our training and testing inputs.
+We are going to use the python library, scikit-learn, to get the MNIST dataset. Scikit-learn comes with a function that gets the dataset for us, which we will then manipulate to create the training and testing inputs.
 
 The MNIST dataset contains 70,000 images of handwritten digits. Each image is 28x28 pixels in size. To create random numbers, we're going to create a custom MXNet data iterator, which will return random numbers from a normal distribution as we need them.
 
@@ -65,13 +65,14 @@ The MNIST dataset contains 70,000 images of handwritten digits. Each image is 28
 
 ### 1. Preparing the MNIST dataset
 
-Let's start by preparing our handwritten digits from the MNIST dataset. We import the fetch_mldata function from scikit-learn, and use it to get the MNSIT dataset. Notice that it's shape is 70000x784. This contains the 70000 images on every row and 784 pixels of each image in the columns of each row. Each image is 28x28 pixels, but has been flattened so that all 784 images are represented in a single list.
+Let us start by preparing the handwritten digits from the MNIST dataset. We import the fetch_mldata function from scikit-learn, and use it to get the MNIST dataset. Notice that its shape is 70000x784. This contains 70000 images, one per row and 784 pixels of each image in the columns of each row. Each image is 28x28 pixels, but has been flattened so that all 784 pixels are represented in a single list.
+
 ```python
 from sklearn.datasets import fetch_mldata
 mnist = fetch_mldata('MNIST original')
 ```
 
-Next, we'll randomize the handwritten digits by using numpy to create random permutations on the dataset on our rows (images). We'll then reshape our dataset from 70000x786 to 70000x28x28, so that every image in our dataset is arranged into a 28x28 grid, where each cell in the grid represents 1 pixel of the image.
+Next, we will randomize the handwritten digits by using numpy to create random permutations on the dataset on the rows (images). We will then reshape the dataset from 70000x786 to 70000x28x28, so that every image in the dataset is arranged into a 28x28 grid, where each cell in the grid represents 1 pixel of the image.
 
 ```python
 import numpy as np
@@ -81,22 +82,23 @@ p = np.random.permutation(mnist.data.shape[0])
 X = mnist.data[p]
 X = X.reshape((70000, 28, 28))
 ```
-Since the DCGAN that we're creating takes in a 64x64 image as the input, we'll use OpenCV to resize the each 28x28 image to 64x64 images:
+Since the DCGAN that we're creating takes in a 64x64 image as the input, we will use OpenCV to resize each 28x28 image to a 64x64 image:
 ```python
 import cv2
 X = np.asarray([cv2.resize(x, (64,64)) for x in X])
 ```
-Each pixel in our 64x64 image is represented by a number between 0-255, that represents the intensity of the pixel. However, we want to input numbers between -1 and 1 into our DCGAN, as suggested by the research paper. To rescale our pixels to be in the range of -1 to 1, we'll divide each pixel by (255/2). This put our images on a scale of 0-2. We can then subtract by 1, to get them in the range of -1 to 1.
+Each pixel in the 64x64 image is represented by a number between 0-255, that represents the intensity of the pixel. However, we want to input numbers between -1 and 1 into the DCGAN, as suggested by the [research paper](https://arxiv.org/abs/1511.06434). To rescale the pixel values, we will divide it by (255/2). This changes the scale to 0-2. We then subtract by 1 to get them in the range of -1 to 1.
+
 ```python
 X = X.astype(np.float32)/(255.0/2) - 1.0
 ```
-Ultimately, images are inputted into our neural net from a 70000x3x64x64 array, and they are currently in a 70000x64x64 array. We need to add 3 channels to our images. Typically when we are working with images, the 3 channels represent the red, green, and blue components of each image. Since the MNIST dataset is grayscale, we only need 1 channel to represent our dataset. We will pad the other channels with 0's:
+Ultimately, images are fed into the neural net through a 70000x3x64x64 array but they are currently in a 70000x64x64 array. We need to add 3 channels to the images. Typically, when we are working with the images, the 3 channels represent the red, green, and blue (RGB) components of each image. Since the MNIST dataset is grayscale, we only need 1 channel to represent the dataset. We will pad the other channels with 0's:
 
 ```python
 X = X.reshape((70000, 1, 64, 64))
 X = np.tile(X, (1, 3, 1, 1))
 ```
-Finally, we'll put our images into MXNet's NDArrayIter, which will allow MXNet to easily iterate through our images during training. We'll also split up them images into a batches, with 64 images in each batch. Every time we iterate, we'll get a 4 dimensional array with size (64, 3, 64, 64), representing a batch of 64 images.
+Finally, we will put the images into MXNet's NDArrayIter, which will allow MXNet to easily iterate through the images during training. We will also split them up into batches of 64 images each. Every time we iterate, we will get a 4 dimensional array with size (64, 3, 64, 64), representing a batch of 64 images.
 ```python
 import mxnet as mx
 batch_size = 64
@@ -104,7 +106,8 @@ image_iter = mx.io.NDArrayIter(X, batch_size=batch_size)
 ```
 ### 2. Preparing Random Numbers
 
-We need to input random numbers from a normal distribution to our generator network, so we'll create an MXNet DataIter that produces random numbers for each training batch. The DataIter is the base class of MXNet's Data Loading API. Below, we create a class called RandIter which is a subclass of DataIter. We use MXNet's built in mx.random.normal function in order to return the normally distributed random numbers every time we iterate.
+We need to input random numbers from a normal distribution to the Generator network, so we will create an MXNet DataIter that produces random numbers for each training batch. The DataIter is the base class of MXNet's Data Loading API. Below, we create a class called RandIter which is a subclass of DataIter. We use MXNet's built-in mx.random.normal function to return the random numbers from a normal distribution during the iteration.
+
 ```python
 class RandIter(mx.io.DataIter):
     def __init__(self, batch_size, ndim):
@@ -117,22 +120,22 @@ class RandIter(mx.io.DataIter):
         return True
 
     def getdata(self):
-        #Returns random numbers from a gaussian (normal) distribution 
+        #Returns random numbers from a gaussian (normal) distribution
         #with mean=0 and standard deviation = 1
         return [mx.random.normal(0, 1.0, shape=(self.batch_size, self.ndim, 1, 1))]
 ```
-When we initalize our RandIter, we need to provide two numbers: the batch size and how many random numbers we want to produce a single image from. This number is referred to as Z, and we'll set this to 100. This value comes from the research paper on the topic. Every time we iterate and get a batch of random numbers, we will get a 4 dimensional array with shape: (batch_size, Z, 1, 1), which in our example is (64, 100, 1, 1).
+When we initialize the RandIter, we need to provide two numbers: the batch size and how many random numbers we want in order to produce a single image. This number is referred to as Z, and we will set this to 100. This value comes from the research paper on the topic. Every time we iterate and get a batch of random numbers, we will get a 4 dimensional array with shape: (batch_size, Z, 1, 1), which in the example is (64, 100, 1, 1).
 ```python
 Z = 100
 rand_iter = RandIter(batch_size, Z)
 ```
 ## Create the Model
 
-Our model has two networks that we will train together - the generator network and the disciminator network.
+The model has two networks that we will train together - the Generator network and the Discriminator network.
 
 ### The Generator
 
-Let's start off by defining the generator network, which uses deconvolutional layers (also callled fractionally strided layers) to generate an image form random numbers :
+Let us start off by defining the Generator network, which uses Deconvolution layers (also called fractionally strided layers) to generate an image from random numbers:
 ```python
 no_bias = True
 fix_gamma = True
@@ -160,16 +163,16 @@ g5 = mx.sym.Deconvolution(gact4, name='g5', kernel=(4,4), stride=(2,2), pad=(1,1
 generatorSymbol = mx.sym.Activation(g5, name='gact5', act_type='tanh')
 ```
 
-Our generator image starts with random numbers that will be obtained from the RandIter we created earlier, so we created the rand variable for this input.
+The Generator image starts with random numbers that will be obtained from the RandIter we created earlier, so we created the rand variable for this input.
 We then start creating the model starting with a Deconvolution layer (sometimes called 'fractionally strided layer'). We apply batch normalization and ReLU activation after the Deconvolution layer.
 
-We repeat this process 4 times, applying a (2,2) stride and (1,1) pad at each Deconvolutional layer, which doubles the size of our image at each layer. By creating these layers, our generator network will have to learn to upsample our input vector of random numbers, Z at each layer, so that network output a final image. We also reduce half the number of filters at each layer, reducing dimensionality at each layer. Ultimatley, our output layer is a 64x64x3 layer, representing the size and channels of our image. We use tanh activation instead of relu on the last layer, as recommended by the research on DCGANs. The output of neurons in the final gout layer represent the pixels of generated image.
+We repeat this process 4 times, applying a (2,2) stride and (1,1) pad at each Deconvolution layer, which doubles the size of the image at each layer. By creating these layers, the Generator network will have to learn to upsample the input vector of random numbers, Z, at each layer, so that the network outputs a final image. We also reduce by half the number of filters at each layer, reducing dimensionality at each layer. Ultimately, the output layer is a 64x64x3 layer, representing the size and channels of the image. We use tanh activation instead of relu on the last layer, as recommended by the research on DCGANs. The output of neurons in the final gout layer represent the pixels of the generated image.
 
-Notice we used 3 parameters to help us create our model: no_bias, fixed_gamma, and epsilon. Neurons in our network won't have a bias added to them, this seems to work better in practice for the DCGAN. In our batch norm layer, we set fixed_gamma=True, which means gamma=1 for all of our batch norm layers. epsilon is a small number that gets added to our batch norm so that we don't end up dividing by zero. By default, CuDNN requires that this number is greater than 1e-5, so we add a small number to this value, ensuring this values stays small.
+Notice we used 3 parameters to help us create the model: no_bias, fixed_gamma, and epsilon. Neurons in the network won't have a bias added to them; this seems to work better in practice for the DCGAN. In the batch norm layer, we set fixed_gamma=True, which means gamma=1 for all of the batch norm layers. epsilon is a small number that gets added to the batch norm so that we don't end up dividing by zero. By default, CuDNN requires that this number is greater than 1e-5, so we add a small number to this value, ensuring this value stays small.
 
 ### The Discriminator
 
-Let's now create our discriminator network, which will take in images of handwritten digits from the MNIST dataset and images created by the generator network:
+Let us now create the Discriminator network, which will take in images of handwritten digits from the MNIST dataset and images created by the Generator network:
 ```python
 data = mx.sym.Variable('data')
 
@@ -195,19 +198,22 @@ label = mx.sym.Variable('label')
 discriminatorSymbol = mx.sym.LogisticRegressionOutput(data=d5, label=label, name='dloss')
 ```
 
-We start off by creating the data variable, which is used to hold our input images to the discriminator.
+We start off by creating the data variable, which is used to hold the input images to the Discriminator.
+
+The Discriminator then goes through a series of 5 convolutional layers, each with a 4x4 kernel, 2x2 stride, and 1x1 pad. These layers halve the size of the image (which starts at 64x64) at each convolutional layer. The model also increases dimensionality at each layer by doubling the number of filters per convolutional layer, starting at 128 filters and ending at 1024 filters before we flatten the output.
 
-The discriminator then goes through a series of 5 convolutional layers, each with a 4x4 kernel, 2x2 stride, and 1x1 pad. These layers half the size of the image (which starts at 64x64) at each convolutional layer. Our model also increases dimensionality at each layer by doubling the number of filters per convolutional layer, starting at 128 filters and ending at 1024 filters before we flatten the output.
+At the final convolution, we flatten the neural net to get one number as the final output of Discriminator network. This number is the probability that the image is real, as determined by the Discriminator. We use logistic regression to determine this probability. When we pass in "real" images from the MNIST dataset, we can label these as 1 and we can label the "fake" images from the Generator net as 0 to perform logistic regression on the Discriminator network.
 
-At the final convolution, we flatten the neural net to get one number as the final output of discriminator network. This number is the probability the image is real, as determined by our discriminator. We use logistic regression to determine this probability. When we pass in "real" images from the MNIST dataset, we can label these as 1 and we can label the "fake" images from the generator net as 0 to perform logistic regression on the discriminator network.
-Prepare the models using the Module API
+### Prepare the models using the Module API
 
-So far we have defined a MXNet Symbol for both the generator and the discriminator network. Before we can train our model, we need to bind these symbols using the Module API, which creates the computation graph for our models. It also allows us to decide how we want to initialize our model and what type of optimizer we want to use. Let's set up Module for both of our networks:
+So far we have defined a MXNet Symbol for both the Generator and the Discriminator network. Before we can train the model, we need to bind these symbols using the Module API, which creates the computation graph for the models. It also allows us to decide how we want to initialize the model and what type of optimizer we want to use. Let us set up the Module for both the networks:
 ```python
-#Hyperperameters
+#Hyper-parameters
 sigma = 0.02
 lr = 0.0002
 beta1 = 0.5
+# If you do not have a GPU. Use the below outlined
+# ctx = mx.cpu()
 ctx = mx.gpu(0)
 
 #=============Generator Module=============
@@ -236,27 +242,27 @@ discriminator.init_optimizer(
     })
 mods.append(discriminator)
 ```
-First, we create Modules for our networks and then bind the symbols that we've created in the previous steps to our modules.
-We use rand_iter.provide_data as the  data_shape to bind our generator network. This means that as we iterate though batches of data on the generator Module, our RandIter will provide us with random numbers to feed our Module using it's provide_data function.
+First, we create Modules for the networks and then bind the symbols that we've created in the previous steps to the modules.
+We use rand_iter.provide_data as the data_shape to bind the Generator network. This means that as we iterate through batches of the data on the Generator Module, the RandIter will provide us with random numbers to feed the Module using its provide_data function.
 
-Similarly, we bind the discriminator Module to image_iter.provide_data, which gives us images from MNIST from the NDArrayIter we had set up earlier, called image_iter.
+Similarly, we bind the Discriminator Module to image_iter.provide_data, which gives us images from MNIST from the NDArrayIter we had set up earlier, called image_iter.
 
-Notice that we're using the Normal initialization, with the hyperparameter sigma=0.02. This means our weight initializations for the neurons in our networks will random numbers from a Gaussian (normal) distribution with a mean of 0 and a standard deviation of 0.02.
+Notice that we are using the Normal Initialization, with the hyperparameter sigma=0.02. This means the weight initializations for the neurons in the networks will be random numbers from a Gaussian (normal) distribution with a mean of 0 and a standard deviation of 0.02.
 
-We also use the adam optimizer for gradient decent. We've set up two hyperparameters, lr and beta1 based on the values used in the DCGAN paper. We're using a single gpu, gpu(0) for training.
+We also use the Adam optimizer for gradient descent. We've set up two hyperparameters, lr and beta1, based on the values used in the DCGAN paper. We're using a single gpu, gpu(0), for training. Set the context to cpu() if you do not have a GPU on your machine.
 
-### Visualizing Our Training
-Before we train the model, let's set up some helper functions that will help visualize what our generator is producing, compared to what the real image is:
+### Visualizing The Training
+Before we train the model, let us set up some helper functions that will help visualize what the Generator is producing, compared to what the real image is:
 ```python
 from matplotlib import pyplot as plt
 
-#Takes the images in our batch and arranges them in an array so that they can be
+#Takes the images in the batch and arranges them in an array so that they can be
 #Plotted using matplotlib
 def fill_buf(buf, num_images, img, shape):
     width = buf.shape[0]/shape[1]
     height = buf.shape[1]/shape[0]
-    img_width = (num_images%width)*shape[0]
-    img_hight = (num_images/height)*shape[1]
+    img_width = int(num_images%width)*shape[0]
+    img_hight = int(num_images/height)*shape[1]
     buf[img_hight:img_hight+shape[1], img_width:img_width+shape[0], :] = img
 
 #Plots two images side by side using matplotlib
@@ -268,8 +274,8 @@ def visualize(fake, real):
     #Repeat for real image
     real = real.transpose((0, 2, 3, 1))
     real = np.clip((real+1.0)*(255.0/2.0), 0, 255).astype(np.uint8)
-    
-    #Create buffer array that will hold all the images in our batch
+
+    #Create buffer array that will hold all the images in the batch
     #Fill the buffer so to arrange all images in the batch onto the buffer array
     n = np.ceil(np.sqrt(fake.shape[0]))
     fbuff = np.zeros((int(n*fake.shape[1]), int(n*fake.shape[2]), int(fake.shape[3])), dtype=np.uint8)
@@ -278,9 +284,9 @@ def visualize(fake, real):
     rbuff = np.zeros((int(n*real.shape[1]), int(n*real.shape[2]), int(real.shape[3])), dtype=np.uint8)
     for i, img in enumerate(real):
         fill_buf(rbuff, i, img, real.shape[1:3])
-        
+
     #Create a matplotlib figure with two subplots: one for the real and the other for the fake
-    #fill each plot with our buffer array, which creates the image
+    #fill each plot with the buffer array, which creates the image
     fig = plt.figure()
     ax1 = fig.add_subplot(2,2,1)
     ax1.imshow(fbuff)
@@ -288,22 +294,22 @@ def visualize(fake, real):
     ax2.imshow(rbuff)
     plt.show()
 ```
- 
+
 ## Fit the Model
 Training the DCGAN is a complex process that requires multiple steps.
-To fit the model, for every batch of data in our dataset:
+To fit the model, for every batch of data in the MNIST dataset:
 
-1. Use the Z vector, which contains our random numbers to do a forward pass through our generator. This outputs the "fake" image, since it's created from our generator.
+1. Use the Z vector, which contains the random numbers to do a forward pass through the Generator network. This outputs the "fake" image, since it is created from the Generator.
 
-2. Use the fake image as the input to do a forward and backwards pass through the discriminator network. We set our labels for our logistic regression to 0 to represent that this is a fake image. This trains the discriminator to learn what a fake image looks like. We save the gradient produced in backpropogation for the next step.
+2. Use the fake image as the input to do a forward and backward pass through the Discriminator network. We set the labels for logistic regression to 0 to represent that this is a fake image. This trains the Discriminator to learn what a fake image looks like. We save the gradient produced in backpropagation for the next step.
 
-3. Do a forwards and backwards pass through the discriminator using a real image from our dataset. Our label for logistic regression will now be 1 to represent real images, so our discriminator can learn to recognize a real image.
+3. Do a forward and backward pass through the Discriminator using a real image from the MNIST dataset. The label for logistic regression will now be 1 to represent the real images, so the Discriminator can learn to recognize a real image.
 
-4. Update the discriminator by adding the result of the gradient generated during backpropogation on the fake image with the gradient from backpropogation on the real image.
+4. Update the Discriminator by adding the result of the gradient generated during backpropagation on the fake image to the gradient from backpropagation on the real image.
 
-5. Now that the discriminator has been updated for the this batch, we still need to update the generator. First, do a forward and backwards pass with the same batch on the updated discriminator, to produce a new gradient. Use the new gradient to do a backwards pass
+5. Now that the Discriminator has been updated for this data batch, we still need to update the Generator. First, do a forward and backwards pass with the same data batch on the updated Discriminator, to produce a new gradient. Use the new gradient to do a backwards pass.
 
-Here's the main training loop for our DCGAN:
+Here is the main training loop for the DCGAN:
 
 ```python
 # =============train===============
@@ -317,29 +323,29 @@ for epoch in range(1):
         generator.forward(rbatch, is_train=True)
         #Output of training batch is the 64x64x3 image
         outG = generator.get_outputs()
-        
+
         #Pass the generated (fake) image through the discriminator, and save the gradient
         #Label (for logistic regression) is an array of 0's since this image is fake
         label = mx.nd.zeros((batch_size,), ctx=ctx)
         #Forward pass on the output of the discriminator network
         discriminator.forward(mx.io.DataBatch(outG, [label]), is_train=True)
-        #Do the backwards pass and save the gradient
+        #Do the backward pass and save the gradient
         discriminator.backward()
         gradD = [[grad.copyto(grad.context) for grad in grads] for grads in discriminator._exec_group.grad_arrays]
-        
+
         #Pass a batch of real images from MNIST through the discriminator
         #Set the label to be an array of 1's because these are the real images
         label[:] = 1
         batch.label = [label]
         #Forward pass on a batch of MNIST images
         discriminator.forward(batch, is_train=True)
-        #Do the backwards pass and add the saved gradient from the fake images to the gradient 
+        #Do the backward pass and add the saved gradient from the fake images to the gradient
         #generated by this backwards pass on the real images
         discriminator.backward()
         for gradsr, gradsf in zip(discriminator._exec_group.grad_arrays, gradD):
             for gradr, gradf in zip(gradsr, gradsf):
                 gradr += gradf
-        #Update gradient on the discriminator 
+        #Update gradient on the discriminator
         discriminator.update()
 
         #Now that we've updated the discriminator, let's update the generator
@@ -353,7 +359,7 @@ for epoch in range(1):
         generator.backward(diffD)
         #Update the gradients on the generator
         generator.update()
-        
+
         #Increment to the next batch, printing every 50 batches
         i += 1
         if i % 50 == 0:
@@ -364,20 +370,20 @@ for epoch in range(1):
             visualize(outG[0].asnumpy(), batch.data[0].asnumpy())
 ```
 
-This causes our GAN to train and we can visualize the progress that we're making as our networks train. After every 25 iterations, we're calling the visualize function that we created earlier, which creates the visual plots during training.
+This will train the GAN network and visualize the progress that we are making as the networks are trained. After every 25 iterations, we are calling the visualize function that we created earlier, which plots the intermediate results.
 
-The plot on our left will represent what our generator created (the fake image) in the most recent iteration. The plot on the right will represent the original (real) image from the MNIST dataset that was inputted to the discriminator on the same iteration.
+The plot on the left will represent what the Generator created (the fake image) in the most recent iteration. The plot on the right will represent the Original (real) image from the MNIST dataset that was inputted to the Discriminator on the same iteration.
 
-As training goes on the generator becomes better at generating realistic images. You can see this happening since images on the left become closer to the original dataset with each iteration.
+As the training goes on, the Generator becomes better at generating realistic images. You can see this happening since the images on the left become closer to the original dataset with each iteration.
 
 ## Summary
 
-We've now sucessfully used Apache MXNet to train a Deep Convolutional GAN using the MNIST dataset.
+We have now successfully used Apache MXNet to train a Deep Convolutional Generative Adversarial Network (DCGAN) using the MNIST dataset.
 
-As a result, we've created two neural nets: a generator, which is able to create images of handwritten digits from random numbers, and a discriminator, which is able to take an image and determine if it is an image of handwritten digits.
+As a result, we have created two neural nets: a Generator, which is able to create images of handwritten digits from random numbers, and a Discriminator, which is able to take an image and determine if it is an image of handwritten digits.
 
-Along the way, we've learned how to do the image manipulation and visualization that's associted with training deep neural nets. We've also learned how to some of MXNet's advanced training functionality to fit our model.
+Along the way, we have learned how to do the image manipulation and visualization that is associated with the training of deep neural nets. We have also learned how to use MXNet's Module APIs to perform advanced model training functionality to fit the model.
 
 ## Acknowledgements
-This tutorial is based on [MXNet DCGAN codebase](https://github.com/apache/incubator-mxnet/blob/master/example/gan/dcgan.py), 
-[The original paper on GANs](https://arxiv.org/abs/1406.2661), as well as [this paper on deep convolutional GANs](https://arxiv.org/abs/1511.06434).
\ No newline at end of file
+This tutorial is based on [MXNet DCGAN codebase](https://github.com/apache/incubator-mxnet/blob/master/example/gan/dcgan.py),
+[The original paper on GANs](https://arxiv.org/abs/1406.2661), as well as [this paper on deep convolutional GANs](https://arxiv.org/abs/1511.06434).
diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py
index 2eadd869e7..4290e71063 100644
--- a/example/caffe/train_model.py
+++ b/example/caffe/train_model.py
@@ -85,15 +85,8 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None):
             args.gpus is None or len(args.gpus.split(',')) is 1):
         kv = None
 
-    model = mx.model.FeedForward(
-        ctx                = devs,
-        symbol             = network,
-        num_epoch          = args.num_epochs,
-        learning_rate      = args.lr,
-        momentum           = 0.9,
-        wd                 = 0.00001,
-        initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34),
-        **model_args)
+
+    mod = mx.mod.Module(network, context=devs)
 
     if eval_metrics is None:
         eval_metrics = ['accuracy']
@@ -108,10 +101,9 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None):
         batch_end_callback = []
     batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50))
 
-    model.fit(
-       X                  = train,
-       eval_data          = val,
-       eval_metric        = eval_metrics,
-       kvstore            = kv,
-       batch_end_callback = batch_end_callback,
-       epoch_end_callback = checkpoint)
+    mod.fit(train_data=train, eval_metric=eval_metrics, eval_data=val, optimizer='sgd',
+        optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001},
+        num_epoch=args.num_epochs, batch_end_callback=batch_end_callback,
+        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
+        kvstore=kv, epoch_end_callback=checkpoint, **model_args)
+
diff --git a/example/capsnet/README.md b/example/capsnet/README.md
new file mode 100644
index 0000000000..49a6dd1072
--- /dev/null
+++ b/example/capsnet/README.md
@@ -0,0 +1,66 @@
+**CapsNet-MXNet**
+=========================================
+
+This example is MXNet implementation of [CapsNet](https://arxiv.org/abs/1710.09829):  
+Sara Sabour, Nicholas Frosst, Geoffrey E Hinton. Dynamic Routing Between Capsules. NIPS 2017
+- The current `best test error is 0.29%` and `average test error is 0.303%`
+- The `average test error on paper is 0.25%`  
+
+Log files for the error rate are uploaded in [repository](https://github.com/samsungsds-rnd/capsnet.mxnet).  
+* * *
+## **Usage**
+Install scipy with pip  
+```
+pip install scipy
+```
+Install tensorboard with pip
+```
+pip install tensorboard
+```
+
+On Single gpu
+```
+python capsulenet.py --devices gpu0
+```
+On Multi gpus
+```
+python capsulenet.py --devices gpu0,gpu1
+```
+Full arguments  
+```
+python capsulenet.py --batch_size 100 --devices gpu0,gpu1 --num_epoch 100 --lr 0.001 --num_routing 3 --model_prefix capsnet
+```  
+
+* * *
+## **Prerequisites**
+
+MXNet version 0.11.0 or above  
+scipy version 0.19.0 or above
+
+***
+## **Results**  
+Train time takes about 36 seconds for each epoch (batch_size=100, 2 gtx 1080 gpus)  
+
+CapsNet classification test error on MNIST  
+
+```
+python capsulenet.py --devices gpu0,gpu1 --lr 0.0005 --decay 0.99 --model_prefix lr_0_0005_decay_0_99 --batch_size 100 --num_routing 3 --num_epoch 200
+```
+
+![](result.PNG)
+
+| Trial | Epoch | train err(%) | test err(%) | train loss | test loss |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| 1 | 120 | 0.06 | 0.31 | 0.0056 | 0.0064 |
+| 2 | 167 | 0.03 | 0.29 | 0.0048 | 0.0058 |
+| 3 | 182 | 0.04 | 0.31 | 0.0046 | 0.0058 |
+| average | - | 0.043 | 0.303 | 0.005 | 0.006 |
+
+We achieved `the best test error rate=0.29%` and `average test error=0.303%`. It is the best accuracy and fastest training time result among other implementations (Keras, Tensorflow at 2017-11-23).
+The result on paper is `0.25% (average test error rate)`.
+
+| Implementation| test err(%) | ?train time/epoch | GPU  Used|
+| :---: | :---: | :---: |:---: |
+| MXNet | 0.29 | 36 sec | 2 GTX 1080 |
+| tensorflow | 0.49 | ? 10 min | Unknown(4GB Memory) |
+| Keras | 0.30 | 55 sec | 2 GTX 1080 Ti |
diff --git a/example/capsnet/capsulelayers.py b/example/capsnet/capsulelayers.py
new file mode 100644
index 0000000000..5ac4fad491
--- /dev/null
+++ b/example/capsnet/capsulelayers.py
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+
+def squash(data, squash_axis, name=''):
+    epsilon = 1e-08
+    s_squared_norm = mx.sym.sum(data=mx.sym.square(data, name='square_'+name),
+                                axis=squash_axis, keepdims=True, name='s_squared_norm_'+name)
+    scale = s_squared_norm / (1 + s_squared_norm) / mx.sym.sqrt(data=(s_squared_norm+epsilon),
+                                                                name='s_squared_norm_sqrt_'+name)
+    squashed_net = mx.sym.broadcast_mul(scale, data, name='squashed_net_'+name)
+    return squashed_net
+
+
+def primary_caps(data, dim_vector, n_channels, kernel, strides, name=''):
+    out = mx.sym.Convolution(data=data,
+                             num_filter=dim_vector * n_channels,
+                             kernel=kernel,
+                             stride=strides,
+                             name=name
+                             )
+    out = mx.sym.Reshape(data=out, shape=(0, -1, dim_vector))
+    out = squash(out, squash_axis=2)
+    return out
+
+
+class CapsuleLayer:
+    """
+    The capsule layer with dynamic routing.
+    [batch_size, input_num_capsule, input_dim_vector] => [batch_size, num_capsule, dim_vector]
+    """
+
+    def __init__(self, num_capsule, dim_vector, batch_size, kernel_initializer, bias_initializer, num_routing=3):
+        self.num_capsule = num_capsule
+        self.dim_vector = dim_vector
+        self.batch_size = batch_size
+        self.num_routing = num_routing
+        self.kernel_initializer = kernel_initializer
+        self.bias_initializer = bias_initializer
+
+    def __call__(self, data):
+        _, out_shapes, __ = data.infer_shape(data=(self.batch_size, 1, 28, 28))
+        _, input_num_capsule, input_dim_vector = out_shapes[0]
+
+        # build w and bias
+        # W : (input_num_capsule, num_capsule, input_dim_vector, dim_vector)
+        # bias : (batch_size, input_num_capsule, num_capsule ,1, 1)
+        w = mx.sym.Variable('Weight',
+                            shape=(1, input_num_capsule, self.num_capsule, input_dim_vector, self.dim_vector),
+                            init=self.kernel_initializer)
+        bias = mx.sym.Variable('Bias',
+                               shape=(self.batch_size, input_num_capsule, self.num_capsule, 1, 1),
+                               init=self.bias_initializer)
+        bias = mx.sym.BlockGrad(bias)
+        bias_ = bias
+
+        # input : (batch_size, input_num_capsule, input_dim_vector)
+        # inputs_expand : (batch_size, input_num_capsule, 1, input_dim_vector, 1)
+        inputs_expand = mx.sym.Reshape(data=data, shape=(0, 0, -4, -1, 1))
+        inputs_expand = mx.sym.Reshape(data=inputs_expand, shape=(0, 0, -4, 1, -1, 0))
+        # input_tiled (batch_size, input_num_capsule, num_capsule, input_dim_vector, 1)
+        inputs_tiled = mx.sym.tile(data=inputs_expand, reps=(1, 1, self.num_capsule, 1, 1))
+        # w_tiled : [(1L, input_num_capsule, num_capsule, input_dim_vector, dim_vector)]
+        w_tiled = mx.sym.tile(w, reps=(self.batch_size, 1, 1, 1, 1))
+
+        # inputs_hat : [(1L, input_num_capsule, num_capsule, 1, dim_vector)]
+        inputs_hat = mx.sym.linalg_gemm2(w_tiled, inputs_tiled, transpose_a=True)
+
+        inputs_hat = mx.sym.swapaxes(data=inputs_hat, dim1=3, dim2=4)
+        inputs_hat_stopped = inputs_hat
+        inputs_hat_stopped = mx.sym.BlockGrad(inputs_hat_stopped)
+
+        for i in range(0, self.num_routing):
+            c = mx.sym.softmax(bias_, axis=2, name='c' + str(i))
+            if i == self.num_routing - 1:
+                outputs = squash(
+                    mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat, name='broadcast_mul_' + str(i)),
+                               axis=1, keepdims=True,
+                               name='sum_' + str(i)), name='output_' + str(i), squash_axis=4)
+            else:
+                outputs = squash(
+                    mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='broadcast_mul_' + str(i)),
+                               axis=1, keepdims=True,
+                               name='sum_' + str(i)), name='output_' + str(i), squash_axis=4)
+                bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='bias_broadcast_mul' + str(i)),
+                                           axis=4,
+                                           keepdims=True, name='bias_' + str(i))
+
+        outputs = mx.sym.Reshape(data=outputs, shape=(-1, self.num_capsule, self.dim_vector))
+        return outputs
diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py
new file mode 100644
index 0000000000..6b44c3dfca
--- /dev/null
+++ b/example/capsnet/capsulenet.py
@@ -0,0 +1,348 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import mxnet as mx
+import numpy as np
+import os
+import re
+import urllib
+import gzip
+import struct
+import scipy.ndimage as ndi
+from capsulelayers import primary_caps, CapsuleLayer
+
+from tensorboard import SummaryWriter
+
+def margin_loss(y_true, y_pred):
+    loss = y_true * mx.sym.square(mx.sym.maximum(0., 0.9 - y_pred)) +\
+        0.5 * (1 - y_true) * mx.sym.square(mx.sym.maximum(0., y_pred - 0.1))
+    return mx.sym.mean(data=mx.sym.sum(loss, 1))
+
+
+def capsnet(batch_size, n_class, num_routing,recon_loss_weight):
+    # data.shape = [batch_size, 1, 28, 28]
+    data = mx.sym.Variable('data')
+
+    input_shape = (1, 28, 28)
+    # Conv2D layer
+    # net.shape = [batch_size, 256, 20, 20]
+    conv1 = mx.sym.Convolution(data=data,
+                               num_filter=256,
+                               kernel=(9, 9),
+                               layout='NCHW',
+                               name='conv1')
+    conv1 = mx.sym.Activation(data=conv1, act_type='relu', name='conv1_act')
+    # net.shape = [batch_size, 256, 6, 6]
+
+    primarycaps = primary_caps(data=conv1,
+                               dim_vector=8,
+                               n_channels=32,
+                               kernel=(9, 9),
+                               strides=[2, 2],
+                               name='primarycaps')
+    primarycaps.infer_shape(data=(batch_size, 1, 28, 28))
+    # CapsuleLayer
+    kernel_initializer = mx.init.Xavier(rnd_type='uniform', factor_type='avg', magnitude=3)
+    bias_initializer = mx.init.Zero()
+    digitcaps = CapsuleLayer(num_capsule=10,
+                             dim_vector=16,
+                             batch_size=batch_size,
+                             kernel_initializer=kernel_initializer,
+                             bias_initializer=bias_initializer,
+                             num_routing=num_routing)(primarycaps)
+
+    # out_caps : (batch_size, 10)
+    out_caps = mx.sym.sqrt(data=mx.sym.sum(mx.sym.square(digitcaps), 2))
+    out_caps.infer_shape(data=(batch_size, 1, 28, 28))
+
+    y = mx.sym.Variable('softmax_label', shape=(batch_size,))
+    y_onehot = mx.sym.one_hot(y, n_class)
+    y_reshaped = mx.sym.Reshape(data=y_onehot, shape=(batch_size, -4, n_class, -1))
+    y_reshaped.infer_shape(softmax_label=(batch_size,))
+
+    # inputs_masked : (batch_size, 16)
+    inputs_masked = mx.sym.linalg_gemm2(y_reshaped, digitcaps, transpose_a=True)
+    inputs_masked = mx.sym.Reshape(data=inputs_masked, shape=(-3, 0))
+    x_recon = mx.sym.FullyConnected(data=inputs_masked, num_hidden=512, name='x_recon')
+    x_recon = mx.sym.Activation(data=x_recon, act_type='relu', name='x_recon_act')
+    x_recon = mx.sym.FullyConnected(data=x_recon, num_hidden=1024, name='x_recon2')
+    x_recon = mx.sym.Activation(data=x_recon, act_type='relu', name='x_recon_act2')
+    x_recon = mx.sym.FullyConnected(data=x_recon, num_hidden=np.prod(input_shape), name='x_recon3')
+    x_recon = mx.sym.Activation(data=x_recon, act_type='sigmoid', name='x_recon_act3')
+
+    data_flatten = mx.sym.flatten(data=data)
+    squared_error = mx.sym.square(x_recon-data_flatten)
+    recon_error = mx.sym.mean(squared_error)
+    recon_error_stopped = recon_error
+    recon_error_stopped = mx.sym.BlockGrad(recon_error_stopped)
+    loss = mx.symbol.MakeLoss((1-recon_loss_weight)*margin_loss(y_onehot, out_caps)+recon_loss_weight*recon_error)
+
+    out_caps_blocked = out_caps
+    out_caps_blocked = mx.sym.BlockGrad(out_caps_blocked)
+    return mx.sym.Group([out_caps_blocked, loss, recon_error_stopped])
+
+
+def download_data(url, force_download=False):
+    fname = url.split("/")[-1]
+    if force_download or not os.path.exists(fname):
+        urllib.urlretrieve(url, fname)
+    return fname
+
+
+def read_data(label_url, image_url):
+    with gzip.open(download_data(label_url)) as flbl:
+        magic, num = struct.unpack(">II", flbl.read(8))
+        label = np.fromstring(flbl.read(), dtype=np.int8)
+    with gzip.open(download_data(image_url), 'rb') as fimg:
+        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
+        image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols)
+    return label, image
+
+
+def to4d(img):
+    return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255
+
+
+class LossMetric(mx.metric.EvalMetric):
+    def __init__(self, batch_size, num_gpu):
+        super(LossMetric, self).__init__('LossMetric')
+        self.batch_size = batch_size
+        self.num_gpu = num_gpu
+        self.sum_metric = 0
+        self.num_inst = 0
+        self.loss = 0.0
+        self.batch_sum_metric = 0
+        self.batch_num_inst = 0
+        self.batch_loss = 0.0
+        self.recon_loss = 0.0
+        self.n_batch = 0
+
+    def update(self, labels, preds):
+        batch_sum_metric = 0
+        batch_num_inst = 0
+        for label, pred_outcaps in zip(labels[0], preds[0]):
+            label_np = int(label.asnumpy())
+            pred_label = int(np.argmax(pred_outcaps.asnumpy()))
+            batch_sum_metric += int(label_np == pred_label)
+            batch_num_inst += 1
+        batch_loss = preds[1].asnumpy()
+        recon_loss = preds[2].asnumpy()
+        self.sum_metric += batch_sum_metric
+        self.num_inst += batch_num_inst
+        self.loss += batch_loss
+        self.recon_loss += recon_loss
+        self.batch_sum_metric = batch_sum_metric
+        self.batch_num_inst = batch_num_inst
+        self.batch_loss = batch_loss
+        self.n_batch += 1 
+
+    def get_name_value(self):
+        acc = float(self.sum_metric)/float(self.num_inst)
+        mean_loss = self.loss / float(self.n_batch)
+        mean_recon_loss = self.recon_loss / float(self.n_batch)
+        return acc, mean_loss, mean_recon_loss
+
+    def get_batch_log(self, n_batch):
+        print("n_batch :"+str(n_batch)+" batch_acc:" +
+              str(float(self.batch_sum_metric) / float(self.batch_num_inst)) +
+              ' batch_loss:' + str(float(self.batch_loss)/float(self.batch_num_inst)))
+        self.batch_sum_metric = 0
+        self.batch_num_inst = 0
+        self.batch_loss = 0.0
+
+    def reset(self):
+        self.sum_metric = 0
+        self.num_inst = 0
+        self.loss = 0.0
+        self.recon_loss = 0.0
+        self.n_batch = 0
+
+
+class SimpleLRScheduler(mx.lr_scheduler.LRScheduler):
+    """A simple lr schedule that simply return `dynamic_lr`. We will set `dynamic_lr`
+    dynamically based on performance on the validation set.
+    """
+
+    def __init__(self, learning_rate=0.001):
+        super(SimpleLRScheduler, self).__init__()
+        self.learning_rate = learning_rate
+
+    def __call__(self, num_update):
+        return self.learning_rate
+
+
+def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
+    summary_writer = SummaryWriter(args.tblog_dir)
+    lr_scheduler = SimpleLRScheduler(learning_rate)
+    optimizer_params = {'lr_scheduler': lr_scheduler}
+    module.init_params()
+    module.init_optimizer(kvstore=kvstore,
+                          optimizer=optimizer,
+                          optimizer_params=optimizer_params)
+    n_epoch = 0
+    while True:
+        if n_epoch >= num_epoch:
+            break
+        train_iter.reset()
+        val_iter.reset()
+        loss_metric.reset()
+        for n_batch, data_batch in enumerate(train_iter):
+            module.forward_backward(data_batch)
+            module.update()
+            module.update_metric(loss_metric, data_batch.label)
+            loss_metric.get_batch_log(n_batch)
+        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()
+        loss_metric.reset()
+        for n_batch, data_batch in enumerate(val_iter):
+            module.forward(data_batch)
+            module.update_metric(loss_metric, data_batch.label)
+            loss_metric.get_batch_log(n_batch)
+        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()
+
+        summary_writer.add_scalar('train_acc', train_acc, n_epoch)
+        summary_writer.add_scalar('train_loss', train_loss, n_epoch)
+        summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch)
+        summary_writer.add_scalar('val_acc', val_acc, n_epoch)
+        summary_writer.add_scalar('val_loss', val_loss, n_epoch)
+        summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch)
+
+        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, train_recon_err))
+        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err))
+        print('SAVE CHECKPOINT')
+
+        module.save_checkpoint(prefix=model_prefix, epoch=n_epoch)
+        n_epoch += 1
+        lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch)
+
+
+def apply_transform(x,
+                    transform_matrix,
+                    fill_mode='nearest',
+                    cval=0.):
+    x = np.rollaxis(x, 0, 0)
+    final_affine_matrix = transform_matrix[:2, :2]
+    final_offset = transform_matrix[:2, 2]
+    channel_images = [ndi.interpolation.affine_transform(
+        x_channel,
+        final_affine_matrix,
+        final_offset,
+        order=0,
+        mode=fill_mode,
+        cval=cval) for x_channel in x]
+    x = np.stack(channel_images, axis=0)
+    x = np.rollaxis(x, 0, 0 + 1)
+    return x
+
+
+def random_shift(x, width_shift_fraction, height_shift_fraction):
+    tx = np.random.uniform(-height_shift_fraction, height_shift_fraction) * x.shape[2]
+    ty = np.random.uniform(-width_shift_fraction, width_shift_fraction) * x.shape[1]
+    shift_matrix = np.array([[1, 0, tx],
+                             [0, 1, ty],
+                             [0, 0, 1]])
+    x = apply_transform(x, shift_matrix, 'nearest')
+    return x
+
+def _shuffle(data, idx):
+    """Shuffle the data."""
+    shuffle_data = []
+
+    for k, v in data:
+        shuffle_data.append((k, mx.ndarray.array(v.asnumpy()[idx], v.context)))
+
+    return shuffle_data
+
+class MNISTCustomIter(mx.io.NDArrayIter):
+    
+    def reset(self):
+        # shuffle data
+        if self.is_train:
+            np.random.shuffle(self.idx)
+            self.data = _shuffle(self.data, self.idx)
+            self.label = _shuffle(self.label, self.idx)
+        if self.last_batch_handle == 'roll_over' and self.cursor > self.num_data:
+            self.cursor = -self.batch_size + (self.cursor%self.num_data)%self.batch_size
+        else:
+            self.cursor = -self.batch_size
+    def set_is_train(self, is_train):
+        self.is_train = is_train
+    def next(self):
+        if self.iter_next():
+            if self.is_train:
+                data_raw_list = self.getdata()
+                data_shifted = []
+                for data_raw in data_raw_list[0]:
+                    data_shifted.append(random_shift(data_raw.asnumpy(), 0.1, 0.1))
+                return mx.io.DataBatch(data=[mx.nd.array(data_shifted)], label=self.getlabel(),
+                                       pad=self.getpad(), index=None)
+            else:
+                 return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), \
+                                  pad=self.getpad(), index=None)
+
+        else:
+            raise StopIteration
+
+
+if __name__ == "__main__":
+    # Read mnist data set
+    path = 'http://yann.lecun.com/exdb/mnist/'
+    (train_lbl, train_img) = read_data(
+        path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
+    (val_lbl, val_img) = read_data(
+        path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')
+    # set batch size
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batch_size', default=100, type=int)
+    parser.add_argument('--devices', default='gpu0', type=str)
+    parser.add_argument('--num_epoch', default=100, type=int)
+    parser.add_argument('--lr', default=0.001, type=float)
+    parser.add_argument('--num_routing', default=3, type=int)
+    parser.add_argument('--model_prefix', default='capsnet', type=str)
+    parser.add_argument('--decay', default=0.9, type=float)
+    parser.add_argument('--tblog_dir', default='tblog', type=str)
+    parser.add_argument('--recon_loss_weight', default=0.392, type=float)
+    args = parser.parse_args()
+    for k, v in sorted(vars(args).items()):
+        print("{0}: {1}".format(k, v))
+    contexts = re.split(r'\W+', args.devices)
+    for i, ctx in enumerate(contexts):
+        if ctx[:3] == 'gpu':
+            contexts[i] = mx.context.gpu(int(ctx[3:]))
+        else:
+            contexts[i] = mx.context.cpu()
+    num_gpu = len(contexts)
+
+    if args.batch_size % num_gpu != 0:
+        raise Exception('num_gpu should be positive divisor of batch_size')
+
+    # generate train_iter, val_iter
+    train_iter = MNISTCustomIter(data=to4d(train_img), label=train_lbl, batch_size=args.batch_size, shuffle=True)
+    train_iter.set_is_train(True)
+    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=args.batch_size,)
+    val_iter.set_is_train(False)
+    # define capsnet
+    final_net = capsnet(batch_size=args.batch_size/num_gpu, n_class=10, num_routing=args.num_routing, recon_loss_weight=args.recon_loss_weight)
+    # set metric
+    loss_metric = LossMetric(args.batch_size/num_gpu, 1)
+
+    # run model
+    module = mx.mod.Module(symbol=final_net, context=contexts, data_names=('data',), label_names=('softmax_label',))
+    module.bind(data_shapes=train_iter.provide_data,
+                label_shapes=val_iter.provide_label,
+                for_training=True)
+    do_training(num_epoch=args.num_epoch, optimizer='adam', kvstore='device', learning_rate=args.lr,
+                model_prefix=args.model_prefix, decay=args.decay)
diff --git a/example/capsnet/result.PNG b/example/capsnet/result.PNG
new file mode 100644
index 0000000000..62885dd011
Binary files /dev/null and b/example/capsnet/result.PNG differ
diff --git a/example/captcha/mxnet_captcha.R b/example/captcha/mxnet_captcha.R
index 4874ad5354..cf69379d39 100644
--- a/example/captcha/mxnet_captcha.R
+++ b/example/captcha/mxnet_captcha.R
@@ -32,8 +32,8 @@ data.shape <- c(80, 30, 3)
 batch_size <- 40
 
 train <- mx.io.ImageRecordIter(
-  path.imgrec     = "train.rec",
-  path.imglist    = "train.lst",
+  path.imgrec     = "captcha_train.rec",
+  path.imglist    = "captcha_train.lst",
   batch.size      = batch_size,
   label.width     = 4,
   data.shape      = data.shape,
@@ -41,8 +41,8 @@ train <- mx.io.ImageRecordIter(
 )
 
 val <- mx.io.ImageRecordIter(
-  path.imgrec     = "test.rec",
-  path.imglist    = "test.lst",
+  path.imgrec     = "captcha_test.rec",
+  path.imglist    = "captcha_test.lst",
   batch.size      = batch_size,
   label.width     = 4,
   data.shape      = data.shape,
diff --git a/example/deep-embedded-clustering/README.md b/example/deep-embedded-clustering/README.md
new file mode 100644
index 0000000000..90803d2ed1
--- /dev/null
+++ b/example/deep-embedded-clustering/README.md
@@ -0,0 +1,9 @@
+# DEC Implementation
+This is based on the paper `Unsupervised deep embedding for clustering analysis` by Junyuan Xie, Ross Girshick, and Ali Farhadi
+
+## Prerequisite
+  - Install scikit-learn: `python -m pip install --user scikit-learn`
+  - Install SciPy: `python -m pip install --user scipy`
+
+## Usage
+run `python dec.py`
\ No newline at end of file
diff --git a/example/dec/dec.py b/example/deep-embedded-clustering/dec.py
similarity index 99%
rename from example/dec/dec.py
rename to example/deep-embedded-clustering/dec.py
index ac6545abb1..44e582d9f9 100644
--- a/example/dec/dec.py
+++ b/example/deep-embedded-clustering/dec.py
@@ -81,7 +81,7 @@ def list_arguments(self):
             return ['data', 'mu', 'label']
 
     def setup(self, X, num_centers, alpha, save_to='dec_model'):
-        sep = X.shape[0]*9/10
+        sep = X.shape[0]*9//10
         X_train = X[:sep]
         X_val = X[sep:]
         ae_model = AutoEncoderModel(self.xpu, [X.shape[1],500,500,2000,10], pt_dropout=0.2)
diff --git a/example/fcn-xs/README.md b/example/fcn-xs/README.md
index 66ae08fe71..145aa31cb7 100644
--- a/example/fcn-xs/README.md
+++ b/example/fcn-xs/README.md
@@ -1,6 +1,7 @@
-FCN-xs EXAMPLES
----------------
-This folder contains the examples of image segmentation in MXNet.
+FCN-xs EXAMPLE
+--------------
+This folder contains an example implementation for Fully Convolutional Networks (FCN) in MXNet.  
+The example is based on the [FCN paper](https://arxiv.org/abs/1411.4038) by Long et al. of UC Berkeley.
 
 ## Sample results
 ![fcn-xs pasval_voc result](https://github.com/dmlc/web-data/blob/master/mxnet/image/fcnxs-example-result.jpg)
@@ -17,32 +18,36 @@ We have trained a simple fcn-xs model, the hyper-parameters are below:
 
 The training dataset size is only 2027, and the validation dataset size is 462.  
 
-## How to train fcn-xs in mxnet
-#### Getting Started
+## Training the model
+
+### Step 1: setup pre-requisites
 
- Install python package `Pillow` (required by `image_segment.py`).
 ```shell
-[sudo] pip install Pillow
+pip install --upgrade Pillow
 ```
-- Assume that we are in a working directory, such as `~/train_fcn_xs`, and MXNet is built as `~/mxnet`. Now, copy example scripts into working directory.
+- Setup your working directory. Assume your working directory is `~/train_fcn_xs`, and MXNet is built as `~/mxnet`. Copy example scripts into the working directory.
 ```shell
 cp ~/mxnet/example/fcn-xs/* .
 ```
-#### Step1: Download the vgg16fc model and experiment data
-* vgg16fc model : you can download the ```VGG_FC_ILSVRC_16_layers-symbol.json``` and ```VGG_FC_ILSVRC_16_layers-0074.params```   [baidu yun](http://pan.baidu.com/s/1bgz4PC), [dropbox](https://www.dropbox.com/sh/578n5cxej7ofd6m/AACuSeSYGcKQDi1GoB72R5lya?dl=0).  
+### Step 2: Download the vgg16fc model and training data
+* vgg16fc model: you can download the ```VGG_FC_ILSVRC_16_layers-symbol.json``` and ```VGG_FC_ILSVRC_16_layers-0074.params``` from [baidu yun](http://pan.baidu.com/s/1bgz4PC), [dropbox](https://www.dropbox.com/sh/578n5cxej7ofd6m/AACuSeSYGcKQDi1GoB72R5lya?dl=0).  
 this is the fully convolution style of the origin
 [VGG_ILSVRC_16_layers.caffemodel](http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_16_layers.caffemodel), and the corresponding [VGG_ILSVRC_16_layers_deploy.prototxt](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-vgg_ilsvrc_16_layers_deploy-prototxt), the vgg16 model has [license](http://creativecommons.org/licenses/by-nc/4.0/) for non-commercial use only.
-* experiment data : you can download the ```VOC2012.rar```  [robots.ox.ac.uk](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar), and extract it. the file/folder will be like:  
-```JPEGImages folder```, ```SegmentationClass folder```, ```train.lst```, ```val.lst```, ```test.lst```
+* Training data: download the ```VOC2012.rar```  [robots.ox.ac.uk](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar), and extract it into ```.\VOC2012```
+* Mapping files: download ```train.lst```, ```val.lst``` from [baidu yun](http://pan.baidu.com/s/1bgz4PC) into the ```.\VOC2012``` directory
+
+Once you completed all these steps, your working directory should contain a ```.\VOC2012``` directory, which contains the following: ```JPEGImages folder```, ```SegmentationClass folder```, ```train.lst```, ```val.lst```
 
-#### Step2: Train fcn-xs model
-* Configure GPU/CPU for training in `fcn_xs.py`.
+#### Step 3: Train the fcn-xs model
+* Based on your hardware, configure GPU or CPU for training in `fcn_xs.py`. It is recommended to use GPU due to the computational complexity and data load.
 ```python
 # ctx = mx.cpu(0)
 ctx = mx.gpu(0)
 ```
-* If you want to train the fcn-8s model, it's better for you trained the fcn-32s and fcn-16s model firstly.
-when training the fcn-32s model, run in shell ```./run_fcnxs.sh```, the script in it is:
+* It is recommended to train fcn-32s and fcn-16s before training the fcn-8s model
+
+To train the fcn-32s model, run the following:
 ```shell
 python -u fcn_xs.py --model=fcn32s --prefix=VGG_FC_ILSVRC_16_layers --epoch=74 --init-type=vgg16
 ```
@@ -64,14 +69,15 @@ INFO:root:Epoch[0] Batch [350]  Speed: 1.12 samples/sec Train-accuracy=0.912080
 ```
 
 ## Using the pre-trained model for image segmentation
-* Similarly, you should first download the pre-trained model from  [yun.baidu](http://pan.baidu.com/s/1bgz4PC), the symbol and model file is ```FCN8s_VGG16-symbol.json```, ```FCN8s_VGG16-0019.params```
-* Then put the image in your directory for segmentation, and change the ```img = YOUR_IMAGE_NAME``` in ```image_segmentaion.py```
-* At last, use ```image_segmentaion.py``` to segmentation one image by running in shell ```python image_segmentaion.py```, then you will get the segmentation image like the sample results above.
+To try out the pre-trained model, follow these steps:
+* Download the pre-trained symbol and weights from [yun.baidu](http://pan.baidu.com/s/1bgz4PC). You should download these files: ```FCN8s_VGG16-symbol.json``` and ```FCN8s_VGG16-0019.params```
+* Run the segmentation script, providing it your input image path: ```python image_segmentaion.py --input <your JPG image path>```
+* The segmented output ```.png``` file will be generated in the working directory
 
 ## Tips
-* This is the whole image size training, that is to say, we do not need resize/crop the image to the same size, so the batch_size during training is set to 1.
-* The fcn-xs model is based on vgg16 model, with some crop, deconv, element-sum layer added, so the model is quite big, moreover, the example is using whole image size training, if the input image is large(such as 700*500), then it may consume lots of memories, so I suggest you using the GPU with 12G memory.
-* If you don't have GPU with 12G memory, maybe you should change the ```cut_off_size``` to a small value when you construct your FileIter, like this:  
+* This example runs full image size training, so there is no need to resize or crop input images to the same size. Accordingly, batch_size during training is set to 1.
+* The fcn-xs model is based on vgg16 model, with some crop, deconv, element-sum layer added, so the model is quite big, moreover, the example is using whole image size training, if the input image is large(such as 700*500), then memory consumption may be high. Due to that, I suggest you use GPU with at least 12GB memory for training.
+* If you don't have access to GPU with 12GB memory for training, I suggest you change the ```cut_off_size``` to a small value when constructing the FileIter, example below:  
 ```python
 train_dataiter = FileIter(
       root_dir             = "./VOC2012",
@@ -80,4 +86,4 @@ train_dataiter = FileIter(
       rgb_mean             = (123.68, 116.779, 103.939),
       )
 ```
-* We are looking forward you to making this example more powerful, thanks.
+
diff --git a/example/fcn-xs/image_segmentaion.py b/example/fcn-xs/image_segmentaion.py
index ddd850fe4e..75df2d128a 100644
--- a/example/fcn-xs/image_segmentaion.py
+++ b/example/fcn-xs/image_segmentaion.py
@@ -15,38 +15,68 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# pylint: skip-file
+"""
+This module encapsulates running image segmentation model for inference.
+
+Example usage:
+    $ python image_segmentaion.py --input <your JPG image path>
+"""
+
+import argparse
+import os
 import numpy as np
 import mxnet as mx
 from PIL import Image
 
-def getpallete(num_cls):
-    # this function is to get the colormap for visualizing the segmentation mask
-    n = num_cls
-    pallete = [0]*(n*3)
-    for j in xrange(0,n):
-            lab = j
-            pallete[j*3+0] = 0
-            pallete[j*3+1] = 0
-            pallete[j*3+2] = 0
-            i = 0
-            while (lab > 0):
-                    pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
-                    pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
-                    pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
-                    i = i + 1
-                    lab >>= 3
-    return pallete
+def make_file_extension_assertion(extension):
+    """Function factory for file extension argparse assertion
+        Args:
+            extension (string): the file extension to assert
+
+        Returns:
+            string: the supplied extension, if assertion is successful.
+
+    """
+    def file_extension_assertion(file_path):
+        base, ext = os.path.splitext(file_path)
+        if ext.lower() != extension:
+            raise argparse.ArgumentTypeError('File must have ' + extension + ' extension')
+        return file_path
+    return file_extension_assertion
+
+def get_palette(num_colors=256):
+    """generates the colormap for visualizing the segmentation mask
+            Args:
+                num_colors (int): the number of colors to generate in the output palette
 
-pallete = getpallete(256)
-img = "./person_bicycle.jpg"
-seg = img.replace("jpg", "png")
-model_previx = "FCN8s_VGG16"
-epoch = 19
-ctx = mx.gpu(0)
+            Returns:
+                string: the supplied extension, if assertion is successful.
+
+    """
+    pallete = [0]*(num_colors*3)
+    for j in range(0, num_colors):
+        lab = j
+        pallete[j*3+0] = 0
+        pallete[j*3+1] = 0
+        pallete[j*3+2] = 0
+        i = 0
+        while (lab > 0):
+            pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
+            pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
+            pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
+            i = i + 1
+            lab >>= 3
+    return pallete
 
 def get_data(img_path):
-    """get the (1, 3, h, w) np.array data for the img_path"""
+    """get the (1, 3, h, w) np.array data for the supplied image
+                Args:
+                    img_path (string): the input image path
+
+                Returns:
+                    np.array: image data in a (1, 3, h, w) shape
+
+    """
     mean = np.array([123.68, 116.779, 103.939])  # (R,G,B)
     img = Image.open(img_path)
     img = np.array(img, dtype=np.float32)
@@ -58,18 +88,37 @@ def get_data(img_path):
     return img
 
 def main():
-    fcnxs, fcnxs_args, fcnxs_auxs = mx.model.load_checkpoint(model_previx, epoch)
-    fcnxs_args["data"] = mx.nd.array(get_data(img), ctx)
+    """Module main execution"""
+    # Initialization variables - update to change your model and execution context
+    model_prefix = "FCN8s_VGG16"
+    epoch = 19
+
+    # By default, MXNet will run on the CPU. Uncomment the line below to execute on the GPU
+    # ctx = mx.gpu()
+
+    fcnxs, fcnxs_args, fcnxs_auxs = mx.model.load_checkpoint(model_prefix, epoch)
+    fcnxs_args["data"] = mx.nd.array(get_data(args.input), ctx)
     data_shape = fcnxs_args["data"].shape
     label_shape = (1, data_shape[2]*data_shape[3])
     fcnxs_args["softmax_label"] = mx.nd.empty(label_shape, ctx)
-    exector = fcnxs.bind(ctx, fcnxs_args ,args_grad=None, grad_req="null", aux_states=fcnxs_args)
+    exector = fcnxs.bind(ctx, fcnxs_args, args_grad=None, grad_req="null", aux_states=fcnxs_args)
     exector.forward(is_train=False)
     output = exector.outputs[0]
     out_img = np.uint8(np.squeeze(output.asnumpy().argmax(axis=1)))
     out_img = Image.fromarray(out_img)
-    out_img.putpalette(pallete)
-    out_img.save(seg)
+    out_img.putpalette(get_palette())
+    out_img.save(args.output)
 
 if __name__ == "__main__":
+    # Handle command line arguments
+    parser = argparse.ArgumentParser(description='Run VGG16-FCN-8s to segment an input image')
+    parser.add_argument('--input',
+                        required=True,
+                        type=make_file_extension_assertion('.jpg'),
+                        help='The segmentation input JPG image')
+    parser.add_argument('--output',
+                        default='segmented.png',
+                        type=make_file_extension_assertion('.png'),
+                        help='The segmentation putput PNG image')
+    args = parser.parse_args()
     main()
diff --git a/example/gan/dcgan.py b/example/gan/dcgan.py
deleted file mode 100644
index 981f4a4778..0000000000
--- a/example/gan/dcgan.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import mxnet as mx
-import numpy as np
-from sklearn.datasets import fetch_mldata
-from matplotlib import pyplot as plt
-import logging
-import cv2
-from datetime import datetime
-
-def make_dcgan_sym(ngf, ndf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12):
-    BatchNorm = mx.sym.BatchNorm
-    rand = mx.sym.Variable('rand')
-
-    g1 = mx.sym.Deconvolution(rand, name='g1', kernel=(4,4), num_filter=ngf*8, no_bias=no_bias)
-    gbn1 = BatchNorm(g1, name='gbn1', fix_gamma=fix_gamma, eps=eps)
-    gact1 = mx.sym.Activation(gbn1, name='gact1', act_type='relu')
-
-    g2 = mx.sym.Deconvolution(gact1, name='g2', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ngf*4, no_bias=no_bias)
-    gbn2 = BatchNorm(g2, name='gbn2', fix_gamma=fix_gamma, eps=eps)
-    gact2 = mx.sym.Activation(gbn2, name='gact2', act_type='relu')
-
-    g3 = mx.sym.Deconvolution(gact2, name='g3', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ngf*2, no_bias=no_bias)
-    gbn3 = BatchNorm(g3, name='gbn3', fix_gamma=fix_gamma, eps=eps)
-    gact3 = mx.sym.Activation(gbn3, name='gact3', act_type='relu')
-
-    g4 = mx.sym.Deconvolution(gact3, name='g4', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ngf, no_bias=no_bias)
-    gbn4 = BatchNorm(g4, name='gbn4', fix_gamma=fix_gamma, eps=eps)
-    gact4 = mx.sym.Activation(gbn4, name='gact4', act_type='relu')
-
-    g5 = mx.sym.Deconvolution(gact4, name='g5', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=nc, no_bias=no_bias)
-    gout = mx.sym.Activation(g5, name='gact5', act_type='tanh')
-
-    data = mx.sym.Variable('data')
-    label = mx.sym.Variable('label')
-
-    d1 = mx.sym.Convolution(data, name='d1', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ndf, no_bias=no_bias)
-    dact1 = mx.sym.LeakyReLU(d1, name='dact1', act_type='leaky', slope=0.2)
-
-    d2 = mx.sym.Convolution(dact1, name='d2', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ndf*2, no_bias=no_bias)
-    dbn2 = BatchNorm(d2, name='dbn2', fix_gamma=fix_gamma, eps=eps)
-    dact2 = mx.sym.LeakyReLU(dbn2, name='dact2', act_type='leaky', slope=0.2)
-
-    d3 = mx.sym.Convolution(dact2, name='d3', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ndf*4, no_bias=no_bias)
-    dbn3 = BatchNorm(d3, name='dbn3', fix_gamma=fix_gamma, eps=eps)
-    dact3 = mx.sym.LeakyReLU(dbn3, name='dact3', act_type='leaky', slope=0.2)
-
-    d4 = mx.sym.Convolution(dact3, name='d4', kernel=(4,4), stride=(2,2), pad=(1,1), num_filter=ndf*8, no_bias=no_bias)
-    dbn4 = BatchNorm(d4, name='dbn4', fix_gamma=fix_gamma, eps=eps)
-    dact4 = mx.sym.LeakyReLU(dbn4, name='dact4', act_type='leaky', slope=0.2)
-
-    d5 = mx.sym.Convolution(dact4, name='d5', kernel=(4,4), num_filter=1, no_bias=no_bias)
-    d5 = mx.sym.Flatten(d5)
-
-    dloss = mx.sym.LogisticRegressionOutput(data=d5, label=label, name='dloss')
-    return gout, dloss
-
-def get_mnist():
-    mnist = fetch_mldata('MNIST original')
-    np.random.seed(1234) # set seed for deterministic ordering
-    p = np.random.permutation(mnist.data.shape[0])
-    X = mnist.data[p]
-    X = X.reshape((70000, 28, 28))
-
-    X = np.asarray([cv2.resize(x, (64,64)) for x in X])
-
-    X = X.astype(np.float32)/(255.0/2) - 1.0
-    X = X.reshape((70000, 1, 64, 64))
-    X = np.tile(X, (1, 3, 1, 1))
-    X_train = X[:60000]
-    X_test = X[60000:]
-
-    return X_train, X_test
-
-class RandIter(mx.io.DataIter):
-    def __init__(self, batch_size, ndim):
-        self.batch_size = batch_size
-        self.ndim = ndim
-        self.provide_data = [('rand', (batch_size, ndim, 1, 1))]
-        self.provide_label = []
-
-    def iter_next(self):
-        return True
-
-    def getdata(self):
-        return [mx.random.normal(0, 1.0, shape=(self.batch_size, self.ndim, 1, 1))]
-
-class ImagenetIter(mx.io.DataIter):
-    def __init__(self, path, batch_size, data_shape):
-        self.internal = mx.io.ImageRecordIter(
-            path_imgrec = path,
-            data_shape  = data_shape,
-            batch_size  = batch_size,
-            rand_crop   = True,
-            rand_mirror = True,
-            max_crop_size = 256,
-            min_crop_size = 192)
-        self.provide_data = [('data', (batch_size,) + data_shape)]
-        self.provide_label = []
-
-    def reset(self):
-        self.internal.reset()
-
-    def iter_next(self):
-        return self.internal.iter_next()
-
-    def getdata(self):
-        data = self.internal.getdata()
-        data = data * (2.0/255.0)
-        data -= 1
-        return [data]
-
-def fill_buf(buf, i, img, shape):
-    n = buf.shape[0]/shape[1]
-    m = buf.shape[1]/shape[0]
-
-    sx = (i%m)*shape[0]
-    sy = (i/m)*shape[1]
-    buf[sy:sy+shape[1], sx:sx+shape[0], :] = img
-
-def visual(title, X):
-    assert len(X.shape) == 4
-    X = X.transpose((0, 2, 3, 1))
-    X = np.clip((X+1.0)*(255.0/2.0), 0, 255).astype(np.uint8)
-    n = np.ceil(np.sqrt(X.shape[0]))
-    buff = np.zeros((int(n*X.shape[1]), int(n*X.shape[2]), int(X.shape[3])), dtype=np.uint8)
-    for i, img in enumerate(X):
-        fill_buf(buff, i, img, X.shape[1:3])
-    buff = cv2.cvtColor(buff, cv2.COLOR_BGR2RGB)
-    plt.imshow(buff)
-    plt.title(title)
-    plt.show()
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG)
-
-    # =============setting============
-    dataset = 'mnist'
-    imgnet_path = './train.rec'
-    ndf = 64
-    ngf = 64
-    nc = 3
-    batch_size = 64
-    Z = 100
-    lr = 0.0002
-    beta1 = 0.5
-    ctx = mx.gpu(0)
-    check_point = False
-
-    symG, symD = make_dcgan_sym(ngf, ndf, nc)
-    #mx.viz.plot_network(symG, shape={'rand': (batch_size, 100, 1, 1)}).view()
-    #mx.viz.plot_network(symD, shape={'data': (batch_size, nc, 64, 64)}).view()
-
-    # ==============data==============
-    if dataset == 'mnist':
-        X_train, X_test = get_mnist()
-        train_iter = mx.io.NDArrayIter(X_train, batch_size=batch_size)
-    elif dataset == 'imagenet':
-        train_iter = ImagenetIter(imgnet_path, batch_size, (3, 64, 64))
-    rand_iter = RandIter(batch_size, Z)
-    label = mx.nd.zeros((batch_size,), ctx=ctx)
-
-    # =============module G=============
-    modG = mx.mod.Module(symbol=symG, data_names=('rand',), label_names=None, context=ctx)
-    modG.bind(data_shapes=rand_iter.provide_data)
-    modG.init_params(initializer=mx.init.Normal(0.02))
-    modG.init_optimizer(
-        optimizer='adam',
-        optimizer_params={
-            'learning_rate': lr,
-            'wd': 0.,
-            'beta1': beta1,
-        })
-    mods = [modG]
-
-    # =============module D=============
-    modD = mx.mod.Module(symbol=symD, data_names=('data',), label_names=('label',), context=ctx)
-    modD.bind(data_shapes=train_iter.provide_data,
-              label_shapes=[('label', (batch_size,))],
-              inputs_need_grad=True)
-    modD.init_params(initializer=mx.init.Normal(0.02))
-    modD.init_optimizer(
-        optimizer='adam',
-        optimizer_params={
-            'learning_rate': lr,
-            'wd': 0.,
-            'beta1': beta1,
-        })
-    mods.append(modD)
-
-
-    # ============printing==============
-    def norm_stat(d):
-        return mx.nd.norm(d)/np.sqrt(d.size)
-    mon = mx.mon.Monitor(10, norm_stat, pattern=".*output|d1_backward_data", sort=True)
-    mon = None
-    if mon is not None:
-        for mod in mods:
-            pass
-
-    def facc(label, pred):
-        pred = pred.ravel()
-        label = label.ravel()
-        return ((pred > 0.5) == label).mean()
-
-    def fentropy(label, pred):
-        pred = pred.ravel()
-        label = label.ravel()
-        return -(label*np.log(pred+1e-12) + (1.-label)*np.log(1.-pred+1e-12)).mean()
-
-    mG = mx.metric.CustomMetric(fentropy)
-    mD = mx.metric.CustomMetric(fentropy)
-    mACC = mx.metric.CustomMetric(facc)
-
-    print('Training...')
-    stamp =  datetime.now().strftime('%Y_%m_%d-%H_%M')
-
-    # =============train===============
-    for epoch in range(100):
-        train_iter.reset()
-        for t, batch in enumerate(train_iter):
-            rbatch = rand_iter.next()
-
-            if mon is not None:
-                mon.tic()
-
-            modG.forward(rbatch, is_train=True)
-            outG = modG.get_outputs()
-
-            # update discriminator on fake
-            label[:] = 0
-            modD.forward(mx.io.DataBatch(outG, [label]), is_train=True)
-            modD.backward()
-            #modD.update()
-            gradD = [[grad.copyto(grad.context) for grad in grads] for grads in modD._exec_group.grad_arrays]
-
-            modD.update_metric(mD, [label])
-            modD.update_metric(mACC, [label])
-
-            # update discriminator on real
-            label[:] = 1
-            batch.label = [label]
-            modD.forward(batch, is_train=True)
-            modD.backward()
-            for gradsr, gradsf in zip(modD._exec_group.grad_arrays, gradD):
-                for gradr, gradf in zip(gradsr, gradsf):
-                    gradr += gradf
-            modD.update()
-
-            modD.update_metric(mD, [label])
-            modD.update_metric(mACC, [label])
-
-            # update generator
-            label[:] = 1
-            modD.forward(mx.io.DataBatch(outG, [label]), is_train=True)
-            modD.backward()
-            diffD = modD.get_input_grads()
-            modG.backward(diffD)
-            modG.update()
-
-            mG.update([label], modD.get_outputs())
-
-
-            if mon is not None:
-                mon.toc_print()
-
-            t += 1
-            if t % 10 == 0:
-                print('epoch:', epoch, 'iter:', t, 'metric:', mACC.get(), mG.get(), mD.get())
-                mACC.reset()
-                mG.reset()
-                mD.reset()
-
-                visual('gout', outG[0].asnumpy())
-                diff = diffD[0].asnumpy()
-                diff = (diff - diff.mean())/diff.std()
-                visual('diff', diff)
-                visual('data', batch.data[0].asnumpy())
-
-        if check_point:
-            print('Saving...')
-            modG.save_params('%s_G_%s-%04d.params'%(dataset, stamp, epoch))
-            modD.save_params('%s_D_%s-%04d.params'%(dataset, stamp, epoch))
diff --git a/example/gluon/tree_lstm/main.py b/example/gluon/tree_lstm/main.py
index f04a69f267..67644f97d3 100644
--- a/example/gluon/tree_lstm/main.py
+++ b/example/gluon/tree_lstm/main.py
@@ -16,7 +16,11 @@
 # under the License.
 
 # This example is inspired by https://github.com/dasguptar/treelstm.pytorch
-import argparse, cPickle, math, os, random
+import argparse, math, os, random
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import logging
 logging.basicConfig(level=logging.INFO)
 import numpy as np
@@ -66,9 +70,9 @@
 batch_size = opt.batch_size
 
 # read dataset
-if os.path.exists('dataset.cPickle'):
-    with open('dataset.cPickle', 'rb') as f:
-        train_iter, dev_iter, test_iter, vocab = cPickle.load(f)
+if os.path.exists('dataset.pickle'):
+    with open('dataset.pickle', 'rb') as f:
+        train_iter, dev_iter, test_iter, vocab = pickle.load(f)
 else:
     root_dir = opt.data
     segments = ['train', 'dev', 'test']
@@ -80,8 +84,8 @@
 
     train_iter, dev_iter, test_iter = [SICKDataIter(os.path.join(root_dir, segment), vocab, num_classes)
                                        for segment in segments]
-    with open('dataset.cPickle', 'wb') as f:
-        cPickle.dump([train_iter, dev_iter, test_iter, vocab], f)
+    with open('dataset.pickle', 'wb') as f:
+        pickle.dump([train_iter, dev_iter, test_iter, vocab], f)
 
 logging.info('==> SICK vocabulary size : %d ' % vocab.size)
 logging.info('==> Size of train data   : %d ' % len(train_iter))
diff --git a/example/gluon/word_language_model/train.py b/example/gluon/word_language_model/train.py
index 0b504998be..b419277dcf 100644
--- a/example/gluon/word_language_model/train.py
+++ b/example/gluon/word_language_model/train.py
@@ -54,6 +54,11 @@
                     help='report interval')
 parser.add_argument('--save', type=str, default='model.params',
                     help='path to save the final model')
+parser.add_argument('--gctype', type=str, default='none',
+                    help='type of gradient compression to use, \
+                          takes `2bit` or `none` for now.')
+parser.add_argument('--gcthreshold', type=float, default=0.5,
+                    help='threshold for 2bit gradient compression')
 args = parser.parse_args()
 
 
@@ -90,10 +95,13 @@ def batchify(data, batch_size):
 model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                        args.nlayers, args.dropout, args.tied)
 model.collect_params().initialize(mx.init.Xavier(), ctx=context)
+
+compression_params = None if args.gctype == 'none' else {'type': args.gctype, 'threshold': args.gcthreshold}
 trainer = gluon.Trainer(model.collect_params(), 'sgd',
                         {'learning_rate': args.lr,
                          'momentum': 0,
-                         'wd': 0})
+                         'wd': 0},
+                        compression_params=compression_params)
 loss = gluon.loss.SoftmaxCrossEntropyLoss()
 
 ###############################################################################
diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py
index aeacffa82b..b6d1d642c8 100644
--- a/example/image-classification/benchmark_score.py
+++ b/example/image-classification/benchmark_score.py
@@ -33,6 +33,9 @@ def get_symbol(network, batch_size):
     if 'resnet' in network:
         num_layers = int(network.split('-')[1])
         network = 'resnet'
+    if 'vgg' in network:
+        num_layers = int(network.split('-')[1])
+        network = 'vgg'
     net = import_module('symbols.'+network)
     sym = net.get_symbol(num_classes = 1000,
                          image_shape = ','.join([str(i) for i in image_shape]),
@@ -65,7 +68,7 @@ def score(network, dev, batch_size, num_batches):
     return num_batches*batch_size/(time.time() - tic)
 
 if __name__ == '__main__':
-    networks = ['alexnet', 'vgg', 'inception-bn', 'inception-v3', 'resnet-50', 'resnet-152']
+    networks = ['alexnet', 'vgg-16', 'inception-bn', 'inception-v3', 'resnet-50', 'resnet-152']
     devs = [mx.gpu(0)] if len(get_gpus()) > 0 else []
     # Enable USE_MKL2017_EXPERIMENTAL for better CPU performance
     devs.append(mx.cpu())
diff --git a/example/image-classification/common/modelzoo.py b/example/image-classification/common/modelzoo.py
index 1fe14ca4fc..ce8fd5e0ed 100644
--- a/example/image-classification/common/modelzoo.py
+++ b/example/image-classification/common/modelzoo.py
@@ -16,7 +16,7 @@
 # under the License.
 
 import os
-from util import download_file
+from common.util import download_file
 
 _base_model_url = 'http://data.mxnet.io/models/'
 _default_model_info = {
diff --git a/example/image-classification/symbols/mobilenet.py b/example/image-classification/symbols/mobilenet.py
index 42b9636261..bf3de4a2c6 100644
--- a/example/image-classification/symbols/mobilenet.py
+++ b/example/image-classification/symbols/mobilenet.py
@@ -14,48 +14,130 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# -*- coding:utf-8 -*-
+'''
+mobilenet
+Suittable for image with around resolution x resolution, resolution is multiple of 32.
+
+Reference:
+MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
+https://arxiv.org/abs/1704.04861
+'''
+
+__author__ = 'qingzhouzhen'
+__date__ = '17/8/5'
+__modify__ = 'dwSun'
+__modified_date__ = '17/11/30'
+
 
 import mxnet as mx
 
-def Conv(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, name=None, suffix=''):
-    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, num_group=num_group, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
-    bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True)
-    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+alpha_values = [0.25, 0.50, 0.75, 1.0]
+
+
+def Conv(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, name='', suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, num_group=num_group, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' % (name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' % (name, suffix), fix_gamma=True)
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' % (name, suffix))
     return act
 
-def get_symbol(num_classes, **kwargs):
-    data = mx.symbol.Variable(name="data") # 224
-    conv_1 = Conv(data, num_filter=32, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1") # 224/112
-    conv_2_dw = Conv(conv_1, num_group=32, num_filter=32, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw") # 112/112
-    conv_2 = Conv(conv_2_dw, num_filter=64, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2") # 112/112
-    conv_3_dw = Conv(conv_2, num_group=64, num_filter=64, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_3_dw") # 112/56
-    conv_3 = Conv(conv_3_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_3") # 56/56
-    conv_4_dw = Conv(conv_3, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_4_dw") # 56/56
-    conv_4 = Conv(conv_4_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4") # 56/56
-    conv_5_dw = Conv(conv_4, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_5_dw") # 56/28
-    conv_5 = Conv(conv_5_dw, num_filter=256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_5") # 28/28
-    conv_6_dw = Conv(conv_5, num_group=256, num_filter=256, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_6_dw") # 28/28
-    conv_6 = Conv(conv_6_dw, num_filter=256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_6") # 28/28
-    conv_7_dw = Conv(conv_6, num_group=256, num_filter=256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_7_dw") # 28/14
-    conv_7 = Conv(conv_7_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_7") # 14/14
-
-    conv_8_dw = Conv(conv_7, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_8_dw") # 14/14
-    conv_8 = Conv(conv_8_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_8") # 14/14
-    conv_9_dw = Conv(conv_8, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_9_dw") # 14/14
-    conv_9 = Conv(conv_9_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_9") # 14/14
-    conv_10_dw = Conv(conv_9, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_10_dw") # 14/14
-    conv_10 = Conv(conv_10_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_10") # 14/14
-    conv_11_dw = Conv(conv_10, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_11_dw") # 14/14
-    conv_11 = Conv(conv_11_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_11") # 14/14
-    conv_12_dw = Conv(conv_11, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_12_dw") # 14/14
-    conv_12 = Conv(conv_12_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_12") # 14/14
-
-    conv_13_dw = Conv(conv_12, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_13_dw") # 14/7
-    conv_13 = Conv(conv_13_dw, num_filter=1024, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_13") # 7/7
-    conv_14_dw = Conv(conv_13, num_group=1024, num_filter=1024, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_14_dw") # 7/7
-    conv_14 = Conv(conv_14_dw, num_filter=1024, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_14") # 7/7
-
-    pool = mx.sym.Pooling(data=conv_14, kernel=(7, 7), stride=(1, 1), pool_type="avg", name="global_pool")
+
+def Conv_DPW(data, depth=1, stride=(1, 1), name='', idx=0, suffix=''):
+    conv_dw = Conv(data, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=stride, name="conv_%d_dw" % (idx), suffix=suffix)
+    conv = Conv(conv_dw, num_filter=depth * stride[0], kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_%d" % (idx), suffix=suffix)
+    return conv
+
+
+def get_symbol_compact(num_classes, alpha=1, resolution=224, **kwargs):
+    assert alpha in alpha_values, 'Invalid alpha={0}, must be one of {1}'.format(alpha, alpha_values)
+    assert resolution % 32 == 0, 'resolution must be multiple of 32'
+
+    base = int(32 * alpha)
+
+    data = mx.symbol.Variable(name="data")  # 224
+    conv_1 = Conv(data, num_filter=base, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1")  # 32*alpha, 224/112
+
+    conv_2_dw = Conv(conv_1, num_group=base, num_filter=base, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw")  # 112/112
+    conv_2 = Conv(conv_2_dw, num_filter=base * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2")  # 32*alpha, 112/112
+
+    conv_3_dpw = Conv_DPW(conv_2, depth=base * 2, stride=(2, 2), idx=3)  # 64*alpha, 112/56 => 56/56
+    conv_4_dpw = Conv_DPW(conv_3_dpw, depth=base * 4, stride=(1, 1), idx=4)  # 128*alpha, 56/56 =>56/56
+    conv_5_dpw = Conv_DPW(conv_4_dpw, depth=base * 4, stride=(2, 2), idx=5)  # 128*alpha, 56/28 => 28/28
+    conv_6_dpw = Conv_DPW(conv_5_dpw, depth=base * 8, stride=(1, 1), idx=6)  # 256*alpha, 28/28 => 28/28
+    conv_7_dpw = Conv_DPW(conv_6_dpw, depth=base * 8, stride=(2, 2), idx=7)  # 256*alpha, 28/14 => 14/14
+    conv_dpw = conv_7_dpw
+
+    for idx in range(8, 13):
+        conv_dpw = Conv_DPW(conv_dpw, depth=base * 16, stride=(1, 1), idx=idx)  # 512*alpha, 14/14
+
+    conv_12_dpw = conv_dpw
+    conv_13_dpw = Conv_DPW(conv_12_dpw, depth=base * 16, stride=(2, 2), idx=13)  # 512*alpha, 14/7 => 7/7
+    conv_14_dpw = Conv_DPW(conv_13_dpw, depth=base * 32, stride=(1, 1), idx=14)  # 1024*alpha, 7/7 => 7/7
+
+    pool_size = int(resolution / 32)
+    pool = mx.sym.Pooling(data=conv_14_dpw, kernel=(pool_size, pool_size), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc')
+    softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax')
+    return softmax
+
+
+def get_symbol(num_classes, alpha=1, resolution=224, **kwargs):
+    assert alpha in alpha_values, 'Invalid alpha=[{0}], must be one of [{1}]'.format(alpha, alpha_values)
+    assert resolution % 32 == 0, 'resolution must be multpile of 32'
+
+    base = int(32 * alpha)
+
+    data = mx.symbol.Variable(name="data")  # 224
+    depth = base  # 32*alpha
+    conv_1 = Conv(data, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1")  # 224/112
+
+    depth = base  # 32*alpha
+    conv_2_dw = Conv(conv_1, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw")  # 112/112
+    conv_2 = Conv(conv_2_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2")  # 112/112
+
+    depth = base * 2  # 64*alpha
+    conv_3_dw = Conv(conv_2, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_3_dw")  # 112/56
+    conv_3 = Conv(conv_3_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_3")  # 56/56
+
+    depth = base * 4  # 128*alpha
+    conv_4_dw = Conv(conv_3, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_4_dw")  # 56/56
+    conv_4 = Conv(conv_4_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4")  # 56/56
+
+    depth = base * 4  # 128*alpha
+    conv_5_dw = Conv(conv_4, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_5_dw")  # 56/28
+    conv_5 = Conv(conv_5_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_5")  # 28/28
+
+    depth = base * 8  # 256*alpha
+    conv_6_dw = Conv(conv_5, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_6_dw")  # 28/28
+    conv_6 = Conv(conv_6_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_6")  # 28/28
+
+    depth = base * 8  # 256*alpha
+    conv_7_dw = Conv(conv_6, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_7_dw")  # 28/14
+    conv_7 = Conv(conv_7_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_7")  # 14/14
+
+    depth = base * 16  # 512*alpha
+    conv_8_dw = Conv(conv_7, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_8_dw")  # 14/14
+    conv_8 = Conv(conv_8_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_8")  # 14/14
+    conv_9_dw = Conv(conv_8, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_9_dw")  # 14/14
+    conv_9 = Conv(conv_9_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_9")  # 14/14
+    conv_10_dw = Conv(conv_9, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_10_dw")  # 14/14
+    conv_10 = Conv(conv_10_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_10")  # 14/14
+    conv_11_dw = Conv(conv_10, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_11_dw")  # 14/14
+    conv_11 = Conv(conv_11_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_11")  # 14/14
+    conv_12_dw = Conv(conv_11, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_12_dw")  # 14/14
+    conv_12 = Conv(conv_12_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_12")  # 14/14
+
+    depth = base * 16  # 512*alpha
+    conv_13_dw = Conv(conv_12, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_13_dw")  # 14/7
+    conv_13 = Conv(conv_13_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_13")  # 7/7
+
+    depth = base * 32  # 1024*alpha
+    conv_14_dw = Conv(conv_13, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_14_dw")  # 7/7
+    conv_14 = Conv(conv_14_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_14")  # 7/7
+
+    pool_size = int(resolution / 32)
+    pool = mx.sym.Pooling(data=conv_14, kernel=(pool_size, pool_size), stride=(1, 1), pool_type="avg", name="global_pool")
     flatten = mx.sym.Flatten(data=pool, name="flatten")
     fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc')
     softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax')
diff --git a/example/model-parallel/lstm/README.md b/example/model-parallel/lstm/README.md
index 9acea85580..6f31ff8348 100644
--- a/example/model-parallel/lstm/README.md
+++ b/example/model-parallel/lstm/README.md
@@ -1,4 +1,13 @@
 Model Parallel LSTM
 ===================
+
 This is an example showing how to do model parallel LSTM in MXNet.
-Most of the code is duplicated with the rnn example, and should be eventually merged.
+
+We use [the PenTreeBank dataset](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/)
+in this example. Download the dataset with below command:
+
+`bash get_ptb_data.sh`
+
+This will download PenTreeBank dataset under `data` folder. Now, you can run the training as follows:
+
+`python lstm_ptb.py`
diff --git a/example/model-parallel/lstm/lstm.py b/example/model-parallel/lstm/lstm.py
index c24017ff0d..75fa533c78 100644
--- a/example/model-parallel/lstm/lstm.py
+++ b/example/model-parallel/lstm/lstm.py
@@ -84,7 +84,7 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
 
     last_hidden = []
     for seqidx in range(seq_len):
-        # embeding layer
+        # embedding layer
         with mx.AttrScope(ctx_group='embed'):
             data = mx.sym.Variable("t%d_data" % seqidx)
             hidden = mx.sym.Embedding(data=data, weight=embed_weight,
@@ -121,7 +121,13 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
                                            name="t%d_cls" % seqidx)
                 label = mx.sym.Variable("t%d_label" % seqidx)
                 if use_loss:
-                    sm = mx.sym.softmax_cross_entropy(fc, label, name="t%d_sm" % seqidx)
+                    # Currently softmax_cross_entropy fails https://github.com/apache/incubator-mxnet/issues/6874
+                    # So, workaround for now to fix this example
+                    out = mx.symbol.softmax(data=fc)
+                    label = mx.sym.Reshape(label, shape=(-1,1))
+                    ce = - mx.sym.broadcast_add(mx.sym.broadcast_mul(label, mx.sym.log(out)),
+                                              mx.sym.broadcast_mul((1 - label), mx.sym.log(1 - out)))
+                    sm = mx.sym.MakeLoss(ce,  name="t%d_sm" % seqidx)
                 else:
                     sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="t%d_sm" % seqidx)
                 out_prob.append(sm)
@@ -134,7 +140,13 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
                                        num_hidden=num_label)
             label = mx.sym.Variable("label")
             if use_loss:
-                sm = mx.sym.softmax_cross_entropy(fc, label, name="sm")
+                # Currently softmax_cross_entropy fails https://github.com/apache/incubator-mxnet/issues/6874
+                # So, workaround for now to fix this example
+                out = mx.symbol.softmax(data=fc)
+                label = mx.sym.Reshape(label, shape=(-1, 1))
+                ce = mx.sym.broadcast_add(mx.sym.broadcast_mul(label, mx.sym.log(out)),
+                                              mx.sym.broadcast_mul((1 - label), mx.sym.log(1 - out)))
+                sm = mx.sym.MakeLoss(ce,  name="sm")
             else:
                 sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="sm")
             out_prob = [sm]
@@ -208,7 +220,7 @@ def setup_rnn_model(default_ctx,
             if not name.startswith("t"):
                 print("%s group=%s, ctx=%s" % (name, group, str(ctx)))
 
-        #bind with shared executor
+        # bind with shared executor
         rnn_exec = None
         if max_len == bucket_key:
               rnn_exec = rnn_sym.bind(default_ctx, args=arg_arrays,
@@ -344,7 +356,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
             # update epoch counter
             epoch_counter += 1
             if epoch_counter % update_period == 0:
-                # updare parameters
+                # update parameters
                 norm = 0.
                 for idx, weight, grad, name in m.param_blocks:
                     grad /= batch_size
@@ -363,7 +375,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
                 else:
                     train_nll += calc_nll(seq_label_probs, batch_size, batch_seq_length)
             else:
-                train_nll += sum([x.asscalar() for x in seq_loss]) / batch_size
+                train_nll += sum([x.sum().asscalar() for x in seq_loss]) / batch_size
 
             nbatch += batch_size
             toc = time.time()
@@ -405,7 +417,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
                 else:
                     val_nll += calc_nll(seq_label_probs, batch_size, batch_seq_length)
             else:
-                val_nll += sum([x.asscalar() for x in seq_loss]) / batch_size
+                val_nll += sum([x.sum().asscalar() for x in seq_loss]) / batch_size
             nbatch += batch_size
 
         perp = np.exp(val_nll / nbatch)
diff --git a/example/model-parallel/lstm/lstm_ptb.py b/example/model-parallel/lstm/lstm_ptb.py
index 0141338329..965ba1950b 100644
--- a/example/model-parallel/lstm/lstm_ptb.py
+++ b/example/model-parallel/lstm/lstm_ptb.py
@@ -22,7 +22,7 @@
 import mxnet as mx
 import numpy as np
 # reuse the bucket_io library
-sys.path.insert(0, "../rnn")
+sys.path.insert(0, "../../rnn/old")
 from bucket_io import BucketSentenceIter, default_build_vocab
 
 """
diff --git a/example/model-parallel/matrix_factorization/README.md b/example/model-parallel/matrix_factorization/README.md
new file mode 100644
index 0000000000..00507d924f
--- /dev/null
+++ b/example/model-parallel/matrix_factorization/README.md
@@ -0,0 +1,22 @@
+Model Parallel Matrix Factorization
+===================================
+
+This example walks you through a matrix factorization algorithm for recommendations and also
+demonstrates the basic usage of `group2ctxs` in `Module`, which allows one part of the model to be
+trained on cpu and the other on gpu. So, it is necessary to have GPUs available on the machine
+to run this example.
+
+To run this example, first make sure you download a dataset of 10 million movie ratings available
+from [the MovieLens project](http://files.grouplens.org/datasets/movielens/) by running following command:
+
+`python get_data.py`
+
+This will download MovieLens 10M dataset under ml-10M100K folder. Now, you can run the training as follows:
+
+`python train.py --num-gpus 1`
+
+You can also specify other attributes such as num-epoch, batch-size,
+factor-size (output dim of the embedding operation) to train.py.
+
+While training you will be able to see the usage of ctx_group attribute to divide the operators
+into different groups corresponding to different CPU/GPU devices.
diff --git a/example/model-parallel/matrix_factorization/model.py b/example/model-parallel/matrix_factorization/model.py
index f4004d1a65..16cd9b3144 100644
--- a/example/model-parallel/matrix_factorization/model.py
+++ b/example/model-parallel/matrix_factorization/model.py
@@ -32,6 +32,7 @@ def matrix_fact_model_parallel_net(factor_size, num_hidden, max_user, max_item):
         item_weight = mx.symbol.Variable('item_weight')
         item = mx.symbol.Embedding(data=item, weight=item_weight,
                                    input_dim=max_item, output_dim=factor_size)
+
     # set ctx_group attribute to 'dev2' for the symbols created in this scope,
     # the symbols will be bound to the context that 'dev2' map to in group2ctxs
     with mx.AttrScope(ctx_group='dev2'):
@@ -45,7 +46,7 @@ def matrix_fact_model_parallel_net(factor_size, num_hidden, max_user, max_item):
         fc_item_weight = mx.symbol.Variable('fc_item_weight')
         fc_item_bias = mx.symbol.Variable('fc_item_bias')
         item = mx.symbol.FullyConnected(data=item, weight=fc_item_weight, bias=fc_item_bias, num_hidden=num_hidden)
-        # predict by the inner product, which is elementwise product and then sum
+        # predict by the inner product, which is element-wise product and then sum
         pred = user * item
         pred = mx.symbol.sum(data=pred, axis=1)
         pred = mx.symbol.Flatten(data=pred)
diff --git a/example/model-parallel/matrix_factorization/readme.md b/example/model-parallel/matrix_factorization/readme.md
deleted file mode 100644
index 5d724aec30..0000000000
--- a/example/model-parallel/matrix_factorization/readme.md
+++ /dev/null
@@ -1,6 +0,0 @@
-Model Parallel Matrix Factorization
-==============
-
-The example demonstrates the basic usage of `group2ctxs` in `Module`, which allows one part of the model trained on cpu and the other on gpu.
-
-- `python matrix_factorization_model_parallel.py --num-gpus 2`
diff --git a/example/model-parallel/matrix_factorization/train.py b/example/model-parallel/matrix_factorization/train.py
index 7a2073bf3a..591dab3a65 100644
--- a/example/model-parallel/matrix_factorization/train.py
+++ b/example/model-parallel/matrix_factorization/train.py
@@ -21,7 +21,7 @@
 import mxnet as mx
 import numpy as np
 from get_data import get_movielens_iter, get_movielens_data
-from matrix_fact_parallel_model import matrix_fact_model_parallel_net
+from model import matrix_fact_model_parallel_net
 
 
 logging.basicConfig(level=logging.DEBUG)
@@ -77,10 +77,13 @@
     # construct the module
     # map the ctx_group attribute to the context assignment
     group2ctxs={'dev1':[mx.cpu()]*num_gpus, 'dev2':[mx.gpu(i) for i in range(num_gpus)]}
+
+    # Creating a module by passing group2ctxs attribute which maps
+    # the ctx_group attribute to the context assignment
     mod = mx.module.Module(symbol=net, context=[mx.cpu()]*num_gpus, data_names=['user', 'item'],
         label_names=['score'], group2ctxs=group2ctxs)
     
-    # the initializer uesd to initialize the parameters
+    # the initializer used to initialize the parameters
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
     
     # the parameters for the optimizer constructor
diff --git a/example/multi-task/README.md b/example/multi-task/README.md
index 698d6f468e..9034814c3b 100644
--- a/example/multi-task/README.md
+++ b/example/multi-task/README.md
@@ -5,4 +5,6 @@ This is a simple example to show how to use mxnet for multi-task learning. It us
 ## Usage
 First, you need to write a multi-task iterator on your own. The iterator needs to generate multiple labels according to your applications, and the label names should be specified in the `provide_label` function, which needs to be consist with the names of output layers. 
 
-Then, if you want to show metrics of different tasks separately, you need to write your own metric class and specify the `num` parameter. In the `update` function of metric, calculate the metrics separately for different tasks. 
+Then, if you want to show metrics of different tasks separately, you need to write your own metric class and specify the `num` parameter. In the `update` function of metric, calculate the metrics separately for different tasks.
+
+The example script uses gpu as device by default; if gpu is not available in your environment, you can change `device` to be `mx.cpu()`.
diff --git a/example/multi-task/example_multi_task.py b/example/multi-task/example_multi_task.py
index ec5ece985c..9ea9ad0173 100644
--- a/example/multi-task/example_multi_task.py
+++ b/example/multi-task/example_multi_task.py
@@ -18,7 +18,6 @@
 # pylint: skip-file
 import sys
 import os
-sys.path.insert(0, "../../python/")
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, "../../tests/python/common"))
 from get_data import MNISTIterator
@@ -148,18 +147,17 @@ def get_name_value(self):
 val = Multi_mnist_iterator(val)
 
 
-model = mx.model.FeedForward(
-    ctx                = device,
+model = mx.mod.Module(
+    context            = device,
     symbol             = network,
-    num_epoch          = num_epochs,
-    learning_rate      = lr,
-    momentum           = 0.9,
-    wd                 = 0.00001,
-    initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34))
+    label_names        = ('softmax1_label', 'softmax2_label'))
 
 model.fit(
-    X                  = train,
+    train_data         = train,
     eval_data          = val,
     eval_metric        = Multi_Accuracy(num=2),
+    num_epoch          = num_epochs,
+    optimizer_params   = (('learning_rate', lr), ('momentum', 0.9), ('wd', 0.00001)),
+    initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34),
     batch_end_callback = mx.callback.Speedometer(batch_size, 50))
 
diff --git a/example/neural-style/end_to_end/README.md b/example/neural-style/end_to_end/README.md
index 2f19bf51ab..4a228c199b 100644
--- a/example/neural-style/end_to_end/README.md
+++ b/example/neural-style/end_to_end/README.md
@@ -1,20 +1,17 @@
 # End to End Neural Art
 
-This is an implementation of blog: [http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html](http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html)
-
-
-We will release a Multi-GPU training code soon.
+Please refer to this [blog](http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html) for details of how it is implemented.
 
 ## How to use
 
 
-1. First use `download.sh` to download pre-trained model and sample inputs
+1. First use `../download.sh` to download pre-trained model and sample inputs.
 
-2. Then prepare training dataset according to the blog
+2. Prepare training dataset. Put image samples into `../data/` (one file for each image sample). The pretrained model here was trained by 26k images sampled from [MIT Place dataset](http://places.csail.mit.edu).
 
-3. Modify [boost_train.py](boost_train.py)
+3. Use `boost_train.py` for training.
 
 ## Pretrained Model
 
-Weight [https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip](https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip)
-Inference [boost_inference.py](boost_inference.py)
+- Model: [https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip](https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip)
+- Inference script: `boost_inference.py`
diff --git a/example/neural-style/end_to_end/basic.py b/example/neural-style/end_to_end/basic.py
index 1763e884b9..eae64a6e68 100644
--- a/example/neural-style/end_to_end/basic.py
+++ b/example/neural-style/end_to_end/basic.py
@@ -14,10 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-import sys
-sys.path.insert(0, "../../mxnet/python/")
-
 import mxnet as mx
 import numpy as np
 import model_vgg19 as vgg
diff --git a/example/neural-style/end_to_end/boost_inference.py b/example/neural-style/end_to_end/boost_inference.py
index 0ec8308f30..86ab000b08 100644
--- a/example/neural-style/end_to_end/boost_inference.py
+++ b/example/neural-style/end_to_end/boost_inference.py
@@ -14,10 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-import sys
-sys.path.insert(0, "../mxnet/python")
-
 import mxnet as mx
 import numpy as np
 
@@ -31,8 +27,6 @@
 model_prefix = "./model/"
 ctx = mx.gpu(0)
 
-
-
 # generator
 gens = [gen_v4.get_module("g0", dshape, ctx),
         gen_v3.get_module("g1", dshape, ctx),
@@ -41,15 +35,10 @@
 for i in range(len(gens)):
     gens[i].load_params("./model/%d/v3_0002-0026000.params" % i)
 
-content_np = data_processing.PreprocessContentImage("../IMG_4343.jpg", min(dshape[2:]), dshape)
+content_np = data_processing.PreprocessContentImage("../input/IMG_4343.jpg", min(dshape[2:]), dshape)
 data = [mx.nd.array(content_np)]
 for i in range(len(gens)):
     gens[i].forward(mx.io.DataBatch([data[-1]], [0]), is_train=False)
     new_img = gens[i].get_outputs()[0]
     data.append(new_img.copyto(mx.cpu()))
     data_processing.SaveImage(new_img.asnumpy(), "out_%d.jpg" % i)
-
-
-import os
-os.system("rm -rf out.zip")
-os.system("zip out.zip out_*")
diff --git a/example/neural-style/end_to_end/boost_train.py b/example/neural-style/end_to_end/boost_train.py
index fa525e7e52..4f25b4304c 100644
--- a/example/neural-style/end_to_end/boost_train.py
+++ b/example/neural-style/end_to_end/boost_train.py
@@ -14,10 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-import sys
-sys.path.insert(0, "../../mxnet/python")
-
 import mxnet as mx
 import numpy as np
 
@@ -27,7 +23,7 @@
 import gen_v4
 
 # params
-vgg_params = mx.nd.load("./vgg19.params")
+vgg_params = mx.nd.load("../model/vgg19.params")
 style_weight = 1.2
 content_weight = 10
 dshape = (1, 3, 384, 384)
@@ -36,7 +32,7 @@
 ctx = mx.gpu(0)
 
 # init style
-style_np = data_processing.PreprocessStyleImage("../starry_night.jpg", shape=dshape)
+style_np = data_processing.PreprocessStyleImage("../input/starry_night.jpg", shape=dshape)
 style_mod = basic.get_style_module("style", dshape, ctx, vgg_params)
 style_mod.forward(mx.io.DataBatch([mx.nd.array(style_np)], [0]), is_train=False)
 style_array = [arr.copyto(mx.cpu()) for arr in style_mod.get_outputs()]
@@ -119,7 +115,10 @@ def get_tv_grad_executor(img, ctx, tv_weight):
         loss_grad_array = []
         data_array = []
         path = data_root + file_list[idx]
-        content_np = data_processing.PreprocessContentImage(path, min(dshape[2:]), dshape)
+        try:
+            content_np = data_processing.PreprocessContentImage(path, min(dshape[2:]), dshape)
+        except:
+            logging.warn("Fail to load an input image. Skip.")
         data = mx.nd.array(content_np)
         data_array.append(data)
         # get content
diff --git a/example/neural-style/end_to_end/gen_v3.py b/example/neural-style/end_to_end/gen_v3.py
index 7962e68da2..a11e5989b0 100644
--- a/example/neural-style/end_to_end/gen_v3.py
+++ b/example/neural-style/end_to_end/gen_v3.py
@@ -18,14 +18,6 @@
 
 # coding: utf-8
 
-# In[1]:
-
-import sys
-sys.path.insert(0, "../../mxnet/python")
-
-
-# In[2]:
-
 import mxnet as mx
 import numpy as np
 
@@ -48,8 +40,6 @@ def Deconv(data, num_filter, im_hw, kernel=(7, 7), pad=(2, 2), stride=(2, 2), cr
         sym = mx.sym.Activation(sym, act_type="tanh")
     return sym
 
-# In[70]:
-
 def get_generator(prefix, im_hw):
     data = mx.sym.Variable("%s_data" % prefix)
     conv1 = Conv(data, 64) # 192
diff --git a/example/neural-style/end_to_end/gen_v4.py b/example/neural-style/end_to_end/gen_v4.py
index fb4e6d1e16..30f534cd76 100644
--- a/example/neural-style/end_to_end/gen_v4.py
+++ b/example/neural-style/end_to_end/gen_v4.py
@@ -18,14 +18,6 @@
 
 # coding: utf-8
 
-# In[1]:
-
-import sys
-sys.path.insert(0, "../mxnet/python")
-
-
-# In[2]:
-
 import mxnet as mx
 import numpy as np
 
@@ -46,8 +38,6 @@ def Deconv(data, num_filter, kernel=(6, 6), pad=(2, 2), stride=(2, 2), out=False
         sym = mx.sym.Activation(sym, act_type="tanh")
     return sym
 
-# In[70]:
-
 def get_generator(prefix, im_hw):
     data = mx.sym.Variable("%s_data" % prefix)
 
diff --git a/example/numpy-ops/custom_softmax.py b/example/numpy-ops/custom_softmax.py
index 82f491e458..a2ec5d54b7 100644
--- a/example/numpy-ops/custom_softmax.py
+++ b/example/numpy-ops/custom_softmax.py
@@ -82,10 +82,12 @@ def create_operator(self, ctx, shapes, dtypes):
 logging.basicConfig(level=logging.DEBUG)
 
 # MXNET_CPU_WORKER_NTHREADS must be greater than 1 for custom op to work on CPU
-model = mx.model.FeedForward(
-    ctx = mx.cpu(0), symbol = mlp, num_epoch = 20,
-    learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
+context=mx.cpu()
+# Uncomment this line to train on GPU
+# context=mx.gpu(0)
 
-model.fit(X=train, eval_data=val,
-          batch_end_callback=mx.callback.Speedometer(100,100))
+mod = mx.mod.Module(mlp, context=context)
 
+mod.fit(train_data=train, eval_data=val, optimizer='sgd',
+    optimizer_params={'learning_rate':0.1, 'momentum': 0.9, 'wd': 0.00001},
+    num_epoch=10, batch_end_callback=mx.callback.Speedometer(100, 100))
diff --git a/example/numpy-ops/numpy_softmax.py b/example/numpy-ops/numpy_softmax.py
index cbcb7787ae..c10dfe3779 100644
--- a/example/numpy-ops/numpy_softmax.py
+++ b/example/numpy-ops/numpy_softmax.py
@@ -76,9 +76,13 @@ def backward(self, out_grad, in_data, out_data, in_grad):
 
 logging.basicConfig(level=logging.DEBUG)
 
-model = mx.model.FeedForward(
-    ctx = mx.cpu(), symbol = mlp, num_epoch = 20,
-    learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
+# MXNET_CPU_WORKER_NTHREADS must be greater than 1 for custom op to work on CPU
+context=mx.cpu()
+# Uncomment this line to train on GPU instead of CPU
+# context=mx.gpu(0)
 
-model.fit(X=train, eval_data=val)
+mod = mx.mod.Module(mlp, context=context)
 
+mod.fit(train_data=train, eval_data=val, optimizer='sgd',
+    optimizer_params={'learning_rate':0.1, 'momentum': 0.9, 'wd': 0.00001},
+    num_epoch=10, batch_end_callback=mx.callback.Speedometer(100, 100))
diff --git a/example/numpy-ops/weighted_logistic_regression.py b/example/numpy-ops/weighted_logistic_regression.py
index 26b5fb2fda..4062495e86 100644
--- a/example/numpy-ops/weighted_logistic_regression.py
+++ b/example/numpy-ops/weighted_logistic_regression.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import numpy as np
 import mxnet as mx
 
@@ -26,7 +25,8 @@ def __init__(self, pos_grad_scale, neg_grad_scale):
     def forward(self, is_train, req, in_data, out_data, aux):
         self.assign(out_data[0], req[0], mx.nd.divide(1, (1 + mx.nd.exp(- in_data[0]))))
     def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
-        in_grad[0][:] = ((out_data[0] - 1) * in_data[1] * self.pos_grad_scale + out_data[0] * (1 - in_data[1]) * self.neg_grad_scale) / out_data[0].shape[1]
+        in_grad[0][:] = ((out_data[0] - 1) * in_data[1] * self.pos_grad_scale
+                         + out_data[0] * (1 - in_data[1]) * self.neg_grad_scale) / out_data[0].shape[1]
 
 @mx.operator.register("weighted_logistic_regression")
 class WeightedLogisticRegressionProp(mx.operator.CustomOpProp):
@@ -48,23 +48,39 @@ def create_operator(self, ctx, shapes, dtypes):
     m, n = 2, 5
     pos, neg = 1, 0.1
     data = mx.sym.Variable('data')
-    wlr = mx.sym.Custom(data, pos_grad_scale = pos, neg_grad_scale = neg, name = 'wlr', op_type = 'weighted_logistic_regression')
-    lr = mx.sym.LogisticRegressionOutput(data, name = 'lr')
-    exe1 = wlr.simple_bind(ctx = mx.gpu(1), data = (2 * m, n))
-    exe2 = lr.simple_bind(ctx = mx.gpu(1), data = (2 * m, n))
+
+    wlr = mx.sym.Custom(data, pos_grad_scale=pos, neg_grad_scale=neg, name='wlr',
+                        op_type='weighted_logistic_regression')
+    lr = mx.sym.LogisticRegressionOutput(data, name='lr')
+
+    # MXNET_CPU_WORKER_NTHREADS must be greater than 1 for custom op to work on CPU
+    context = mx.cpu()
+    # Uncomment this line to compute on GPU
+    # context=mx.gpu(0)
+
+    exe1 = wlr.simple_bind(ctx=context, data=(2 * m, n))
+    exe2 = lr.simple_bind(ctx=context, data=(2 * m, n))
+
     exe1.arg_dict['data'][:] = np.ones([2 * m, n])
     exe2.arg_dict['data'][:] = np.ones([2 * m, n])
+
     exe1.arg_dict['wlr_label'][:] = np.vstack([np.ones([m, n]), np.zeros([m, n])])
     exe2.arg_dict['lr_label'][:] = np.vstack([np.ones([m, n]), np.zeros([m, n])])
-    exe1.forward(is_train = True)
-    exe2.forward(is_train = True)
-    print('wlr output:')
+
+    exe1.forward(is_train=True)
+    exe2.forward(is_train=True)
+
+    print('Weighted Logistic Regression output:')
     print(exe1.outputs[0].asnumpy())
-    print('lr output:')
+
+    print('Logistic Regression output:')
     print(exe2.outputs[0].asnumpy())
+
     exe1.backward()
     exe2.backward()
-    print('wlr grad:')
+
+    print('Weighted Logistic Regression gradients:')
     print(exe1.grad_dict['data'].asnumpy())
-    print('lr grad:')
+
+    print('Logistic Regression gradients:')
     print(exe2.grad_dict['data'].asnumpy())
diff --git a/example/profiler/README.md b/example/profiler/README.md
new file mode 100644
index 0000000000..7d3c42b629
--- /dev/null
+++ b/example/profiler/README.md
@@ -0,0 +1,23 @@
+# MXNet Profiler Examples
+
+This folder contains examples of using MXNet profiler to generate profiling results in json files.
+Please refer to [this link](http://mxnet.incubator.apache.org/faq/perf.html?highlight=profiler#profiler)
+for visualizing profiling results and make sure that you have installed a version of MXNet compiled
+with `USE_PROFILER=1`.
+
+- profiler_executor.py. To run this example, simply type `python profiler_executor.py` in terminal.
+It will generate a json file named `profile_executor_5iter.json`.
+
+- profiler_imageiter.py. You first need to create a file named `test.rec`,
+which is an image dataset file before running this example.
+Please follow
+[this tutorial](https://mxnet.incubator.apache.org/faq/recordio.html?highlight=rec%20file#create-a-dataset-using-recordio)
+on how to create `.rec` files using an existing tool in MXNet. After you created 'test.rec',
+type `python profiler_imageiter.py` in terminal. It will generate `profile_imageiter.json`.
+
+- profiler_matmul.py. This example profiles matrix multiplications on GPU. Please make sure
+that you have installed a GPU enabled version of MXNet before running this example. Type
+`python profiler_matmul.py` and it will generate `profile_matmul_20iter.json`.
+
+- profiler_ndarray.py. This example profiles a series of `NDArray` operations. Simply type
+`python profiler_ndarray.py` in terminal and it will generate `profile_ndarray.json`.
\ No newline at end of file
diff --git a/example/profiler/profiler_executor.py b/example/profiler/profiler_executor.py
index 26e3e1ba2a..117a8df492 100644
--- a/example/profiler/profiler_executor.py
+++ b/example/profiler/profiler_executor.py
@@ -17,7 +17,7 @@
 
 import mxnet as mx
 import argparse
-import os, sys
+import os
 import time
 import numpy as np
 from mxnet import profiler
diff --git a/example/profiler/profiler_imageiter.py b/example/profiler/profiler_imageiter.py
index e16b9b7de4..77ca412358 100644
--- a/example/profiler/profiler_imageiter.py
+++ b/example/profiler/profiler_imageiter.py
@@ -15,16 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import print_function
 import os
 # uncomment to set the number of worker threads.
 # os.environ["MXNET_CPU_WORKER_NTHREADS"] = "4"
-from __future__ import print_function
 import time
 import mxnet as mx
-import numpy as np
 
 
-def run_imageiter(path_rec, n, batch_size = 32):
+def run_imageiter(path_rec, n, batch_size=32):
 
     data = mx.img.ImageIter(batch_size=batch_size,
                             data_shape=(3, 224, 224),
@@ -39,6 +38,7 @@ def run_imageiter(path_rec, n, batch_size = 32):
     mx.nd.waitall()
     print(batch_size*n/(time.time() - tic))
 
+
 if __name__ == '__main__':
     mx.profiler.profiler_set_config(mode='all', filename='profile_imageiter.json')
     mx.profiler.profiler_set_state('run')
diff --git a/example/profiler/profiler_matmul.py b/example/profiler/profiler_matmul.py
index 1b1cf74f41..a23545cb06 100644
--- a/example/profiler/profiler_matmul.py
+++ b/example/profiler/profiler_matmul.py
@@ -18,9 +18,8 @@
 from __future__ import print_function
 import mxnet as mx
 import argparse
-import os, sys
 import time
-import numpy as np
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Set network parameters for benchmark test.')
@@ -30,18 +29,18 @@ def parse_args():
     parser.add_argument('--end_profiling_iter', type=int, default=70)
     return parser.parse_args()
 
+
 args = parse_args()
 
 if __name__ == '__main__':
     mx.profiler.profiler_set_config(mode='symbolic', filename=args.profile_filename)
     print('profile file save to {0}'.format(args.profile_filename))
 
-
     A = mx.sym.Variable('A')
     B = mx.sym.Variable('B')
     C = mx.symbol.dot(A, B)
 
-    executor = C.simple_bind(mx.gpu(1), 'write', A=(4096, 4096), B=(4096, 4096))
+    executor = C.simple_bind(mx.gpu(0), 'write', A=(4096, 4096), B=(4096, 4096))
 
     a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
     b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
diff --git a/example/profiler/profiler_ndarray.py b/example/profiler/profiler_ndarray.py
index 67ea87b1ed..5c233c64ed 100644
--- a/example/profiler/profiler_ndarray.py
+++ b/example/profiler/profiler_ndarray.py
@@ -82,6 +82,7 @@ def random_ndarray(dim):
     data = mx.nd.array(np.random.uniform(-10, 10, shape))
     return data
 
+
 def test_ndarray_elementwise():
     np.random.seed(0)
     nrepeat = 10
@@ -99,6 +100,7 @@ def test_ndarray_elementwise():
             check_with_uniform(mx.nd.square, 1, dim, np.square, rmin=0)
             check_with_uniform(lambda x: mx.nd.norm(x).asscalar(), 1, dim, np.linalg.norm)
 
+
 def test_ndarray_negate():
     npy = np.random.uniform(-10, 10, (2,3,4))
     arr = mx.nd.array(npy)
@@ -170,6 +172,7 @@ def test_ndarray_scalar():
     d = -c + 2
     assert(np.sum(d.asnumpy()) < 1e-5)
 
+
 def test_ndarray_pickle():
     np.random.seed(0)
     maxdim = 5
@@ -222,8 +225,7 @@ def test_ndarray_slice():
 
 def test_ndarray_slice_along_axis():
     arr = mx.nd.array(np.random.uniform(-10, 10, (3, 4, 2, 3)))
-    sub_arr = mx.nd.zeros((3, 2, 2, 3))
-    arr._copy_slice_to(1, 1, 3, sub_arr)
+    sub_arr = arr.slice(begin=(None, 1), end=(None, 3))
 
     # test we sliced correctly
     assert same(arr.asnumpy()[:, 1:3, :, :], sub_arr.asnumpy())
@@ -242,6 +244,7 @@ def test_clip():
         assert B1[i] >= -2
         assert B1[i] <= 2
 
+
 def test_dot():
     a = np.random.uniform(-3, 3, (3, 4))
     b = np.random.uniform(-3, 3, (4, 5))
@@ -251,8 +254,10 @@ def test_dot():
     C = mx.nd.dot(A, B)
     assert reldiff(c, C.asnumpy()) < 1e-5
 
+
 def test_reduce():
     sample_num = 200
+
     def test_reduce_inner(numpy_reduce_func, nd_reduce_func):
         for i in range(sample_num):
             ndim = np.random.randint(1, 6)
@@ -285,8 +290,10 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func):
     test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.min),
                       mx.nd.min)
 
+
 def test_broadcast():
     sample_num = 1000
+
     def test_broadcast_to():
         for i in range(sample_num):
             ndim = np.random.randint(1, 6)
@@ -307,6 +314,7 @@ def test_broadcast_to():
             assert err < 1E-8
     test_broadcast_to()
 
+
 if __name__ == '__main__':
     mx.profiler.profiler_set_config(mode='all', filename='profile_ndarray.json')
     mx.profiler.profiler_set_state('run')
diff --git a/example/python-howto/README.md b/example/python-howto/README.md
index 2499c2ab07..29652408e0 100644
--- a/example/python-howto/README.md
+++ b/example/python-howto/README.md
@@ -1,15 +1,17 @@
 Python Howto Examples
 =====================
-* [Configuring Net to get Multiple Ouputs](multiple_outputs.py)
+
+* [Configuring Net to Get Multiple Outputs](multiple_outputs.py)
 * [Configuring Image Record Iterator](data_iter.py)
+* [Monitor Intermediate Outputs in the Network](monitor_weights.py)
 * Set break point in C++ code of the symbol using gdb under Linux:
 
 	* 	Build mxnet with following values:
 
 		 ```
 		 	DEBUG=1 
-		 	CUDA=0 #to make sure convolution-inl.h will be used 
-		 	CUDNN=0 #to make sure convolution-inl.h will be used 
+		 	USE_CUDA=0 # to make sure convolution-inl.h will be used
+		 	USE_CUDNN=0 # to make sure convolution-inl.h will be used
 		 ```
 		 
 	*  run python under gdb:  ```gdb --args python debug_conv.py```
diff --git a/example/python-howto/monitor_weights.py b/example/python-howto/monitor_weights.py
index a8b255196d..ab77b4908b 100644
--- a/example/python-howto/monitor_weights.py
+++ b/example/python-howto/monitor_weights.py
@@ -25,6 +25,7 @@
 import numpy as np
 import logging
 
+# network
 data = mx.symbol.Variable('data')
 fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
 act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
@@ -34,20 +35,16 @@
 mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
 
 # data
-
 train, val = MNISTIterator(batch_size=100, input_shape = (784,))
 
-# train
-
-logging.basicConfig(level=logging.DEBUG)
-
-model = mx.model.FeedForward(
-    ctx = mx.cpu(), symbol = mlp, num_epoch = 20,
-    learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
-
+# monitor
 def norm_stat(d):
     return mx.nd.norm(d)/np.sqrt(d.size)
 mon = mx.mon.Monitor(100, norm_stat)
-model.fit(X=train, eval_data=val, monitor=mon,
-          batch_end_callback = mx.callback.Speedometer(100, 100))
 
+# train with monitor
+logging.basicConfig(level=logging.DEBUG)
+module = mx.module.Module(context=mx.cpu(), symbol=mlp)
+module.fit(train_data=train, eval_data=val, monitor=mon, num_epoch=2,
+           batch_end_callback = mx.callback.Speedometer(100, 100),
+           optimizer_params=(('learning_rate', 0.1), ('momentum', 0.9), ('wd', 0.00001)))
diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index 282a1aebe9..dbf2a423c3 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -29,10 +29,8 @@ MXNet engines and parallelization for object detection.
 * If you value simplicity. Technical details are *very complicated* in MXNet.
   This is by design to attain maximum possible performance instead of patching fixes after fixes.
   Performance and parallelization are more than a change of parameter.
-* If you want to do CPU training, be advised that it has not been verified yet.
-  You will not encounter NOT_IMPLEMENTED_ERROR so it is still possible.
-* If you are on Windows or Python3, some people reported it was possible with some modifications.
-  But they have disappeared.
+* If you want to do CPU training, be advised that it has not been verified properly yet. You can change the `ctx` variable in `train_end2end.py` or `train_alternate.py` scripts to `mx.cpu` and run these scripts directly to test it.
+* If you are on Windows some people reported it was possible with some modifications. But they have disappeared.
 
 ## Experiments
 | Method | Network | Training Data | Testing Data | Reference | Result |
@@ -48,18 +46,22 @@ MXNet engines and parallelization for object detection.
 The above experiments were conducted at [mx-rcnn](https://github.com/precedenceguo/mx-rcnn/tree/6a1ab0eec5035a10a1efb5fc8c9d6c54e101b4d0)
 using [a MXNet fork, based on MXNet 0.9.1 nnvm pre-release](https://github.com/precedenceguo/mxnet/tree/simple).
 
-## I'm Feeling Lucky
+## Quickstart
 * Prepare: `bash script/additional_deps.sh`
 * Download training data: `bash script/get_voc.sh`
 * Download pretrained model: `bash script/get_pretrained_model.sh`
-* Training and testing: `bash script/vgg_voc07.sh 0,1` (use gpu 0 and 1)
+* Training and testing: `bash script/vgg_voc07.sh 0,1` (this means to use gpu 0 and 1)
+
+## Prerequisites
+* Pip, Python-dev, Unzip
+* Some python packages are required: Cython, Scikit-image, Easydict, Matplotlib, OpenCV, Future
+* On debian, you can usually run `sudo apt install python-pip python-dev unzip`
+* And the python packages can be installed by running `sudo pip install cython scikit-image easydict matplotlib opencv-python future`. Note that you may have to remove sudo depending on how your mxnet package is installed.
+* MXNet version v0.9.5 or higher with Python interface installed. Open `python` and type `import mxnet` to confirm.
 
 ## Getting started
-See if `bash script/additional_deps.sh` will do the following for you.
 * Suppose `HOME` represents where this file is located. All commands, unless stated otherwise, should be started from `HOME`.
-* Install python package `cython easydict matplotlib scikit-image`.
-* Install MXNet version v0.9.5 or higher and MXNet Python Interface. Open `python` type `import mxnet` to confirm.
-* Run `make` in `HOME`.
+* Ensure that `bash script/additional_deps.sh` installs all prerequisites listed above. If you're not using this script, ensure the above prerequisites are present on your system and then run `make` from `HOME`. This builds the cython extensions and installs python bindings for them.
 
 Command line arguments have the same meaning as in mxnet/example/image-classification.
 * `prefix` refers to the first part of a saved model file name and `epoch` refers to a number in this file name.
diff --git a/example/rcnn/rcnn/core/tester.py b/example/rcnn/rcnn/core/tester.py
index 651b2a945e..a451883f58 100644
--- a/example/rcnn/rcnn/core/tester.py
+++ b/example/rcnn/rcnn/core/tester.py
@@ -15,13 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import cPickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import os
 import time
 import mxnet as mx
 import numpy as np
+from builtins import range
 
-from module import MutableModule
+from .module import MutableModule
 from rcnn.logger import logger
 from rcnn.config import config
 from rcnn.io import image
@@ -110,12 +114,12 @@ def generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.):
 
     rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl')
     with open(rpn_file, 'wb') as f:
-        cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL)
+        pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL)
 
     if thresh > 0:
         full_rpn_file = os.path.join(rpn_folder, imdb.name + '_full_rpn.pkl')
         with open(full_rpn_file, 'wb') as f:
-            cPickle.dump(original_boxes, f, cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(original_boxes, f, pickle.HIGHEST_PROTOCOL)
 
     logger.info('wrote rpn proposals to %s' % rpn_file)
     return imdb_boxes
@@ -168,8 +172,8 @@ def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3):
     # all detections are collected into:
     #    all_boxes[cls][image] = N x 5 array of detections in
     #    (x1, y1, x2, y2, score)
-    all_boxes = [[[] for _ in xrange(num_images)]
-                 for _ in xrange(imdb.num_classes)]
+    all_boxes = [[[] for _ in range(num_images)]
+                 for _ in range(imdb.num_classes)]
 
     i = 0
     t = time.time()
@@ -211,7 +215,7 @@ def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3):
 
     det_file = os.path.join(imdb.cache_path, imdb.name + '_detections.pkl')
     with open(det_file, 'wb') as f:
-        cPickle.dump(all_boxes, f, protocol=cPickle.HIGHEST_PROTOCOL)
+        pickle.dump(all_boxes, f, protocol=pickle.HIGHEST_PROTOCOL)
 
     imdb.evaluate_detections(all_boxes)
 
diff --git a/example/rcnn/rcnn/cython/setup.py b/example/rcnn/rcnn/cython/setup.py
index e50478b2d9..4ab6f75cc6 100644
--- a/example/rcnn/rcnn/cython/setup.py
+++ b/example/rcnn/rcnn/cython/setup.py
@@ -67,7 +67,7 @@ def locate_cuda():
     cudaconfig = {'home':home, 'nvcc':nvcc,
                   'include': pjoin(home, 'include'),
                   'lib64': pjoin(home, 'lib64')}
-    for k, v in cudaconfig.iteritems():
+    for k, v in cudaconfig.items():
         if not os.path.exists(v):
             raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
 
diff --git a/example/rcnn/rcnn/dataset/__init__.py b/example/rcnn/rcnn/dataset/__init__.py
index 1a706e9e0c..80fcc32c21 100644
--- a/example/rcnn/rcnn/dataset/__init__.py
+++ b/example/rcnn/rcnn/dataset/__init__.py
@@ -15,6 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from imdb import IMDB
-from pascal_voc import PascalVOC
-from coco import coco
+from .imdb import IMDB
+from .pascal_voc import PascalVOC
+from .coco import coco
diff --git a/example/rcnn/rcnn/dataset/coco.py b/example/rcnn/rcnn/dataset/coco.py
index 9ca5a74cc4..1ec7567958 100644
--- a/example/rcnn/rcnn/dataset/coco.py
+++ b/example/rcnn/rcnn/dataset/coco.py
@@ -15,14 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import cPickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import cv2
 import os
 import json
 import numpy as np
+from builtins import range
 
 from ..logger import logger
-from imdb import IMDB
+from .imdb import IMDB
 
 # coco api
 from ..pycocotools.coco import COCO
@@ -47,7 +51,7 @@ def __init__(self, image_set, root_path, data_path):
         cats = [cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds())]
         self.classes = ['__background__'] + cats
         self.num_classes = len(self.classes)
-        self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
+        self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
         self._class_to_coco_ind = dict(zip(cats, self.coco.getCatIds()))
         self._coco_ind_to_class_ind = dict([(self._class_to_coco_ind[cls], self._class_to_ind[cls])
                                             for cls in self.classes[1:]])
@@ -84,13 +88,13 @@ def gt_roidb(self):
         cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
         if os.path.exists(cache_file):
             with open(cache_file, 'rb') as fid:
-                roidb = cPickle.load(fid)
+                roidb = pickle.load(fid)
             logger.info('%s gt roidb loaded from %s' % (self.name, cache_file))
             return roidb
 
         gt_roidb = [self._load_coco_annotation(index) for index in self.image_set_index]
         with open(cache_file, 'wb') as fid:
-            cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
         logger.info('%s wrote gt roidb to %s' % (self.name, cache_file))
 
         return gt_roidb
@@ -193,7 +197,7 @@ def _coco_results_one_category(self, boxes, cat_id):
             result = [{'image_id': index,
                        'category_id': cat_id,
                        'bbox': [xs[k], ys[k], ws[k], hs[k]],
-                       'score': scores[k]} for k in xrange(dets.shape[0])]
+                       'score': scores[k]} for k in range(dets.shape[0])]
             results.extend(result)
         return results
 
@@ -208,7 +212,7 @@ def _do_python_eval(self, res_file, res_folder):
 
         eval_file = os.path.join(res_folder, 'detections_%s_results.pkl' % self.image_set)
         with open(eval_file, 'wb') as f:
-            cPickle.dump(coco_eval, f, cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(coco_eval, f, pickle.HIGHEST_PROTOCOL)
         logger.info('eval results saved to %s' % eval_file)
 
     def _print_detection_metrics(self, coco_eval):
diff --git a/example/rcnn/rcnn/dataset/ds_utils.py b/example/rcnn/rcnn/dataset/ds_utils.py
index e6f839b8fd..8f90e8d390 100644
--- a/example/rcnn/rcnn/dataset/ds_utils.py
+++ b/example/rcnn/rcnn/dataset/ds_utils.py
@@ -21,7 +21,7 @@
 def unique_boxes(boxes, scale=1.0):
     """ return indices of unique boxes """
     v = np.array([1, 1e3, 1e6, 1e9])
-    hashes = np.round(boxes * scale).dot(v)
+    hashes = np.round(boxes * scale).dot(v).astype(np.int)
     _, index = np.unique(hashes, return_index=True)
     return np.sort(index)
 
diff --git a/example/rcnn/rcnn/dataset/imdb.py b/example/rcnn/rcnn/dataset/imdb.py
index b9038c5da0..5908cc3358 100644
--- a/example/rcnn/rcnn/dataset/imdb.py
+++ b/example/rcnn/rcnn/dataset/imdb.py
@@ -28,7 +28,10 @@
 
 from ..logger import logger
 import os
-import cPickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import numpy as np
 from ..processing.bbox_transform import bbox_overlaps
 
@@ -90,7 +93,7 @@ def load_rpn_data(self, full=False):
         assert os.path.exists(rpn_file), '%s rpn data not found at %s' % (self.name, rpn_file)
         logger.info('%s loading rpn data from %s' % (self.name, rpn_file))
         with open(rpn_file, 'rb') as f:
-            box_list = cPickle.load(f)
+            box_list = pickle.load(f)
         return box_list
 
     def load_rpn_roidb(self, gt_roidb):
diff --git a/example/rcnn/rcnn/dataset/pascal_voc.py b/example/rcnn/rcnn/dataset/pascal_voc.py
index 091c4e8ea1..753f7038aa 100644
--- a/example/rcnn/rcnn/dataset/pascal_voc.py
+++ b/example/rcnn/rcnn/dataset/pascal_voc.py
@@ -23,15 +23,18 @@
 criterion.
 """
 
-import cPickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import cv2
 import os
 import numpy as np
 
 from ..logger import logger
-from imdb import IMDB
-from pascal_voc_eval import voc_eval
-from ds_utils import unique_boxes, filter_small_boxes
+from .imdb import IMDB
+from .pascal_voc_eval import voc_eval
+from .ds_utils import unique_boxes, filter_small_boxes
 
 
 class PascalVOC(IMDB):
@@ -94,13 +97,13 @@ def gt_roidb(self):
         cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
         if os.path.exists(cache_file):
             with open(cache_file, 'rb') as fid:
-                roidb = cPickle.load(fid)
+                roidb = pickle.load(fid)
             logger.info('%s gt roidb loaded from %s' % (self.name, cache_file))
             return roidb
 
         gt_roidb = [self.load_pascal_annotation(index) for index in self.image_set_index]
         with open(cache_file, 'wb') as fid:
-            cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
         logger.info('%s wrote gt roidb to %s' % (self.name, cache_file))
 
         return gt_roidb
@@ -184,7 +187,7 @@ def selective_search_roidb(self, gt_roidb, append_gt=False):
         cache_file = os.path.join(self.cache_path, self.name + '_ss_roidb.pkl')
         if os.path.exists(cache_file):
             with open(cache_file, 'rb') as fid:
-                roidb = cPickle.load(fid)
+                roidb = pickle.load(fid)
             logger.info('%s ss roidb loaded from %s' % (self.name, cache_file))
             return roidb
 
@@ -195,7 +198,7 @@ def selective_search_roidb(self, gt_roidb, append_gt=False):
         else:
             roidb = self.load_selective_search_roidb(gt_roidb)
         with open(cache_file, 'wb') as fid:
-            cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(roidb, fid, pickle.HIGHEST_PROTOCOL)
         logger.info('%s wrote ss roidb to %s' % (self.name, cache_file))
 
         return roidb
diff --git a/example/rcnn/rcnn/dataset/pascal_voc_eval.py b/example/rcnn/rcnn/dataset/pascal_voc_eval.py
index e584ed7503..2583aed166 100644
--- a/example/rcnn/rcnn/dataset/pascal_voc_eval.py
+++ b/example/rcnn/rcnn/dataset/pascal_voc_eval.py
@@ -22,8 +22,10 @@
 from ..logger import logger
 import numpy as np
 import os
-import cPickle
-
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 
 def parse_voc_rec(filename):
     """
@@ -106,10 +108,10 @@ def voc_eval(detpath, annopath, imageset_file, classname, annocache, ovthresh=0.
                 logger.info('reading annotations for %d/%d' % (ind + 1, len(image_filenames)))
         logger.info('saving annotations cache to %s' % annocache)
         with open(annocache, 'wb') as f:
-            cPickle.dump(recs, f, protocol=cPickle.HIGHEST_PROTOCOL)
+            pickle.dump(recs, f, protocol=pickle.HIGHEST_PROTOCOL)
     else:
         with open(annocache, 'rb') as f:
-            recs = cPickle.load(f)
+            recs = pickle.load(f)
 
     # extract objects in :param classname:
     class_recs = {}
diff --git a/example/rcnn/rcnn/io/rcnn.py b/example/rcnn/rcnn/io/rcnn.py
index f9613d68bd..d11c7cadac 100644
--- a/example/rcnn/rcnn/io/rcnn.py
+++ b/example/rcnn/rcnn/io/rcnn.py
@@ -75,7 +75,7 @@ def get_rcnn_batch(roidb):
     assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \
         'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS)
     rois_per_image = config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES
-    fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int)
+    fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(np.int)
 
     rois_array = list()
     labels_array = list()
@@ -147,7 +147,7 @@ def sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes,
     # foreground RoI with FG_THRESH overlap
     fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0]
     # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs
-    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size)
+    fg_rois_per_this_image = int(np.minimum(fg_rois_per_image, fg_indexes.size))
     # Sample foreground regions without replacement
     if len(fg_indexes) > fg_rois_per_this_image:
         fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, replace=False)
@@ -156,7 +156,7 @@ def sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes,
     bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0]
     # Compute number of background RoIs to take from this image (guarding against there being fewer than desired)
     bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
-    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_indexes.size)
+    bg_rois_per_this_image = int(np.minimum(bg_rois_per_this_image, bg_indexes.size))
     # Sample foreground regions without replacement
     if len(bg_indexes) > bg_rois_per_this_image:
         bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False)
diff --git a/example/rcnn/rcnn/processing/bbox_regression.py b/example/rcnn/rcnn/processing/bbox_regression.py
index d5330f4098..24812ac2bd 100644
--- a/example/rcnn/rcnn/processing/bbox_regression.py
+++ b/example/rcnn/rcnn/processing/bbox_regression.py
@@ -22,7 +22,7 @@
 import numpy as np
 
 from ..logger import logger
-from bbox_transform import bbox_overlaps, bbox_transform
+from .bbox_transform import bbox_overlaps, bbox_transform
 from rcnn.config import config
 
 
diff --git a/example/rcnn/rcnn/processing/generate_anchor.py b/example/rcnn/rcnn/processing/generate_anchor.py
index 0e97d6ef2b..53c280dd45 100644
--- a/example/rcnn/rcnn/processing/generate_anchor.py
+++ b/example/rcnn/rcnn/processing/generate_anchor.py
@@ -18,7 +18,7 @@
 """
 Generate base anchors on index 0
 """
-
+from builtins import range
 import numpy as np
 
 
@@ -32,7 +32,7 @@ def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
     base_anchor = np.array([1, 1, base_size, base_size]) - 1
     ratio_anchors = _ratio_enum(base_anchor, ratios)
     anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
-                         for i in xrange(ratio_anchors.shape[0])])
+                         for i in range(ratio_anchors.shape[0])])
     return anchors
 
 
diff --git a/example/rcnn/rcnn/pycocotools/cocoeval.py b/example/rcnn/rcnn/pycocotools/cocoeval.py
index 8b78026d39..e1d181b5bc 100644
--- a/example/rcnn/rcnn/pycocotools/cocoeval.py
+++ b/example/rcnn/rcnn/pycocotools/cocoeval.py
@@ -21,7 +21,7 @@
 import datetime
 import time
 from collections import defaultdict
-import mask as maskUtils
+from .mask import *
 import copy
 
 class COCOeval:
@@ -204,7 +204,7 @@ def computeIoU(self, imgId, catId):
 
         # compute iou between each dt and gt region
         iscrowd = [int(o['iscrowd']) for o in gt]
-        ious = maskUtils.iou(d,g,iscrowd)
+        ious = iou(d,g,iscrowd)
         return ious
 
     def computeOks(self, imgId, catId):
diff --git a/example/rcnn/rcnn/pycocotools/mask.py b/example/rcnn/rcnn/pycocotools/mask.py
index 48c050c594..2122468f68 100644
--- a/example/rcnn/rcnn/pycocotools/mask.py
+++ b/example/rcnn/rcnn/pycocotools/mask.py
@@ -17,7 +17,7 @@
 
 __author__ = 'tsungyi'
 
-import _mask
+from rcnn.pycocotools import _mask
 
 # Interface for manipulating masks stored in RLE format.
 #
diff --git a/example/rcnn/rcnn/symbol/__init__.py b/example/rcnn/rcnn/symbol/__init__.py
index 113b52c98a..7547122dd5 100644
--- a/example/rcnn/rcnn/symbol/__init__.py
+++ b/example/rcnn/rcnn/symbol/__init__.py
@@ -15,5 +15,5 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from symbol_vgg import *
-from symbol_resnet import *
+from .symbol_vgg import *
+from .symbol_resnet import *
diff --git a/example/rcnn/rcnn/symbol/proposal_target.py b/example/rcnn/rcnn/symbol/proposal_target.py
index e0444f978b..0af19a9cf3 100644
--- a/example/rcnn/rcnn/symbol/proposal_target.py
+++ b/example/rcnn/rcnn/symbol/proposal_target.py
@@ -45,7 +45,7 @@ def forward(self, is_train, req, in_data, out_data, aux):
         assert self._batch_rois % self._batch_images == 0, \
             'BATCHIMAGES {} must devide BATCH_ROIS {}'.format(self._batch_images, self._batch_rois)
         rois_per_image = self._batch_rois / self._batch_images
-        fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(int)
+        fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(np.int)
 
         all_rois = in_data[0].asnumpy()
         gt_boxes = in_data[1].asnumpy()
diff --git a/example/rcnn/rcnn/symbol/symbol_resnet.py b/example/rcnn/rcnn/symbol/symbol_resnet.py
index 4a9677d440..f7721366c1 100644
--- a/example/rcnn/rcnn/symbol/symbol_resnet.py
+++ b/example/rcnn/rcnn/symbol/symbol_resnet.py
@@ -16,9 +16,9 @@
 # under the License.
 
 import mxnet as mx
-import proposal
-import proposal_target
 from rcnn.config import config
+from . import proposal
+from . import proposal_target
 
 eps = 2e-5
 use_global_stats = True
diff --git a/example/rcnn/rcnn/symbol/symbol_vgg.py b/example/rcnn/rcnn/symbol/symbol_vgg.py
index 00ba15ed8e..33fbede2df 100644
--- a/example/rcnn/rcnn/symbol/symbol_vgg.py
+++ b/example/rcnn/rcnn/symbol/symbol_vgg.py
@@ -16,10 +16,9 @@
 # under the License.
 
 import mxnet as mx
-import proposal
-import proposal_target
 from rcnn.config import config
-
+from . import proposal
+from . import proposal_target
 
 def get_vgg_conv(data):
     """
diff --git a/example/rcnn/rcnn/tools/reeval.py b/example/rcnn/rcnn/tools/reeval.py
index a7ae898f41..1e5c0aa5a8 100644
--- a/example/rcnn/rcnn/tools/reeval.py
+++ b/example/rcnn/rcnn/tools/reeval.py
@@ -16,7 +16,10 @@
 # under the License.
 
 import argparse
-import cPickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 import os
 import mxnet as mx
 
@@ -32,7 +35,7 @@ def reeval(args):
     # load detection results
     cache_file = os.path.join(imdb.cache_path, imdb.name, 'detections.pkl')
     with open(cache_file) as f:
-        detections = cPickle.load(f)
+        detections = pickle.load(f)
 
     # eval
     imdb.evaluate_detections(detections)
diff --git a/example/rcnn/script/additional_deps.sh b/example/rcnn/script/additional_deps.sh
index 0e6599c77f..cddc391b13 100755
--- a/example/rcnn/script/additional_deps.sh
+++ b/example/rcnn/script/additional_deps.sh
@@ -20,19 +20,7 @@
 
 # install additional depts
 sudo apt install python-pip python-dev unzip python-matplotlib
-sudo pip install cython scikit-image easydict
-
-# install a forked MXNet
-pushd ../../
-cp make/config.mk ./
-echo "USE_CUDA=1" >>config.mk
-echo "USE_CUDA_PATH=/usr/local/cuda" >>config.mk
-echo "USE_CUDNN=1" >>config.mk
-make -j$(nproc)
-pushd python
-python setup.py install --user
-popd
-popd
+sudo pip install cython scikit-image easydict opencv-python
 
 # build cython extension
 make
diff --git a/example/reinforcement-learning/dqn/README.md b/example/reinforcement-learning/dqn/README.md
index 8a73e1db64..58f7b56146 100644
Binary files a/example/reinforcement-learning/dqn/README.md and b/example/reinforcement-learning/dqn/README.md differ
diff --git a/example/reinforcement-learning/dqn/atari_game.py b/example/reinforcement-learning/dqn/atari_game.py
index 5c1314ffcf..43c298a738 100644
--- a/example/reinforcement-learning/dqn/atari_game.py
+++ b/example/reinforcement-learning/dqn/atari_game.py
@@ -44,18 +44,18 @@ def ale_load_from_rom(rom_path, display_screen):
                            'installation guidance')
 
     ale = ALEInterface()
-    ale.setInt('random_seed', rng.randint(1000))
+    ale.setInt(b'random_seed', rng.randint(1000))
     if display_screen:
         import sys
         if sys.platform == 'darwin':
             import pygame
             pygame.init()
-            ale.setBool('sound', False) # Sound doesn't work on OSX
-        ale.setBool('display_screen', True)
+            ale.setBool(b'sound', False) # Sound doesn't work on OSX
+        ale.setBool(b'display_screen', True)
     else:
-        ale.setBool('display_screen', False)
-    ale.setFloat('repeat_action_probability', 0)
-    ale.loadROM(rom_path)
+        ale.setBool(b'display_screen', False)
+    ale.setFloat(b'repeat_action_probability', 0)
+    ale.loadROM(str.encode(rom_path))
     return ale
 
 
diff --git a/example/reinforcement-learning/dqn/base.py b/example/reinforcement-learning/dqn/base.py
index ce82f2b1ad..982ae17f86 100644
--- a/example/reinforcement-learning/dqn/base.py
+++ b/example/reinforcement-learning/dqn/base.py
@@ -135,7 +135,7 @@ def switch_bucket(self, bucket_kwargs=None, data_shapes=None):
                 self.initializer(k, v)
         else:
             assert set(arg_name_shape.items()) == \
-                   set(data_shapes.items() + [(k, v.shape) for k, v in self.params.items()])
+                   set(list(data_shapes.items()) + list([(k, v.shape) for k, v in self.params.items()]))
         if self.aux_states is None:
             self.aux_states = OrderedDict([(k, nd.empty(s, ctx=self.ctx))
                                            for k, s in zip(aux_names, aux_shapes)])
diff --git a/example/reinforcement-learning/parallel_actor_critic/README.md b/example/reinforcement-learning/parallel_actor_critic/README.md
index d734ceb190..d3288492a6 100644
--- a/example/reinforcement-learning/parallel_actor_critic/README.md
+++ b/example/reinforcement-learning/parallel_actor_critic/README.md
@@ -10,6 +10,14 @@ Please see the accompanying [tutorial](https://minpy.readthedocs.io/en/latest/tu
 
 Author: Sean Welleck ([@wellecks](https://github.com/wellecks)), Reed Lee ([@loofahcus](https://github.com/loofahcus))
 
+
+## Prerequisites
+  - Install Scikit-learn: `python -m pip install --user sklearn`
+  - Install SciPy: `python -m pip install --user scipy`
+  - Install the required OpenAI environments. For example, install Atari: `pip install gym[atari]`
+
+For more details refer: https://github.com/openai/gym
+
 ## Training
 
 #### Atari Pong
diff --git a/example/reinforcement-learning/parallel_actor_critic/model.py b/example/reinforcement-learning/parallel_actor_critic/model.py
index b90af67905..384f48cfab 100644
--- a/example/reinforcement-learning/parallel_actor_critic/model.py
+++ b/example/reinforcement-learning/parallel_actor_critic/model.py
@@ -88,7 +88,7 @@ def train_step(self, env_xs, env_as, env_rs, env_vs):
         # Compute discounted rewards and advantages.
         advs = []
         gamma, lambda_ = self.config.gamma, self.config.lambda_
-        for i in xrange(len(env_vs)):
+        for i in range(len(env_vs)):
             # Compute advantages using Generalized Advantage Estimation;
             # see eqn. (16) of [Schulman 2016].
             delta_t = (env_rs[i] + gamma*np.array(env_vs[i][1:]) -
diff --git a/example/reinforcement-learning/parallel_actor_critic/train.py b/example/reinforcement-learning/parallel_actor_critic/train.py
index 128a550302..7b78d72205 100644
--- a/example/reinforcement-learning/parallel_actor_critic/train.py
+++ b/example/reinforcement-learning/parallel_actor_critic/train.py
@@ -125,7 +125,7 @@ def save_params(save_pre, model, epoch):
     parser = argparse.ArgumentParser()
     parser.add_argument('--num-envs', type=int, default=16)
     parser.add_argument('--t-max', type=int, default=50)
-    parser.add_argument('--env-type', default='PongDeterministic-v3')
+    parser.add_argument('--env-type', default='PongDeterministic-v4')
     parser.add_argument('--render', action='store_true')
     parser.add_argument('--save-pre', default='checkpoints')
     parser.add_argument('--save-every', type=int, default=0)
diff --git a/example/rnn/README.md b/example/rnn/README.md
index 8a6f29d2c0..f0d80c3a61 100644
--- a/example/rnn/README.md
+++ b/example/rnn/README.md
@@ -1,15 +1,14 @@
-RNN Example
+Recurrent Neural Network Examples
 ===========
-This folder contains RNN examples using high level mxnet.rnn interface.
 
-Examples using low level symbol interface have been deprecated and moved to old/
+This directory contains functions for creating recurrent neural network
+models using the high level mxnet.rnn interface.
 
-## Data
-Run `get_ptb_data.sh` to download PenTreeBank data.
+Here is a short overview of what is in this directory.
 
-## Python
-
-- [lstm_bucketing.py](lstm_bucketing.py) PennTreeBank language model by using LSTM
-
-Performance Note:
-More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html).
+Directory | What's in it?
+--- | ---
+`word_lm/` | Language model trained on the PTB dataset achieving state of the art performance
+`bucketing/` | Language model with bucketing API with python
+`bucket_R/` | Language model with bucketing API with R
+`old/` | Language model trained with low level symbol interface (deprecated)
diff --git a/example/rnn/bucketing/README.md b/example/rnn/bucketing/README.md
new file mode 100644
index 0000000000..6baf1ecae9
--- /dev/null
+++ b/example/rnn/bucketing/README.md
@@ -0,0 +1,13 @@
+RNN Example
+===========
+This folder contains RNN examples using high level mxnet.rnn interface.
+
+## Data
+Run `get_ptb_data.sh` to download PennTreeBank data.
+
+## Python
+
+- [lstm_bucketing.py](lstm_bucketing.py) PennTreeBank language model by using LSTM
+
+Performance Note:
+More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html).
diff --git a/example/rnn/cudnn_lstm_bucketing.py b/example/rnn/bucketing/cudnn_lstm_bucketing.py
similarity index 100%
rename from example/rnn/cudnn_lstm_bucketing.py
rename to example/rnn/bucketing/cudnn_lstm_bucketing.py
diff --git a/example/rnn/get_ptb_data.sh b/example/rnn/bucketing/get_ptb_data.sh
similarity index 100%
rename from example/rnn/get_ptb_data.sh
rename to example/rnn/bucketing/get_ptb_data.sh
diff --git a/example/rnn/lstm_bucketing.py b/example/rnn/bucketing/lstm_bucketing.py
similarity index 100%
rename from example/rnn/lstm_bucketing.py
rename to example/rnn/bucketing/lstm_bucketing.py
diff --git a/example/rnn/word_lm/README.md b/example/rnn/word_lm/README.md
new file mode 100644
index 0000000000..c4980326e4
--- /dev/null
+++ b/example/rnn/word_lm/README.md
@@ -0,0 +1,49 @@
+Word Level Language Modeling
+===========
+This example trains a multi-layer LSTM on Penn Treebank (PTB) language modeling benchmark.
+
+The following techniques have been adopted for SOTA results:
+- [LSTM for LM](https://arxiv.org/pdf/1409.2329.pdf)
+- [Weight tying](https://arxiv.org/abs/1608.05859) between word vectors and softmax output embeddings
+
+## Prerequisite
+The example requires MXNet built with CUDA.
+
+## Data
+The PTB data is the processed version from [(Mikolov et al, 2010)](http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf):
+
+## Usage
+Example runs and the results:
+
+```
+python train.py --tied --nhid 650 --emsize 650 --dropout 0.5        # Test ppl of 75.4
+```
+
+```
+usage: train.py [-h] [--data DATA] [--emsize EMSIZE] [--nhid NHID]
+                [--nlayers NLAYERS] [--lr LR] [--clip CLIP] [--epochs EPOCHS]
+                [--batch_size BATCH_SIZE] [--dropout DROPOUT] [--tied]
+                [--bptt BPTT] [--log-interval LOG_INTERVAL] [--seed SEED]
+
+PennTreeBank LSTM Language Model
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --data DATA           location of the data corpus
+  --emsize EMSIZE       size of word embeddings
+  --nhid NHID           number of hidden units per layer
+  --nlayers NLAYERS     number of layers
+  --lr LR               initial learning rate
+  --clip CLIP           gradient clipping by global norm
+  --epochs EPOCHS       upper epoch limit
+  --batch_size BATCH_SIZE
+                        batch size
+  --dropout DROPOUT     dropout applied to layers (0 = no dropout)
+  --tied                tie the word embedding and softmax weights
+  --bptt BPTT           sequence length
+  --log-interval LOG_INTERVAL
+                        report interval
+  --seed SEED           random seed
+```
+
+
diff --git a/example/rnn/word_lm/data.py b/example/rnn/word_lm/data.py
new file mode 100644
index 0000000000..ff67088c78
--- /dev/null
+++ b/example/rnn/word_lm/data.py
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os, gzip
+import sys
+import mxnet as mx
+import numpy as np
+
+class Dictionary(object):
+    def __init__(self):
+        self.word2idx = {}
+        self.idx2word = []
+        self.word_count = []
+
+    def add_word(self, word):
+        if word not in self.word2idx:
+            self.idx2word.append(word)
+            self.word2idx[word] = len(self.idx2word) - 1
+            self.word_count.append(0)
+        index = self.word2idx[word]
+        self.word_count[index] += 1
+        return index
+
+    def __len__(self):
+        return len(self.idx2word)
+
+class Corpus(object):
+    def __init__(self, path):
+        self.dictionary = Dictionary()
+        self.train = self.tokenize(path + 'train.txt')
+        self.valid = self.tokenize(path + 'valid.txt')
+        self.test = self.tokenize(path + 'test.txt')
+
+    def tokenize(self, path):
+        """Tokenizes a text file."""
+        assert os.path.exists(path)
+        # Add words to the dictionary
+        with open(path, 'r') as f:
+            tokens = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                tokens += len(words)
+                for word in words:
+                    self.dictionary.add_word(word)
+
+        # Tokenize file content
+        with open(path, 'r') as f:
+            ids = np.zeros((tokens,), dtype='int32')
+            token = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                for word in words:
+                    ids[token] = self.dictionary.word2idx[word]
+                    token += 1
+
+        return mx.nd.array(ids, dtype='int32')
+
+def batchify(data, batch_size):
+    """Reshape data into (num_example, batch_size)"""
+    nbatch = data.shape[0] // batch_size
+    data = data[:nbatch * batch_size]
+    data = data.reshape((batch_size, nbatch)).T
+    return data
+
+class CorpusIter(mx.io.DataIter):
+    "An iterator that returns a batch of sequences each time"
+    def __init__(self, source, batch_size, bptt):
+        super(CorpusIter, self).__init__()
+        self.batch_size = batch_size
+        self.provide_data = [('data', (bptt, batch_size), np.int32)]
+        self.provide_label = [('label', (bptt, batch_size))]
+        self._index = 0
+        self._bptt = bptt
+        self._source = batchify(source, batch_size)
+
+    def iter_next(self):
+        i = self._index
+        if i+self._bptt > self._source.shape[0] - 1:
+            return False
+        self._next_data = self._source[i:i+self._bptt]
+        self._next_label = self._source[i+1:i+1+self._bptt].astype(np.float32)
+        self._index += self._bptt
+        return True
+
+    def next(self):
+        if self.iter_next():
+            return mx.io.DataBatch(data=self.getdata(), label=self.getlabel())
+        else:
+            raise StopIteration
+
+    def reset(self):
+        self._index = 0
+        self._next_data = None
+        self._next_label = None
+
+    def getdata(self):
+        return [self._next_data]
+
+    def getlabel(self):
+        return [self._next_label]
diff --git a/example/rnn/word_lm/get_ptb_data.sh b/example/rnn/word_lm/get_ptb_data.sh
new file mode 100755
index 0000000000..0a0c7051b0
--- /dev/null
+++ b/example/rnn/word_lm/get_ptb_data.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+echo ""
+echo "NOTE: Please review the licensing of the datasets in this script before proceeding"
+echo "See https://catalog.ldc.upenn.edu/ldc99t42 for the licensing"
+echo "Once that is done, please uncomment the wget commands in this script"
+echo ""
+
+RNN_DIR=$(cd `dirname $0`; pwd)
+DATA_DIR="${RNN_DIR}/data/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} doesn't exist, will create one";
+  mkdir -p ${DATA_DIR}
+fi
+
+#wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt;
+#wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt;
+#wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt;
+#wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt;
diff --git a/example/rnn/word_lm/model.py b/example/rnn/word_lm/model.py
new file mode 100644
index 0000000000..aa3710a3b0
--- /dev/null
+++ b/example/rnn/word_lm/model.py
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+def rnn(bptt, vocab_size, num_embed, nhid,
+        num_layers, dropout, batch_size, tied):
+    # encoder
+    data = mx.sym.Variable('data')
+    weight = mx.sym.var("encoder_weight", init=mx.init.Uniform(0.1))
+    embed = mx.sym.Embedding(data=data, weight=weight, input_dim=vocab_size,
+                             output_dim=num_embed, name='embed')
+
+    # stacked rnn layers
+    states = []
+    state_names = []
+    outputs = mx.sym.Dropout(embed, p=dropout)
+    for i in range(num_layers):
+        prefix = 'lstm_l%d_' % i
+        cell = mx.rnn.FusedRNNCell(num_hidden=nhid, prefix=prefix, get_next_state=True,
+                                   forget_bias=0.0, dropout=dropout)
+        state_shape = (1, batch_size, nhid)
+        begin_cell_state_name = prefix + 'cell'
+        begin_hidden_state_name = prefix + 'hidden'
+        begin_cell_state = mx.sym.var(begin_cell_state_name, shape=state_shape)
+        begin_hidden_state = mx.sym.var(begin_hidden_state_name, shape=state_shape)
+        state_names += [begin_cell_state_name, begin_hidden_state_name]
+        outputs, next_states = cell.unroll(bptt, inputs=outputs,
+                                           begin_state=[begin_cell_state, begin_hidden_state],
+                                           merge_outputs=True, layout='TNC')
+        outputs = mx.sym.Dropout(outputs, p=dropout)
+        states += next_states
+
+    # decoder
+    pred = mx.sym.Reshape(outputs, shape=(-1, nhid))
+    if tied:
+        assert(nhid == num_embed), \
+               "the number of hidden units and the embedding size must match for weight tying"
+        pred = mx.sym.FullyConnected(data=pred, weight=weight,
+                                     num_hidden=vocab_size, name='pred')
+    else:
+        pred = mx.sym.FullyConnected(data=pred, num_hidden=vocab_size, name='pred')
+    pred = mx.sym.Reshape(pred, shape=(-1, vocab_size))
+    return pred, [mx.sym.stop_gradient(s) for s in states], state_names
+
+def softmax_ce_loss(pred):
+    # softmax cross-entropy loss
+    label = mx.sym.Variable('label')
+    label = mx.sym.Reshape(label, shape=(-1,))
+    logits = mx.sym.log_softmax(pred, axis=-1)
+    loss = -mx.sym.pick(logits, label, axis=-1, keepdims=True)
+    loss = mx.sym.mean(loss, axis=0, exclude=True)
+    return mx.sym.make_loss(loss, name='nll')
diff --git a/example/rnn/word_lm/module.py b/example/rnn/word_lm/module.py
new file mode 100644
index 0000000000..864700c104
--- /dev/null
+++ b/example/rnn/word_lm/module.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import logging
+
+class CustomStatefulModule():
+    """CustomStatefulModule is a module that takes a custom loss symbol and state symbols.
+    The custom loss is typically composed by `mx.sym.make_loss` or `mx.sym.MakeLoss`.
+    The states listed in `state_names` will be carried between iterations.
+
+    Parameters
+    ----------
+    loss : Symbol
+        The custom loss symbol
+    states: list of Symbol
+        The symbols of next states
+    state_names : list of str
+        states are similar to data and label, but not provided by data iterator.
+        Instead they are initialized to `initial_states` and can be carried between iterations.
+    data_names : list of str
+        Defaults to `('data')` for a typical model used in image classification.
+    label_names : list of str
+        Defaults to `('softmax_label')` for a typical model used in image
+        classification.
+    logger : Logger
+        Defaults to `logging`.
+    context : Context or list of Context
+        Defaults to ``mx.cpu()``.
+    initial_states: float or list of NDArray
+        Defaults to 0.0.
+    """
+    def __init__(self, loss, states, state_names, data_names=('data',), label_names=('label',),
+                 context=mx.cpu(), initial_states=0.0, **kwargs):
+        if isinstance(states, mx.symbol.Symbol):
+            states = [states]
+        self._net = mx.sym.Group(states + [loss])
+        self._next_states = initial_states
+        self._module = mx.module.Module(self._net, data_names=data_names, label_names=label_names,
+                                        context=context, state_names=state_names, **kwargs)
+
+    def backward(self, out_grads=None):
+        """Backward computation.
+        """
+        self._module.backward(out_grads=out_grads)
+
+    def init_params(self, initializer=mx.init.Uniform(0.01), **kwargs):
+        """Initializes the parameters and auxiliary states.
+        """
+        self._module.init_params(initializer=initializer, **kwargs)
+
+    def init_optimizer(self, **kwargs):
+        """Installs and initializes optimizers, as well as initialize kvstore for
+           distributed training.
+        """
+        self._module.init_optimizer(**kwargs)
+
+    def bind(self, data_shapes, **kwargs):
+        """Binds the symbols to construct executors. This is necessary before one
+        can perform computation with the module.
+        """
+        self._module.bind(data_shapes, **kwargs)
+
+    def forward(self, data_batch, is_train=None, carry_state=True):
+        """Forward computation. States from previous forward computation are carried
+        to the current iteration if `carry_state` is set to `True`.
+        """
+        # propagate states from the previous iteration
+        if carry_state:
+            if isinstance(self._next_states, (int, float)):
+                self._module.set_states(value=self._next_states)
+            else:
+                self._module.set_states(states=self._next_states)
+        self._module.forward(data_batch, is_train=is_train)
+        outputs = self._module.get_outputs(merge_multi_context=False)
+        self._next_states = outputs[:-1]
+
+    def update(self, max_norm=None):
+        """Updates parameters according to the installed optimizer and the gradients computed
+        in the previous forward-backward batch. Gradients are clipped by their global norm
+        if `max_norm` is set.
+
+        Parameters
+        ----------
+        max_norm: float, optional
+            If set, gradients are rescaled so that their global (concatenated) norm does not exceed this value.
+        """
+        if max_norm is not None:
+            self._clip_by_global_norm(max_norm)
+        self._module.update()
+
+    def _clip_by_global_norm(self, max_norm):
+        """Clips gradient norm.
+
+        The norm is computed over all gradients together, as if they were
+        concatenated into a single vector. Gradients are modified in-place.
+        The method is first used in
+         `[ICML2013] On the difficulty of training recurrent neural networks`
+
+        Parameters
+        ----------
+        max_norm : float or int
+            The maximum clipping threshold of the gradient norm.
+
+        Returns
+        -------
+        norm_val : float
+            The computed norm of the gradients.
+        """
+        assert self._module.binded and self._module.params_initialized \
+               and self._module.optimizer_initialized
+        grad_array = []
+        for grad in self._module._exec_group.grad_arrays:
+            grad_array += grad
+        return mx.gluon.utils.clip_global_norm(grad_array, max_norm)
+
+    def get_loss(self):
+        """Gets the output loss of the previous forward computation.
+        """
+        return self._module.get_outputs(merge_multi_context=False)[-1]
diff --git a/example/rnn/word_lm/train.py b/example/rnn/word_lm/train.py
new file mode 100644
index 0000000000..53b6bd35f2
--- /dev/null
+++ b/example/rnn/word_lm/train.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import mxnet as mx, math
+import argparse, math
+import logging
+from data import Corpus, CorpusIter
+from model import *
+from module import *
+from mxnet.model import BatchEndParam
+
+parser = argparse.ArgumentParser(description='PennTreeBank LSTM Language Model')
+parser.add_argument('--data', type=str, default='./data/ptb.',
+                    help='location of the data corpus')
+parser.add_argument('--emsize', type=int, default=650,
+                    help='size of word embeddings')
+parser.add_argument('--nhid', type=int, default=650,
+                    help='number of hidden units per layer')
+parser.add_argument('--nlayers', type=int, default=2,
+                    help='number of layers')
+parser.add_argument('--lr', type=float, default=1.0,
+                    help='initial learning rate')
+parser.add_argument('--clip', type=float, default=0.2,
+                    help='gradient clipping by global norm')
+parser.add_argument('--epochs', type=int, default=40,
+                    help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=32,
+                    help='batch size')
+parser.add_argument('--dropout', type=float, default=0.5,
+                    help='dropout applied to layers (0 = no dropout)')
+parser.add_argument('--tied', action='store_true',
+                    help='tie the word embedding and softmax weights')
+parser.add_argument('--bptt', type=int, default=35,
+                    help='sequence length')
+parser.add_argument('--log-interval', type=int, default=200,
+                    help='report interval')
+parser.add_argument('--seed', type=int, default=3,
+                    help='random seed')
+args = parser.parse_args()
+
+best_loss = 9999
+
+def evaluate(valid_module, data_iter, epoch, mode, bptt, batch_size):
+    total_loss = 0.0
+    nbatch = 0
+    for batch in data_iter:
+        valid_module.forward(batch, is_train=False)
+        outputs = valid_module.get_loss()
+        total_loss += mx.nd.sum(outputs[0]).asscalar()
+        nbatch += 1
+    data_iter.reset()
+    loss = total_loss / bptt / batch_size / nbatch
+    logging.info('Iter[%d] %s loss:\t%.7f, Perplexity: %.7f' % \
+                 (epoch, mode, loss, math.exp(loss)))
+    return loss
+
+if __name__ == '__main__':
+    # args
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    args = parser.parse_args()
+    logging.info(args)
+    ctx = mx.gpu()
+    batch_size = args.batch_size
+    bptt = args.bptt
+    mx.random.seed(args.seed)
+
+    # data
+    corpus = Corpus(args.data)
+    ntokens = len(corpus.dictionary)
+    train_data = CorpusIter(corpus.train, batch_size, bptt)
+    valid_data = CorpusIter(corpus.valid, batch_size, bptt)
+    test_data = CorpusIter(corpus.test, batch_size, bptt)
+
+    # model
+    pred, states, state_names = rnn(bptt, ntokens, args.emsize, args.nhid,
+                                    args.nlayers, args.dropout, batch_size, args.tied)
+    loss = softmax_ce_loss(pred)
+
+    # module
+    module = CustomStatefulModule(loss, states, state_names=state_names, context=ctx)
+    module.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    module.init_params(initializer=mx.init.Xavier())
+    optimizer = mx.optimizer.create('sgd', learning_rate=args.lr, rescale_grad=1.0/batch_size)
+    module.init_optimizer(optimizer=optimizer)
+
+    # metric
+    speedometer = mx.callback.Speedometer(batch_size, args.log_interval)
+
+    # train
+    logging.info("Training started ... ")
+    for epoch in range(args.epochs):
+        # train
+        total_loss = 0.0
+        nbatch = 0
+        for batch in train_data:
+            module.forward(batch)
+            module.backward()
+            module.update(max_norm=args.clip * bptt * batch_size)
+            # update metric
+            outputs = module.get_loss()
+            total_loss += mx.nd.sum(outputs[0]).asscalar()
+            speedometer_param = BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                              eval_metric=None, locals=locals())
+            speedometer(speedometer_param)
+            if nbatch % args.log_interval == 0 and nbatch > 0:
+                cur_loss = total_loss / bptt / batch_size / args.log_interval
+                logging.info('Iter[%d] Batch [%d]\tLoss:  %.7f,\tPerplexity:\t%.7f' % \
+                             (epoch, nbatch, cur_loss, math.exp(cur_loss)))
+                total_loss = 0.0
+            nbatch += 1
+        # validation
+        valid_loss = evaluate(module, valid_data, epoch, 'Valid', bptt, batch_size)
+        if valid_loss < best_loss:
+            best_loss = valid_loss
+            # test
+            test_loss = evaluate(module, test_data, epoch, 'Test', bptt, batch_size)
+        else:
+            optimizer.lr *= 0.25
+        train_data.reset()
+    logging.info("Training completed. ")
diff --git a/example/sparse/factorization_machine/README.md b/example/sparse/factorization_machine/README.md
new file mode 100644
index 0000000000..7609f31d5c
--- /dev/null
+++ b/example/sparse/factorization_machine/README.md
@@ -0,0 +1,16 @@
+Factorization Machine
+===========
+This example trains a factorization machine model using the criteo dataset.
+
+## Download the Dataset
+
+The criteo dataset is available at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#criteo
+The data was used in a competition on click-through rate prediction jointly hosted by Criteo and Kaggle in 2014,
+with 1,000,000 features. There are 45,840,617 training examples and 6,042,135 testing examples.
+It takes more than 30 GB to download and extract the dataset.
+
+## Train the Model
+
+- python train.py --data-train /path/to/criteo.kaggle2014.train.svm --data-test /path/to/criteo.kaggle2014.test.svm
+
+[Rendle, Steffen. "Factorization machines." In Data Mining (ICDM), 2010 IEEE 10th International Conference on, pp. 995-1000. IEEE, 2010. ](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
diff --git a/example/sparse/factorization_machine/metric.py b/example/sparse/factorization_machine/metric.py
new file mode 100644
index 0000000000..07a7e01e02
--- /dev/null
+++ b/example/sparse/factorization_machine/metric.py
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import numpy as np
+
+@mx.metric.register
+@mx.metric.alias('log_loss')
+class LogLossMetric(mx.metric.EvalMetric):
+    """Computes the negative log-likelihood loss.
+
+    The negative log-likelihood loss over a batch of sample size :math:`N` is given by
+
+    .. math::
+       -\\sum_{n=1}^{N}\\sum_{k=1}^{K}t_{nk}\\log (y_{nk}),
+
+    where :math:`K` is the number of classes, :math:`y_{nk}` is the predicted probability for
+    :math:`k`-th class for :math:`n`-th sample. :math:`t_{nk}=1` if and only if sample
+    :math:`n` belongs to class :math:`k`.
+
+    Parameters
+    ----------
+    eps : float
+        Negative log-likelihood loss is undefined when the predicted value is 0,
+        so a small constant ``eps`` is added to the predicted values.
+    name : str
+        Name of this metric instance for display.
+    output_names : list of str, or None
+        Name of predictions that should be used when updating with update_dict.
+        By default include all predictions.
+    label_names : list of str, or None
+        Name of labels that should be used when updating with update_dict.
+        By default include all labels.
+
+    Examples
+    --------
+    >>> predicts = [mx.nd.array([[0.3], [0], [0.4]])]
+    >>> labels   = [mx.nd.array([0, 1, 1])]
+    >>> log_loss = LogLossMetric()
+    >>> log_loss.update(labels, predicts)
+    >>> print log_loss.get()
+    ('log-loss', 0.57159948348999023)
+    """
+    def __init__(self, eps=1e-12, name='log-loss',
+                 output_names=None, label_names=None):
+        super(LogLossMetric, self).__init__(
+            name, eps=eps,
+            output_names=output_names, label_names=label_names)
+        self.eps = eps
+
+    def update(self, labels, preds):
+        """Updates the internal evaluation result.
+
+        Parameters
+        ----------
+        labels : list of `NDArray`
+            The labels of the data.
+
+        preds : list of `NDArray`
+            Predicted values.
+        """
+        mx.metric.check_label_shapes(labels, preds)
+
+        for label, pred in zip(labels, preds):
+            label = label.asnumpy()
+            pred = pred.asnumpy()
+            pred = np.column_stack((1 - pred, pred))
+
+            label = label.ravel()
+            num_examples = pred.shape[0]
+            assert label.shape[0] == num_examples, (label.shape[0], num_examples)
+            prob = pred[np.arange(num_examples, dtype=np.int64), np.int64(label)]
+            self.sum_metric += (-np.log(prob + self.eps)).sum()
+            self.num_inst += num_examples
diff --git a/example/sparse/factorization_machine/model.py b/example/sparse/factorization_machine/model.py
new file mode 100644
index 0000000000..f0af2e650d
--- /dev/null
+++ b/example/sparse/factorization_machine/model.py
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+def factorization_machine_model(factor_size, num_features,
+                                lr_mult_config, wd_mult_config, init_config):
+    """ builds factorization machine network with proper formulation:
+    y = w_0 + \sum(x_i w_i) + 0.5(\sum\sum<v_i,v_j>x_ix_j - \sum<v_i,v_i>x_i^2)
+    """
+    x = mx.symbol.Variable("data", stype='csr')
+    # factor, linear and bias terms
+    v = mx.symbol.Variable("v", shape=(num_features, factor_size), stype='row_sparse',
+                           init=init_config['v'], lr_mult=lr_mult_config['v'],
+                           wd_mult=wd_mult_config['v'])
+    w = mx.symbol.var('w', shape=(num_features, 1), stype='row_sparse',
+                      init=init_config['w'], lr_mult=lr_mult_config['w'],
+                      wd_mult=wd_mult_config['w'])
+    w0 = mx.symbol.var('w0', shape=(1,), init=init_config['w0'],
+                       lr_mult=lr_mult_config['w0'], wd_mult=wd_mult_config['w0'])
+    w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w), w0)
+
+    # squared terms for subtracting self interactions
+    v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True)
+    x_s = x.square()
+    bd_sum = mx.sym.dot(x_s, v_s)
+
+    # interactions
+    w2 = mx.symbol.dot(x, v)
+    w2_squared = 0.5 * mx.symbol.square(data=w2)
+
+    # putting everything together
+    w_all = mx.symbol.Concat(w1, w2_squared, dim=1)
+    sum1 = w_all.sum(axis=1, keepdims=True)
+    sum2 = -0.5 * bd_sum
+    model = sum1 + sum2
+
+    y = mx.symbol.Variable("softmax_label")
+    model = mx.symbol.LogisticRegressionOutput(data=model, label=y)
+    return model
diff --git a/example/sparse/factorization_machine/train.py b/example/sparse/factorization_machine/train.py
new file mode 100644
index 0000000000..741cf958db
--- /dev/null
+++ b/example/sparse/factorization_machine/train.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from metric import *
+from mxnet.test_utils import *
+from model import *
+import argparse, os
+
+parser = argparse.ArgumentParser(description="Run factorization machine with criteo dataset",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--data-train', type=str, default=None,
+                    help='training dataset in LibSVM format.')
+parser.add_argument('--data-test', type=str, default=None,
+                    help='test dataset in LibSVM format.')
+parser.add_argument('--num-epoch', type=int, default=1,
+                    help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=1000,
+                    help='number of examples per batch')
+parser.add_argument('--input-size', type=int, default=1000000,
+                    help='number of features in the input')
+parser.add_argument('--factor-size', type=int, default=16,
+                    help='number of latent variables')
+parser.add_argument('--factor-lr', type=float, default=0.0001,
+                    help='learning rate for factor terms')
+parser.add_argument('--linear-lr', type=float, default=0.001,
+                    help='learning rate for linear terms')
+parser.add_argument('--bias-lr', type=float, default=0.1,
+                    help='learning rate for bias terms')
+parser.add_argument('--factor-wd', type=float, default=0.00001,
+                    help='weight decay rate for factor terms')
+parser.add_argument('--linear-wd', type=float, default=0.001,
+                    help='weight decay rate for linear terms')
+parser.add_argument('--bias-wd', type=float, default=0.01,
+                    help='weight decay rate for bias terms')
+parser.add_argument('--factor-sigma', type=float, default=0.001,
+                    help='standard deviation for initialization of factor terms')
+parser.add_argument('--linear-sigma', type=float, default=0.01,
+                    help='standard deviation for initialization of linear terms')
+parser.add_argument('--bias-sigma', type=float, default=0.01,
+                    help='standard deviation for initialization of bias terms')
+parser.add_argument('--log-interval', type=int, default=100,
+                    help='number of batches between logging messages')
+parser.add_argument('--kvstore', type=str, default='local',
+                    help='what kvstore to use', choices=["dist_async", "local"])
+
+if __name__ == '__main__':
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
+
+    # arg parser
+    args = parser.parse_args()
+    logging.info(args)
+    num_epoch = args.num_epoch
+    batch_size = args.batch_size
+    kvstore = args.kvstore
+    factor_size = args.factor_size
+    num_features = args.input_size
+    log_interval = args.log_interval
+    assert(args.data_train is not None and args.data_test is not None), \
+          "dataset for training or test is missing"
+
+    # create kvstore
+    kv = mx.kvstore.create(kvstore)
+    # data iterator
+    train_data = mx.io.LibSVMIter(data_libsvm=args.data_train, data_shape=(num_features,),
+                                  batch_size=batch_size)
+    eval_data = mx.io.LibSVMIter(data_libsvm=args.data_test, data_shape=(num_features,),
+                                 batch_size=batch_size)
+    # model
+    lr_config = {'v': args.factor_lr, 'w': args.linear_lr, 'w0': args.bias_lr}
+    wd_config = {'v': args.factor_wd, 'w': args.linear_wd, 'w0': args.bias_wd}
+    init_config = {'v': mx.initializer.Normal(args.factor_sigma),
+                   'w': mx.initializer.Normal(args.linear_sigma),
+                   'w0': mx.initializer.Normal(args.bias_sigma)}
+    model = factorization_machine_model(factor_size, num_features, lr_config, wd_config, init_config)
+
+    # module
+    mod = mx.mod.Module(symbol=model)
+    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    mod.init_params()
+    optimizer_params=(('learning_rate', 1), ('wd', 1), ('beta1', 0.9),
+                      ('beta2', 0.999), ('epsilon', 1e-8))
+    mod.init_optimizer(optimizer='adam', kvstore=kv, optimizer_params=optimizer_params)
+
+    # metrics
+    metric = mx.metric.create(['log_loss'])
+    speedometer = mx.callback.Speedometer(batch_size, log_interval)
+
+    # get the sparse weight parameter
+    w_index = mod._exec_group.param_names.index('w')
+    w_param = mod._exec_group.param_arrays[w_index]
+    v_index = mod._exec_group.param_names.index('v')
+    v_param = mod._exec_group.param_arrays[v_index]
+
+    logging.info('Training started ...')
+    train_iter = iter(train_data)
+    eval_iter = iter(eval_data)
+    for epoch in range(num_epoch):
+        nbatch = 0
+        metric.reset()
+        for batch in train_iter:
+            nbatch += 1
+            # manually pull sparse weights from kvstore so that _square_sum
+            # only computes the rows necessary
+            row_ids = batch.data[0].indices
+            kv.row_sparse_pull('w', w_param, row_ids=[row_ids], priority=-w_index)
+            kv.row_sparse_pull('v', v_param, row_ids=[row_ids], priority=-v_index)
+            mod.forward_backward(batch)
+            # update all parameters (including the weight parameter)
+            mod.update()
+            # update training metric
+            mod.update_metric(metric, batch.label)
+            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                                       eval_metric=metric, locals=locals())
+            speedometer(speedometer_param)
+
+        # pull all updated rows before validation
+        kv.row_sparse_pull('w', w_param, row_ids=[row_ids], priority=-w_index)
+        kv.row_sparse_pull('v', v_param, row_ids=[row_ids], priority=-v_index)
+        # evaluate metric on validation dataset
+        score = mod.score(eval_iter, ['log_loss'])
+        logging.info("epoch %d, eval log loss = %s" % (epoch, score[0][1]))
+        # reset the iterator for next pass of data
+        train_iter.reset()
+        eval_iter.reset()
+    logging.info('Training completed.')
diff --git a/example/sparse/linear_classification/README.md b/example/sparse/linear_classification/README.md
new file mode 100644
index 0000000000..7e2a7ad37f
--- /dev/null
+++ b/example/sparse/linear_classification/README.md
@@ -0,0 +1,17 @@
+Linear Classification Using Sparse Matrix Multiplication
+===========
+This example trains a linear model using the sparse feature in MXNet. This is for demonstration purposes only.
+
+The example utilizes the sparse data loader ([mx.io.LibSVMIter](https://mxnet.incubator.apache.org/versions/master/api/python/io.html#mxnet.io.LibSVMIter)),
+the sparse dot operator and [sparse gradient updaters](https://mxnet.incubator.apache.org/versions/master/api/python/ndarray/sparse.html#updater)
+to train a linear model on the
+[Avazu](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu) click-through-prediction dataset.
+
+The example also shows how to perform distributed training with the sparse feature.
+
+- `python train.py`
+
+Notes on Distributed Training:
+
+- For distributed training, please use the `../../tools/launch.py` script to launch a cluster.
+- For example, to run two workers and two servers with one machine, run `../../../tools/launch.py -n 2 --launcher=local python train.py --kvstore=dist_async`
diff --git a/example/speech-demo/io_func/info.py b/example/sparse/linear_classification/data.py
similarity index 57%
rename from example/speech-demo/io_func/info.py
rename to example/sparse/linear_classification/data.py
index eaf95ab983..02984734fb 100644
--- a/example/speech-demo/io_func/info.py
+++ b/example/sparse/linear_classification/data.py
@@ -15,9 +15,19 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
+import os, gzip
+import sys
+import mxnet as mx
 
-_mydir = os.path.dirname(__file__) or '.'
-
-ROOT  = os.path.abspath(os.path.join(_mydir, "../.."))
-CONFIGS = os.path.join(ROOT, "configs")
+def get_avazu_data(data_dir, data_name, url):
+    if not os.path.isdir(data_dir):
+        os.mkdir(data_dir)
+    os.chdir(data_dir)
+    if (not os.path.exists(data_name)):
+        print("Dataset " + data_name + " not present. Downloading now ...")
+        import urllib
+        zippath = os.path.join(data_dir, data_name + ".bz2")
+        urllib.urlretrieve(url + data_name + ".bz2", zippath)
+        os.system("bzip2 -d %r" % data_name + ".bz2")
+        print("Dataset " + data_name + " is now present.")
+    os.chdir("..")
diff --git a/example/sparse/linear_model.py b/example/sparse/linear_classification/linear_model.py
similarity index 100%
rename from example/sparse/linear_model.py
rename to example/sparse/linear_classification/linear_model.py
diff --git a/example/sparse/linear_classification.py b/example/sparse/linear_classification/train.py
similarity index 95%
rename from example/sparse/linear_classification.py
rename to example/sparse/linear_classification/train.py
index 1d63c55b11..eb7871bbdb 100644
--- a/example/sparse/linear_classification.py
+++ b/example/sparse/linear_classification/train.py
@@ -17,7 +17,7 @@
 
 import mxnet as mx
 from mxnet.test_utils import *
-from get_data import get_libsvm_data
+from data import get_avazu_data
 from linear_model import *
 import argparse
 import os
@@ -67,8 +67,8 @@
     data_dir = os.path.join(os.getcwd(), 'data')
     train_data = os.path.join(data_dir, AVAZU['train'])
     val_data = os.path.join(data_dir, AVAZU['test'])
-    get_libsvm_data(data_dir, AVAZU['train'], AVAZU['url'])
-    get_libsvm_data(data_dir, AVAZU['test'], AVAZU['url'])
+    get_avazu_data(data_dir, AVAZU['train'], AVAZU['url'])
+    get_avazu_data(data_dir, AVAZU['test'], AVAZU['url'])
 
     # data iterator
     train_data = mx.io.LibSVMIter(data_libsvm=train_data, data_shape=(num_features,),
@@ -100,11 +100,10 @@
     speedometer = mx.callback.Speedometer(batch_size, 100)
 
     logging.info('Training started ...')
-    data_iter = iter(train_data)
     for epoch in range(num_epoch):
         nbatch = 0
         metric.reset()
-        for batch in data_iter:
+        for batch in train_data:
             nbatch += 1
             # for distributed training, we need to manually pull sparse weights from kvstore
             if kv:
@@ -129,5 +128,6 @@
         save_optimizer_states = 'dist' not in kv.type if kv else True
         mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=save_optimizer_states)
         # reset the iterator for next pass of data
-        data_iter.reset()
+        train_data.reset()
+        eval_data.reset()
     logging.info('Training completed.')
diff --git a/example/sparse/weighted_softmax_ce.py b/example/sparse/linear_classification/weighted_softmax_ce.py
similarity index 98%
rename from example/sparse/weighted_softmax_ce.py
rename to example/sparse/linear_classification/weighted_softmax_ce.py
index a40ece658e..f781e6ae38 100644
--- a/example/sparse/weighted_softmax_ce.py
+++ b/example/sparse/linear_classification/weighted_softmax_ce.py
@@ -61,7 +61,7 @@ class WeightedSoftmaxCrossEntropyLossProp(mx.operator.CustomOpProp):
     def __init__(self, positive_cls_weight):
         super(WeightedSoftmaxCrossEntropyLossProp, self).__init__(True)
         self.positive_cls_weight = positive_cls_weight
-        assert(positive_cls_weight > 0)
+        assert(float(positive_cls_weight) > 0)
 
     def list_arguments(self):
         return ['data', 'label']
diff --git a/example/sparse/matrix_factorization/README.md b/example/sparse/matrix_factorization/README.md
new file mode 100644
index 0000000000..3ada5e8015
--- /dev/null
+++ b/example/sparse/matrix_factorization/README.md
@@ -0,0 +1,8 @@
+Matrix Factorization w/ Sparse Embedding
+===========
+The example demonstrates the basic usage of the SparseEmbedding operator in MXNet, adapted based on @leopd's recommender examples.
+The operator is available on both CPU and GPU. This is for demonstration purpose only.
+
+- `python train.py`
+- To compare the training speed with (dense) Embedding, run `python train.py --use-dense`
+- To run the example on the GPU, run `python train.py --use-gpu`
diff --git a/example/sparse/get_data.py b/example/sparse/matrix_factorization/data.py
similarity index 58%
rename from example/sparse/get_data.py
rename to example/sparse/matrix_factorization/data.py
index 19c635fe33..c8971651b9 100644
--- a/example/sparse/get_data.py
+++ b/example/sparse/matrix_factorization/data.py
@@ -15,48 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os, gzip
-import sys
+import os
 import mxnet as mx
+from mxnet.test_utils import DummyIter
 
-class DummyIter(mx.io.DataIter):
-    "A dummy iterator that always return the same batch, used for speed testing"
-    def __init__(self, real_iter):
-        super(DummyIter, self).__init__()
-        self.real_iter = real_iter
-        self.provide_data = real_iter.provide_data
-        self.provide_label = real_iter.provide_label
-        self.batch_size = real_iter.batch_size
-
-        for batch in real_iter:
-            self.the_batch = batch
-            break
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        return self.the_batch
-
-def get_libsvm_data(data_dir, data_name, url):
-    if not os.path.isdir(data_dir):
-        os.mkdir(data_dir)
-    os.chdir(data_dir)
-    if (not os.path.exists(data_name)):
-        print("Dataset " + data_name + " not present. Downloading now ...")
-        import urllib
-        zippath = os.path.join(data_dir, data_name + ".bz2")
-        urllib.urlretrieve(url + data_name + ".bz2", zippath)
-        os.system("bzip2 -d %r" % data_name + ".bz2")
-        print("Dataset " + data_name + " is now present.")
-    os.chdir("..")
-
-def get_movielens_data(prefix):
-    if not os.path.exists("%s.zip" % prefix):
-        print("Dataset MovieLens 10M not present. Downloading now ...")
-        os.system("wget http://files.grouplens.org/datasets/movielens/%s.zip" % prefix)
-        os.system("unzip %s.zip" % prefix)
-        os.system("cd ml-10M100K; sh split_ratings.sh; cd -;")
+def get_movielens_data(data_dir, prefix):
+    if not os.path.exists(os.path.join(data_dir, "ml-10M100K")):
+        mx.test_utils.get_zip_data(data_dir,
+                                   "http://files.grouplens.org/datasets/movielens/%s.zip" % prefix,
+                                   prefix + ".zip")
+        assert os.path.exists(os.path.join(data_dir, "ml-10M100K"))
+        os.system("cd data/ml-10M100K; chmod +x allbut.pl; sh split_ratings.sh; cd -;")
 
 def get_movielens_iter(filename, batch_size, dummy_iter):
     """Not particularly fast code to parse the text file and load into NDArrays.
@@ -89,3 +58,5 @@ def get_movielens_iter(filename, batch_size, dummy_iter):
                                    batch_size=batch_size, shuffle=True)
     iter_train = DummyIter(iter_train) if dummy_iter else iter_train
     return mx.io.PrefetchingIter(iter_train)
+
+
diff --git a/example/sparse/matrix_fact_model.py b/example/sparse/matrix_factorization/model.py
similarity index 100%
rename from example/sparse/matrix_fact_model.py
rename to example/sparse/matrix_factorization/model.py
diff --git a/example/sparse/matrix_factorization.py b/example/sparse/matrix_factorization/train.py
similarity index 93%
rename from example/sparse/matrix_factorization.py
rename to example/sparse/matrix_factorization/train.py
index 3387706665..0db58ad524 100644
--- a/example/sparse/matrix_factorization.py
+++ b/example/sparse/matrix_factorization/train.py
@@ -17,12 +17,11 @@
 
 import argparse
 import logging
-import time
 import mxnet as mx
 import numpy as np
-from get_data import get_movielens_iter, get_movielens_data
-from matrix_fact_model import matrix_fact_net
-
+from data import get_movielens_iter, get_movielens_data
+from model import matrix_fact_net
+import os
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -45,8 +44,8 @@
 
 MOVIELENS = {
     'dataset': 'ml-10m',
-    'train': './ml-10M100K/r1.train',
-    'val': './ml-10M100K/r1.test',
+    'train': './data/ml-10M100K/r1.train',
+    'val': './data/ml-10M100K/r1.test',
     'max_user': 71569,
     'max_movie': 65135,
 }
@@ -73,7 +72,8 @@
     # prepare dataset and iterators
     max_user = MOVIELENS['max_user']
     max_movies = MOVIELENS['max_movie']
-    get_movielens_data(MOVIELENS['dataset'])
+    data_dir = os.path.join(os.getcwd(), 'data')
+    get_movielens_data(data_dir, MOVIELENS['dataset'])
     train_iter = get_movielens_iter(MOVIELENS['train'], batch_size, dummy_iter)
     val_iter = get_movielens_iter(MOVIELENS['val'], batch_size, dummy_iter)
 
diff --git a/example/sparse/readme.md b/example/sparse/readme.md
deleted file mode 100644
index e443bfa2d5..0000000000
--- a/example/sparse/readme.md
+++ /dev/null
@@ -1,21 +0,0 @@
-Example
-===========
-This folder contains examples using the sparse feature in MXNet. They are for demonstration purpose only.
-
-## Linear Classification Using Sparse Matrix Multiplication
-
-The example demonstrates the basic usage of the sparse feature in MXNet to speedup computation. It utilizes the sparse data loader, sparse operators and a sparse gradient updater to train a linear model on the [Avazu](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu) click-through-prediction dataset.
-
-- `python linear_classification.py`
-
-Notes on Distributed Training:
-
-- For distributed training, please use the `../../tools/launch.py` script to launch a cluster.
-- For example, to run two workers and two servers with one machine, run `../../tools/launch.py -n 2 --launcher=local python linear_classification.py --kvstore=dist_async`
-
-## Matrix Factorization Using Sparse Embedding
-
-The example demonstrates the basic usage of the SparseEmbedding operator in MXNet, adapted based on @leopd's recommender examples.
-
-- `python matrix_factorization.py`
-- To compare the train speed with (dense) Embedding, run `python matrix_factorization.py --use-dense`
diff --git a/example/sparse/wide_deep/README.md b/example/sparse/wide_deep/README.md
new file mode 100644
index 0000000000..a538106216
--- /dev/null
+++ b/example/sparse/wide_deep/README.md
@@ -0,0 +1,7 @@
+## Wide and Deep Learning
+
+The example demonstrates how to train [wide and deep model](https://arxiv.org/abs/1606.07792). The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) that this example uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). Tricks of feature engineering are adapted from tensorflow's [wide and deep tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep).
+
+The final accuracy should be around 85%.
+
+- `python wide_deep_classification.py`
diff --git a/example/sparse/wide_deep/data.py b/example/sparse/wide_deep/data.py
new file mode 100644
index 0000000000..ffac1eb422
--- /dev/null
+++ b/example/sparse/wide_deep/data.py
@@ -0,0 +1,139 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+from csv import DictReader
+import os
+import mxnet as mx
+import numpy as np
+
+
+def get_uci_adult(data_dir, data_name, url):
+    if not os.path.isdir(data_dir):
+        os.mkdir(data_dir)
+    os.chdir(data_dir)
+    if (not os.path.exists(data_name)):
+        print("Dataset " + data_name + " not present. Downloading now ...")
+        os.system("wget %r" % url + data_name)
+        if "test" in data_name:
+            os.system("sed -i '1d' %r" % data_name)
+        print("Dataset " + data_name + " is now present.")
+    csr, dns, label = preprocess_uci_adult(data_name)
+    os.chdir("..")
+    return csr, dns, label
+
+
+def preprocess_uci_adult(data_name):
+    """Some tricks of feature engineering are adapted
+    from tensorflow's wide and deep tutorial.
+    """
+    csv_columns = [
+        "age", "workclass", "fnlwgt", "education", "education_num",
+        "marital_status", "occupation", "relationship", "race", "gender",
+        "capital_gain", "capital_loss", "hours_per_week", "native_country",
+        "income_bracket"
+    ]
+
+    vocabulary_dict = {
+        "gender": [
+            "Female", "Male"
+        ],
+        "education": [
+            "Bachelors", "HS-grad", "11th", "Masters", "9th",
+            "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+            "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+            "Preschool", "12th"
+        ],
+        "marital_status": [
+            "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+            "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+        ],
+        "relationship": [
+            "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+            "Other-relative"
+        ],
+        "workclass": [
+            "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+            "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+        ]
+    }
+    # wide columns
+    crossed_columns = [
+        ["education", "occupation"],
+        ["native_country", "occupation"],
+        ["age_buckets", "education", "occupation"],
+    ]
+    age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
+    # deep columns
+    indicator_columns = ['workclass', 'education', 'gender', 'relationship']
+    
+    embedding_columns = ['native_country', 'occupation']
+
+    continuous_columns = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
+    # income_bracket column is the label
+    labels = ["<", ">"]
+
+    hash_bucket_size = 1000
+    
+    csr_ncols = len(crossed_columns) * hash_bucket_size
+    dns_ncols = len(continuous_columns) + len(embedding_columns)
+    for col in indicator_columns:
+        dns_ncols += len(vocabulary_dict[col])
+
+    label_list = []
+    csr_list = []
+    dns_list = []
+
+    with open(data_name) as f:
+        for row in DictReader(f, fieldnames=csv_columns):
+            label_list.append(labels.index(row['income_bracket'].strip()[0]))
+
+            for i, cols in enumerate(crossed_columns):
+                if cols[0] == "age_buckets":
+                    age_bucket = np.digitize(float(row["age"]), age_boundaries)
+                    s = '_'.join([row[col].strip() for col in cols[1:]])
+                    s += '_' + str(age_bucket)
+                    csr_list.append((i * hash_bucket_size + hash(s) % hash_bucket_size, 1.0))
+                else:
+                    s = '_'.join([row[col].strip() for col in cols])
+                    csr_list.append((i * hash_bucket_size + hash(s) % hash_bucket_size, 1.0))
+            
+            dns_row = [0] * dns_ncols
+            dns_dim = 0
+            for col in embedding_columns:
+                dns_row[dns_dim] = hash(row[col].strip()) % hash_bucket_size
+                dns_dim += 1
+
+            for col in indicator_columns:
+                dns_row[dns_dim + vocabulary_dict[col].index(row[col].strip())] = 1.0
+                dns_dim += len(vocabulary_dict[col])
+
+            for col in continuous_columns:
+                dns_row[dns_dim] = float(row[col].strip())
+                dns_dim += 1
+
+            dns_list.append(dns_row)
+
+    data_list = [item[1] for item in csr_list]
+    indices_list = [item[0] for item in csr_list]
+    indptr_list = range(0, len(indices_list) + 1, len(crossed_columns))
+    # convert to ndarrays
+    csr = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list),
+                                  shape=(len(label_list), hash_bucket_size * len(crossed_columns)))
+    dns = np.array(dns_list)
+    label = np.array(label_list)
+    return csr, dns, label
diff --git a/example/sparse/wide_deep/model.py b/example/sparse/wide_deep/model.py
new file mode 100644
index 0000000000..e8ba5318b5
--- /dev/null
+++ b/example/sparse/wide_deep/model.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+
+def wide_deep_model(num_linear_features, num_embed_features, num_cont_features, 
+                    input_dims, hidden_units):
+    # wide model
+    csr_data = mx.symbol.Variable("csr_data", stype='csr')
+    label = mx.symbol.Variable("softmax_label")
+
+    norm_init = mx.initializer.Normal(sigma=0.01)
+    # weight with row_sparse storage type to enable sparse gradient updates
+    weight = mx.symbol.Variable("linear_weight", shape=(num_linear_features, 2),
+                                init=norm_init, stype='row_sparse')
+    bias = mx.symbol.Variable("linear_bias", shape=(2,))
+    dot = mx.symbol.sparse.dot(csr_data, weight)
+    linear_out = mx.symbol.broadcast_add(dot, bias)
+    # deep model
+    dns_data = mx.symbol.Variable("dns_data")
+    # embedding features
+    x = mx.symbol.slice(data=dns_data, begin=(0, 0),
+                        end=(None, num_embed_features))
+    embeds = mx.symbol.split(data=x, num_outputs=num_embed_features, squeeze_axis=1)
+    # continuous features
+    x = mx.symbol.slice(data=dns_data, begin=(0, num_embed_features),
+                        end=(None, num_embed_features + num_cont_features))
+    features = [x]
+
+    for i, embed in enumerate(embeds):
+        embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse')
+        features.append(mx.symbol.contrib.SparseEmbedding(data=embed, weight=embed_weight,
+                        input_dim=input_dims[i], output_dim=hidden_units[0]))
+
+    hidden = mx.symbol.concat(*features, dim=1)
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1])
+    hideen = mx.symbol.Activation(data=hidden, act_type='relu')
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[2])
+    hideen = mx.symbol.Activation(data=hidden, act_type='relu')
+    deep_out = mx.symbol.FullyConnected(data=hidden, num_hidden=2)
+
+    out = mx.symbol.SoftmaxOutput(linear_out + deep_out, label, name='model')
+    return out
diff --git a/example/sparse/wide_deep/train.py b/example/sparse/wide_deep/train.py
new file mode 100644
index 0000000000..89befb5aa8
--- /dev/null
+++ b/example/sparse/wide_deep/train.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.test_utils import *
+from data import *
+from model import *
+import argparse
+import os
+
+
+parser = argparse.ArgumentParser(description="Run sparse wide and deep classification ",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-epoch', type=int, default=10,
+                    help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=100,
+                    help='number of examples per batch')
+parser.add_argument('--lr', type=float, default=0.001,
+                    help='learning rate')
+parser.add_argument('--cuda', action='store_true', default=False,
+                    help='Train on GPU with CUDA')
+parser.add_argument('--optimizer', type=str, default='adam',
+                    help='what optimizer to use',
+                    choices=["ftrl", "sgd", "adam"])
+parser.add_argument('--log-interval', type=int, default=100,
+                    help='number of batches to wait before logging training status')
+
+
+# Related to feature engineering, please see preprocess in data.py
+ADULT = {
+    'train': 'adult.data',
+    'test': 'adult.test',
+    'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/',
+    'num_linear_features': 3000,
+    'num_embed_features': 2,
+    'num_cont_features': 38,
+    'embed_input_dims': [1000, 1000],
+    'hidden_units': [8, 50, 100],
+}
+
+
+if __name__ == '__main__':
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
+
+    # arg parser
+    args = parser.parse_args()
+    logging.info(args)
+    num_epoch = args.num_epoch
+    batch_size = args.batch_size
+    optimizer = args.optimizer
+    log_interval = args.log_interval
+    lr = args.lr
+    ctx = mx.gpu(0) if args.cuda else mx.cpu()
+
+    # dataset    
+    data_dir = os.path.join(os.getcwd(), 'data')
+    train_data = os.path.join(data_dir, ADULT['train'])
+    val_data = os.path.join(data_dir, ADULT['test'])
+    train_csr, train_dns, train_label = get_uci_adult(data_dir, ADULT['train'], ADULT['url'])
+    val_csr, val_dns, val_label = get_uci_adult(data_dir, ADULT['test'], ADULT['url'])
+
+    model = wide_deep_model(ADULT['num_linear_features'], ADULT['num_embed_features'],
+                            ADULT['num_cont_features'], ADULT['embed_input_dims'],
+                            ADULT['hidden_units'])
+
+    # data iterator
+    train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns},
+                                   {'softmax_label': train_label}, batch_size,
+                                   shuffle=True, last_batch_handle='discard')
+    eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=True, last_batch_handle='discard')
+    
+    # module
+    mod = mx.mod.Module(symbol=model, context=ctx ,data_names=['csr_data', 'dns_data'],
+                        label_names=['softmax_label'])
+    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    mod.init_params()
+    optim = mx.optimizer.create(optimizer, learning_rate=lr, rescale_grad=1.0/batch_size)
+    mod.init_optimizer(optimizer=optim)
+    # use accuracy as the metric
+    metric = mx.metric.create(['acc'])
+    # get the sparse weight parameter
+    speedometer = mx.callback.Speedometer(batch_size, log_interval)
+
+    logging.info('Training started ...')
+    
+    data_iter = iter(train_data)
+    for epoch in range(num_epoch):
+        nbatch = 0
+        metric.reset()
+        for batch in data_iter:
+            nbatch += 1
+            mod.forward_backward(batch)
+            # update all parameters (including the weight parameter)
+            mod.update()
+            # update training metric
+            mod.update_metric(metric, batch.label)
+            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                                       eval_metric=metric, locals=locals())
+            speedometer(speedometer_param)
+        # evaluate metric on validation dataset
+        score = mod.score(eval_data, ['acc'])
+        logging.info('epoch %d, accuracy = %s' % (epoch, score[0][1]))
+        
+        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=True)
+        # reset the iterator for next pass of data
+        data_iter.reset()
+    
+    logging.info('Training completed.')
diff --git a/example/speech-demo/README.md b/example/speech-demo/README.md
deleted file mode 100644
index 00b0b64f2c..0000000000
--- a/example/speech-demo/README.md
+++ /dev/null
@@ -1,160 +0,0 @@
-Speech Acoustic Modeling Example
-================================
-This folder contains examples for speech recognition.
-
-- [lstm_proj.py](lstm.py): Functions for building a LSTM Network with/without projection layer.
-- [io_util.py](io_util.py): Wrapper functions for `DataIter` over speech data.
-- [train_lstm_proj.py](train_lstm_proj.py): Script for training LSTM acoustic model.
-- [decode_mxnet.py](decode_mxnet.py): Script for decoding LSTMP acoustic model.
-- [default.cfg](default.cfg): Configuration for training on the `AMI` SDM1 dataset. Can be used as a template for writing other configuration files.
-- [python_wrap](python_wrap): C wrappers for Kaldi C++ code, this is built into a .so. Python code that loads the .so and calls the C wrapper functions in `io_func/feat_readers/reader_kaldi.py`.
-
-Connect to Kaldi:
-- [decode_mxnet.sh](decode_mxnet.sh): called by Kaldi to decode a acoustic model trained by mxnet (please select the `simple` method for decoding).
-
-A full receipt:
-- [run_ami.sh](run_ami.sh): a full receipt to train and decode acoustic model on AMI. It takes features and alignment from Kaldi to train an acoustic model and decode it.
-
-To reproduce the results, use the following steps.
-
-### Build Kaldi
-
-Build Kaldi as **shared libraties** if you have not already done so.
-
-```bash
-cd kaldi/src
-./configure --shared # and other options that you need
-make depend
-make
-```
-
-### Build Python Wrapper
-
-1. Copy or link the attached `python_wrap` folder to `kaldi/src`.
-2. Compile python_wrap/
-
-```
-cd kaldi/src/python_wrap/
-make
-```
-
-### Extract Features and Prepare Frame-level Labels
-
-The acoustic models use *Mel filter-bank* or *MFCC* as input features. It also need to use Kaldi to do force-alignment to generate frame-level labels from the text transcriptions. For example, if you want to work on the `AMI` data `SDM1`. You can run `kaldi/egs/ami/s5/run_sdm.sh`. You will need to do some configuration of paths in `kaldi/egs/ami/s5/cmd.sh` and `kaldi/egs/ami/s5/run_sdm.sh` before you can run the examples. Please refer to Kaldi's document for more details.
-
-The default `run_sdm.sh` script generates the force-alignment labels in their stage 7, and saves the force-aligned labels in `exp/sdm1/tri3a_ali`. The default script generates MFCC features (13-dimensional). You can try training with the MFCC features, or you can create Mel filter bank features by your self. For example, a script like this can be used to compute Mel filter bank features using Kaldi.
-
-```bash
-#!/bin/bash -u
-
-. ./cmd.sh
-. ./path.sh
-
-# SDM - Signle Distant Microphone
-micid=1 #which mic from array should be used?
-mic=sdm$micid
-
-# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on :
-# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline',
-set -euxo pipefail
-
-# Path where AMI gets downloaded (or where locally available):
-AMI_DIR=$PWD/wav_db # Default,
-data_dir=$PWD/data/$mic
-
-# make filter bank data
-for dset in train dev eval; do
-  steps/make_fbank.sh --nj 48 --cmd "$train_cmd" $data_dir/$dset \
-    $data_dir/$dset/log $data_dir/$dset/data-fbank
-  steps/compute_cmvn_stats.sh $data_dir/$dset \
-    $data_dir/$dset/log $data_dir/$dset/data
-
-  apply-cmvn --utt2spk=ark:$data_dir/$dset/utt2spk \
-    scp:$data_dir/$dset/cmvn.scp scp:$data_dir/$dset/feats.scp \
-    ark,scp:$data_dir/$dset/feats-cmvn.ark,$data_dir/$dset/feats-cmvn.scp
-
-  mv $data_dir/$dset/feats-cmvn.scp $data_dir/$dset/feats.scp
-done
-```
-Here `apply-cmvn` was for mean-variance normalization. The default setup was applied per speaker. A more common was doing mean-variance normalization for the whole corpus and then feed to the neural networks:
-```
- compute-cmvn-stats scp:data/sdm1/train_fbank/feats.scp data/sdm1/train_fbank/cmvn_g.ark
- apply-cmvn --norm-vars=true data/sdm1/train_fbank/cmvn_g.ark scp:data/sdm1/train_fbank/feats.scp ark,scp:data/sdm1/train_fbank_gcmvn/feats.ark,data/sdm1/train_fbank_gcmvn/feats.scp
-```
-Note that kaldi always try to find features in `feats.scp`. So make sure the normalized features organized as Kaldi way during decoding.
-
-Finally, you need to put the features and labels together in a file so that MXNet can find them. More specifically, for each data set (train, dev, eval), you will need to create a file like `train_mxnet.feats`, will the following contents:
-
-```
-TRANSFORM scp:feat.scp
-scp:label.scp
-```
-
-Here the `TRANSFORM` is the transformation you want to apply to the features. By default we use `NO_FEATURE_TRANSFORM`. The `scp:` syntax is from Kaldi. The `feat.scp` is typically the file from `data/sdm1/train/feats.scp`, and the `label.scp` is converted from the force-aligned labels located in `exp/sdm1/tri3a_ali`. Because the force-alignments are only generated on the training data, we split the training set into 90/10 parts, and use the 1/10 hold-out as the dev set (validation set). The script [run_ami.sh](run_ami.sh) will automatically do the splitting and format the file for MXNet. Please set the path in that script correctly before running. The [run_ami.sh](run_ami.sh) script will actually run the full pipeline including training the acoustic model and decoding. So you can skip the following steps if that scripts successfully runs.
-
-### Run MXNet Acoustic Model Training
-
-1. Go back to this speech demo directory in MXNet. Make a copy of `default.cfg` and edit necessary items like the path to the dataset you just prepared.
-2. Run `python train_lstm.py --configfile=your-config.cfg`. You can do `python train_lstm.py --help` to see the helps. All the configuration parameters can be set in `default.cfg`, customized config file, and through command line (e.g. `--train_batch_size=50`), and the latter values overwrite the former ones.
-
-Here are some example outputs that we got from training on the TIMIT dataset.
-
-```
-Example output for TIMIT:
-Summary of dataset ==================
-bucket of len 100 : 3 samples
-bucket of len 200 : 346 samples
-bucket of len 300 : 1496 samples
-bucket of len 400 : 974 samples
-bucket of len 500 : 420 samples
-bucket of len 600 : 90 samples
-bucket of len 700 : 11 samples
-bucket of len 800 : 2 samples
-Summary of dataset ==================
-bucket of len 100 : 0 samples
-bucket of len 200 : 28 samples
-bucket of len 300 : 169 samples
-bucket of len 400 : 107 samples
-bucket of len 500 : 41 samples
-bucket of len 600 : 6 samples
-bucket of len 700 : 3 samples
-bucket of len 800 : 0 samples
-2016-04-21 20:02:40,904 Epoch[0] Train-Acc_exlude_padding=0.154763
-2016-04-21 20:02:40,904 Epoch[0] Time cost=91.574
-2016-04-21 20:02:44,419 Epoch[0] Validation-Acc_exlude_padding=0.353552
-2016-04-21 20:04:17,290 Epoch[1] Train-Acc_exlude_padding=0.447318
-2016-04-21 20:04:17,290 Epoch[1] Time cost=92.870
-2016-04-21 20:04:20,738 Epoch[1] Validation-Acc_exlude_padding=0.506458
-2016-04-21 20:05:53,127 Epoch[2] Train-Acc_exlude_padding=0.557543
-2016-04-21 20:05:53,128 Epoch[2] Time cost=92.390
-2016-04-21 20:05:56,568 Epoch[2] Validation-Acc_exlude_padding=0.548100
-```
-
-The final frame accuracy was around 62%.
-
-### Run decode on the trained acoustic model
-
-1. Estimate senone priors by run `python make_stats.py --configfile=your-config.cfg | copy-feats ark:- ark:label_mean.ark` (edit necessary items like the path to the training dataset). It will generate the label counts in `label_mean.ark`.
-2. Link to necessary Kaldi decode setup e.g. `local/` and `utils/` and Run `./run_ami.sh --model prefix model --num_epoch num`.
-
-Here are the results on TIMIT and AMI test set (using all default setup, 3 layer LSTM with projection layers):
-
-| Corpus | WER |
-|--------|-----|
-|TIMIT   | 18.9|
-|AMI     | 51.7 (42.2) |
-
-Note that for AMI 42.2 was evaluated non-overlapped speech. Kaldi-HMM baseline was 67.2% and DNN was 57.5%.
-
-### update Feb 07
-
-We had updated this demo on Feb 07 (kaldi c747ed5, mxnet 912a7eb). We had also added timit demo script in this folder. 
-
-To run the timit demo:
-
-1. cd path/to/kaldi/egs/timit/s5/
-2. ./run.sh (setup the kaild timit demo and run it) 
-3. ln -s path/to/mxnet/example/speech-demo/* path/to/kaldi/egs/timit/s5/
-4. set **ali_src, graph_src** and so on in the run_timit.sh and default_timit.cfg to the generated folder in kaldi/egs/timit/s5/exp. In the demo script, we use tri3_ali as the alignment dir
-5. set ydim (in default_timit.cfg) to kaldi/egs/timit/s5/exp/tri3/graph/num_pdfs + 1
-6. ./run_timit.sh
diff --git a/example/speech-demo/config_util.py b/example/speech-demo/config_util.py
deleted file mode 100644
index 6fd6a50a19..0000000000
--- a/example/speech-demo/config_util.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import os
-import sys
-import argparse
-import mxnet as mx
-import numpy as np
-
-if sys.version_info >= (3, 0):
-    import configparser
-else:
-    import ConfigParser as configparser
-
-
-def parse_args():
-    default_cfg = configparser.ConfigParser()
-    default_cfg.read(os.path.join(os.path.dirname(__file__), 'default.cfg'))
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--configfile", help="config file for training parameters")
-
-    # those allow us to overwrite the configs through command line
-    for sec in default_cfg.sections():
-        for name, _ in default_cfg.items(sec):
-            arg_name = '--%s_%s' % (sec, name)
-            doc = 'Overwrite %s in section [%s] of config file' % (name, sec)
-            parser.add_argument(arg_name, help=doc)
-
-    args = parser.parse_args()
-
-    if args.configfile is not None:
-        # now read the user supplied config file to overwrite some values
-        default_cfg.read(args.configfile)
-
-    # now overwrite config from command line options
-    for sec in default_cfg.sections():
-        for name, _ in default_cfg.items(sec):
-            arg_name = ('%s_%s' % (sec, name)).replace('-', '_')
-            if hasattr(args, arg_name) and getattr(args, arg_name) is not None:
-                sys.stderr.write('!! CMDLine overwriting %s.%s:\n' % (sec, name))
-                sys.stderr.write("    '%s' => '%s'\n" % (default_cfg.get(sec, name),
-                                 getattr(args, arg_name)))
-                default_cfg.set(sec, name, getattr(args, arg_name))
-
-    args.config = default_cfg
-    sys.stderr.write("="*80+"\n")
-    return args
-
-
-def get_checkpoint_path(args):
-    prefix = args.config.get('train', 'prefix')
-    if os.path.isabs(prefix):
-        return prefix
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), 'checkpoints', prefix))
-
-
-def parse_contexts(args):
-    # parse context into Context objects
-    contexts = re.split(r'\W+', args.config.get('train', 'context'))
-    for i, ctx in enumerate(contexts):
-        if ctx[:3] == 'gpu':
-            contexts[i] = mx.context.gpu(int(ctx[3:]))
-        else:
-            contexts[i] = mx.context.cpu(int(ctx[3:]))
-    return contexts
diff --git a/example/speech-demo/decode_mxnet.py b/example/speech-demo/decode_mxnet.py
deleted file mode 100644
index deb9c30d79..0000000000
--- a/example/speech-demo/decode_mxnet.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import sys
-sys.path.insert(0, "../../python")
-import time
-import logging
-import os.path
-
-import mxnet as mx
-import numpy as np
-
-from lstm_proj import lstm_unroll
-from io_util import BucketSentenceIter, TruncatedSentenceIter, SimpleIter, DataReadStream
-from config_util import parse_args, get_checkpoint_path, parse_contexts
-
-from io_func.feat_readers.writer_kaldi import KaldiWriteOut
-
-# some constants
-METHOD_BUCKETING = 'bucketing'
-METHOD_TBPTT = 'truncated-bptt'
-METHOD_SIMPLE = 'simple'
-
-def prepare_data(args):
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_hidden_proj = args.config.getint('arch', 'num_hidden_proj')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-
-    init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-    if num_hidden_proj > 0:
-        init_h = [('l%d_init_h'%l, (batch_size, num_hidden_proj)) for l in range(num_lstm_layer)]
-    else:
-        init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-
-    init_states = init_c + init_h
-
-    file_test = args.config.get('data', 'test')
-    file_label_mean =  args.config.get('data', 'label_mean')
-    file_format = args.config.get('data', 'format')
-    feat_dim = args.config.getint('data', 'xdim')
-    label_dim = args.config.getint('data', 'ydim')
-
-    test_data_args = {
-            "gpu_chunk": 32768,
-            "lst_file": file_test,
-            "file_format": file_format,
-            "separate_lines":True,
-            "has_labels":False
-            }
-
-    label_mean_args = {
-            "gpu_chunk": 32768,
-            "lst_file": file_label_mean,
-            "file_format": file_format,
-            "separate_lines":True,
-            "has_labels":False
-            }
-
-    test_sets = DataReadStream(test_data_args, feat_dim)
-    label_mean_sets = DataReadStream(label_mean_args, label_dim)
-    return (init_states, test_sets, label_mean_sets)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    args.config.write(sys.stderr)
-
-    decoding_method = args.config.get('train', 'method')
-    contexts = parse_contexts(args)
-
-    init_states, test_sets, label_mean_sets = prepare_data(args)
-    state_names = [x[0] for x in init_states]
-
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_hidden_proj = args.config.getint('arch', 'num_hidden_proj')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-    feat_dim = args.config.getint('data', 'xdim')
-    label_dim = args.config.getint('data', 'ydim')
-    out_file = args.config.get('data', 'out_file')
-    num_epoch = args.config.getint('train', 'num_epoch')
-    model_name = get_checkpoint_path(args)
-    logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')
-
-    # load the model
-    sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epoch)
-
-    if decoding_method == METHOD_BUCKETING:
-        buckets = args.config.get('train', 'buckets')
-        buckets = list(map(int, re.split(r'\W+', buckets)))
-        data_test   = BucketSentenceIter(test_sets, buckets, batch_size, init_states, feat_dim=feat_dim, has_label=False)
-        def sym_gen(seq_len):
-            sym = lstm_unroll(num_lstm_layer, seq_len, feat_dim, num_hidden=num_hidden,
-                              num_label=label_dim, take_softmax=True, num_hidden_proj=num_hidden_proj)
-            data_names = ['data'] + state_names
-            label_names = ['softmax_label']
-            return (sym, data_names, label_names)
-
-        module = mx.mod.BucketingModule(sym_gen,
-                            default_bucket_key=data_test.default_bucket_key,
-                            context=contexts)
-    elif decoding_method == METHOD_SIMPLE:
-        data_test = SimpleIter(test_sets, batch_size, init_states, feat_dim=feat_dim, label_dim=label_dim,
-                label_mean_sets=label_mean_sets, has_label=False)
-        def sym_gen(seq_len):
-            sym = lstm_unroll(num_lstm_layer, seq_len, feat_dim, num_hidden=num_hidden,
-                              num_label=label_dim, take_softmax=False, num_hidden_proj=num_hidden_proj)
-            data_names = ['data'] + state_names
-            label_names = []
-            return (sym, data_names, label_names)
-
-        module = mx.mod.BucketingModule(sym_gen,
-                            default_bucket_key=data_test.default_bucket_key,
-                            context=contexts)
-
-    else:
-        truncate_len=20
-        data_test = TruncatedSentenceIter(test_sets, batch_size, init_states,
-                                         truncate_len, feat_dim=feat_dim,
-                                         do_shuffling=False, pad_zeros=True, has_label=True)
-
-        sym = lstm_unroll(num_lstm_layer, truncate_len, feat_dim, num_hidden=num_hidden,
-                          num_label=label_dim, output_states=True, num_hidden_proj=num_hidden_proj)
-        data_names = [x[0] for x in data_test.provide_data]
-        label_names = ['softmax_label']
-        module = mx.mod.Module(sym, context=contexts, data_names=data_names,
-                               label_names=label_names)
-    # set the parameters
-    module.bind(data_shapes=data_test.provide_data, label_shapes=None, for_training=False)
-    module.set_params(arg_params=arg_params, aux_params=aux_params)
-
-    kaldiWriter = KaldiWriteOut(None, out_file)
-    kaldiWriter.open_or_fd()
-    for preds, i_batch, batch in module.iter_predict(data_test):
-        label = batch.label[0].asnumpy().astype('int32')
-        posteriors = preds[0].asnumpy().astype('float32')
-        # copy over states
-        if decoding_method == METHOD_BUCKETING:
-            for (ind, utt) in enumerate(batch.utt_id):
-                if utt != "GAP_UTT":
-                    posteriors = np.log(posteriors[:label[0][0],1:] + 1e-20) - np.log(data_train.label_mean).T
-                    kaldiWriter.write(utt, posteriors)
-        elif decoding_method == METHOD_SIMPLE:
-            for (ind, utt) in enumerate(batch.utt_id):
-                if utt != "GAP_UTT":
-                    posteriors = posteriors[:batch.utt_len[0],1:] - np.log(data_test.label_mean[1:]).T
-                    kaldiWriter.write(utt, posteriors)
-        else:
-            outputs = module.get_outputs()
-            # outputs[0] is softmax, 1:end are states
-            for i in range(1, len(outputs)):
-                outputs[i].copyto(data_test.init_state_arrays[i-1])
-            for (ind, utt) in enumerate(batch.utt_id):
-                if utt != "GAP_UTT":
-                    posteriors = np.log(posteriors[:,1:])# - np.log(data_train.label_mean).T
-                    kaldiWriter.write(utt, posteriors)
-
-
-    kaldiWriter.close()
-    args.config.write(sys.stderr)
-
diff --git a/example/speech-demo/decode_mxnet.sh b/example/speech-demo/decode_mxnet.sh
deleted file mode 100755
index d300d0e91c..0000000000
--- a/example/speech-demo/decode_mxnet.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# Copyright 2012-2013 Karel Vesely, Daniel Povey
-# 	    2015 Yu Zhang
-# Apache 2.0
-
-# Begin configuration section.
-nnet= # Optionally pre-select network to use for getting state-likelihoods
-feature_transform= # Optionally pre-select feature transform (in front of nnet)
-model= # Optionally pre-select transition model
-class_frame_counts= # Optionally pre-select class-counts used to compute PDF priors
-
-stage=0 # stage=1 skips lattice generation
-nj=4
-cmd=run.pl
-max_active=7000 # maximum of active tokens
-min_active=200 #minimum of active tokens
-max_mem=50000000 # limit the fst-size to 50MB (larger fsts are minimized)
-beam=13.0 # GMM:13.0
-latbeam=8.0 # GMM:6.0
-acwt=0.10 # GMM:0.0833, note: only really affects pruning (scoring is on lattices).
-scoring_opts="--min-lmwt 1 --max-lmwt 10"
-skip_scoring=false
-use_gpu_id=-1 # disable gpu
-#parallel_opts="-pe smp 2" # use 2 CPUs (1 DNN-forward, 1 decoder)
-parallel_opts= # use 2 CPUs (1 DNN-forward, 1 decoder)
-# End configuration section.
-
-echo "$0 $@"  # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh; # source the path.
-. parse_options.sh || exit 1;
-
-graphdir=$1
-data=$2
-dir=$3
-srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
-sdata=$data/split$nj;
-
-mxstring=$4
-
-mkdir -p $dir/log
-[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
-echo $nj > $dir/num_jobs
-
-if [ -z "$model" ]; then # if --model <mdl> was not specified on the command line...
-  if [ -z $iter ]; then model=$srcdir/final.mdl;
-  else model=$srcdir/$iter.mdl; fi
-fi
-
-for f in $model $graphdir/HCLG.fst; do
-  [ ! -f $f ] && echo "decode_mxnet.sh: no such file $f" && exit 1;
-done
-
-
-# check that files exist
-for f in $sdata/1/feats.scp $model $graphdir/HCLG.fst; do
-  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
-done
-
-# PREPARE THE LOG-POSTERIOR COMPUTATION PIPELINE
-if [ -z "$class_frame_counts" ]; then
-  class_frame_counts=$srcdir/ali_train_pdf.counts
-else
-  echo "Overriding class_frame_counts by $class_frame_counts"
-fi
-
-# Create the feature stream:
-feats="scp:$sdata/JOB/feats.scp"
-inputfeats="$sdata/JOB/mxnetInput.scp"
-
-
-if [ -f $sdata/1/feats.scp ]; then
-    $cmd JOB=1:$nj $dir/log/make_input.JOB.log \
-        echo NO_FEATURE_TRANSFORM scp:$sdata/JOB/feats.scp \> $inputfeats
-fi
-
-# Run the decoding in the queue
-if [ $stage -le 0 ]; then
-  $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \
-    $mxstring --data_test $inputfeats \| \
-    latgen-faster-mapped --min-active=$min_active --max-active=$max_active --max-mem=$max_mem --beam=$beam --lattice-beam=$latbeam \
-    --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
-    $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
-fi
-
-# Run the scoring
-if ! $skip_scoring ; then
-  [ ! -x local/score.sh ] && \
-    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
-  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1;
-fi
-
-exit 0;
diff --git a/example/speech-demo/default.cfg b/example/speech-demo/default.cfg
deleted file mode 100644
index 072a4aeba8..0000000000
--- a/example/speech-demo/default.cfg
+++ /dev/null
@@ -1,50 +0,0 @@
-[data]
-kaldi_root =
-train = /home/chiyuan/download/kaldi/egs/ami/s5/exp/sdm1/data-for-mxnet/train.feats
-dev = /home/chiyuan/download/kaldi/egs/ami/s5/exp/sdm1/data-for-mxnet/dev.feats
-test =
-out_file = |
-format = kaldi
-xdim = 40
-ydim = 3920
-label_mean = label_mean.feats
-[arch]
-num_hidden = 1024
-# set it to zero if you want a regular LSTM
-num_hidden_proj = 512
-num_lstm_layer = 3
-
-[train]
-batch_size = 40
-buckets = 100, 200, 300, 400, 500, 600, 700, 800
-num_epoch = 12
-
-# used only if method is truncated-bptt
-truncate_len = 20
-
-# gpu0, gpu1
-context = gpu0
-
-# bucketing, truncated-bptt
-method = truncated-bptt
-
-# checkpoint prefix
-prefix = ami
-
-learning_rate = 1
-decay_factor = 2
-decay_lower_bound = 1e-6
-
-optimizer = speechSGD
-momentum = 0.9
-
-# set to 0 to disable gradient clipping
-clip_gradient = 0
-
-# uniform, normal, xavier
-initializer = Uniform
-init_scale = 0.05
-weight_decay = 0.008
-
-# show progress every how many batches
-show_every = 1000
diff --git a/example/speech-demo/default_timit.cfg b/example/speech-demo/default_timit.cfg
deleted file mode 100644
index 2e0cd2a635..0000000000
--- a/example/speech-demo/default_timit.cfg
+++ /dev/null
@@ -1,52 +0,0 @@
-[data]
-kaldi_root =
-train = /home/sooda/speech/kaldi/egs/timit/s5/data/train/train.feats
-dev = /home/sooda/speech/kaldi/egs/timit/s5/data/dev/dev.feats
-test =
-out_file = |
-format = kaldi
-xdim = 13
-ydim = 1939
-#ydim = 1909
-label_mean = label_mean.feats
-[arch]
-num_hidden = 1024
-# set it to zero if you want a regular LSTM
-num_hidden_proj = 512
-num_lstm_layer = 3
-
-[train]
-batch_size = 40
-buckets = 100, 200, 300, 400, 500, 600, 700, 800
-num_epoch = 12
-
-# used only if method is truncated-bptt
-truncate_len = 20
-
-# gpu0, gpu1
-context = gpu0
-
-# bucketing, truncated-bptt
-method = truncated-bptt
-#method = bucketing
-
-# checkpoint prefix
-prefix = timit
-
-learning_rate = 1
-decay_factor = 2
-decay_lower_bound = 1e-6
-
-optimizer = speechSGD
-momentum = 0.9
-
-# set to 0 to disable gradient clipping
-clip_gradient = 0
-
-# uniform, normal, xavier
-initializer = Uniform
-init_scale = 0.05
-weight_decay = 0.008
-
-# show progress every how many batches
-show_every = 1000
diff --git a/example/speech-demo/io_func/__init__.py b/example/speech-demo/io_func/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/example/speech-demo/io_func/convert2kaldi.py b/example/speech-demo/io_func/convert2kaldi.py
deleted file mode 100644
index eac8ee695a..0000000000
--- a/example/speech-demo/io_func/convert2kaldi.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Copyright 2013    Yajie Miao    Carnegie Mellon University
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import os
-import sys
-
-from StringIO import StringIO
-import json
-import utils.utils as utils
-from model_io import string_2_array
-
-# Various functions to convert models into Kaldi formats
-def _nnet2kaldi(nnet_spec, set_layer_num = -1, filein='nnet.in',
-               fileout='nnet.out', activation='sigmoid', withfinal=True):
-    _nnet2kaldi_main(nnet_spec, set_layer_num=set_layer_num, filein=filein,
-                    fileout=fileout, activation=activation, withfinal=withfinal, maxout=False)
-
-def _nnet2kaldi_maxout(nnet_spec, pool_size = 1, set_layer_num = -1,
-                      filein='nnet.in', fileout='nnet.out', activation='sigmoid', withfinal=True):
-    _nnet2kaldi_main(nnet_spec, set_layer_num=set_layer_num, filein=filein,
-                    fileout=fileout, activation=activation, withfinal=withfinal,
-                    pool_size = 1, maxout=True)
-
-def _nnet2kaldi_main(nnet_spec, set_layer_num = -1, filein='nnet.in',
-               fileout='nnet.out', activation='sigmoid', withfinal=True, maxout=False):
-    elements = nnet_spec.split(':')
-    layers = []
-    for x in elements:
-        layers.append(int(x))
-    if set_layer_num == -1:
-        layer_num = len(layers) - 1
-    else:
-        layer_num = set_layer_num + 1
-    nnet_dict = {}
-    nnet_dict = utils.pickle_load(filein)
-
-    fout = open(fileout, 'wb')
-    for i in xrange(layer_num - 1):
-        input_size = int(layers[i])
-        if maxout:
-            output_size = int(layers[i + 1]) * pool_size
-        else:
-            output_size = int(layers[i + 1])
-        W_layer = []
-        b_layer = ''
-        for rowX in xrange(output_size):
-            W_layer.append('')
-
-        dict_key = str(i) + ' ' + activation + ' W'
-        matrix = string_2_array(nnet_dict[dict_key])
-
-        for x in xrange(input_size):
-            for t in xrange(output_size):
-                W_layer[t] = W_layer[t] + str(matrix[x][t]) + ' '
-
-        dict_key = str(i) + ' ' + activation + ' b'
-        vector = string_2_array(nnet_dict[dict_key])
-        for x in xrange(output_size):
-            b_layer = b_layer + str(vector[x]) + ' '
-
-        fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
-        fout.write('[' + '\n')
-        for x in xrange(output_size):
-            fout.write(W_layer[x].strip() + '\n')
-        fout.write(']' + '\n')
-        fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
-        if maxout:
-            fout.write('<maxout> ' + str(int(layers[i + 1])) + ' ' + str(output_size) + '\n')
-        else:
-            fout.write('<sigmoid> ' + str(output_size) + ' ' + str(output_size) + '\n')
-
-    if withfinal:
-        input_size = int(layers[-2])
-        output_size = int(layers[-1])
-        W_layer = []
-        b_layer = ''
-        for rowX in xrange(output_size):
-            W_layer.append('')
-
-        dict_key = 'logreg W'
-        matrix = string_2_array(nnet_dict[dict_key])
-        for x in xrange(input_size):
-            for t in xrange(output_size):
-                W_layer[t] = W_layer[t] + str(matrix[x][t]) + ' '
-
-
-        dict_key = 'logreg b'
-        vector = string_2_array(nnet_dict[dict_key])
-        for x in xrange(output_size):
-            b_layer = b_layer + str(vector[x]) + ' '
-
-        fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
-        fout.write('[' + '\n')
-        for x in xrange(output_size):
-            fout.write(W_layer[x].strip() + '\n')
-        fout.write(']' + '\n')
-        fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
-        fout.write('<softmax> ' + str(output_size) + ' ' + str(output_size) + '\n')
-
-    fout.close();
diff --git a/example/speech-demo/io_func/feat_io.py b/example/speech-demo/io_func/feat_io.py
deleted file mode 100644
index 6a7e424d1e..0000000000
--- a/example/speech-demo/io_func/feat_io.py
+++ /dev/null
@@ -1,412 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import os
-import sys
-import random
-import shlex
-import time
-import re
-
-from utils import to_bool
-from .feat_readers.common import *
-from .feat_readers import stats
-
-class DataReadStream(object):
-
-    SCHEMA = {
-        "type": "object",
-        "properties": {
-            "gpu_chunk": {"type": ["string", "integer"], "required": False},
-
-            "lst_file": {"type": "string"},
-            "separate_lines": {"type": ["string", "integer", "boolean"], "required": False},
-            "has_labels": {"type": ["string", "integer", "boolean"], "required": False},
-
-            "file_format": {"type": "string"},
-            "train_stat": {"type": "string", "required": False},
-            "offset_labels": {"type": ["string", "integer", "boolean"], "required": False},
-
-            #"XXXchunk": {"type": ["string", "integer"], "required": False},
-            "max_feats": {"type": ["string", "integer"], "required": False},
-            "shuffle": {"type": ["string", "integer", "boolean"], "required": False},
-
-            "seed": {"type": ["string", "integer"], "required": False},
-            "_num_splits": {"type": ["string", "integer"], "required": False},
-            "_split_id": {"type": ["string", "integer"], "required": False}
-        }
-    }
-
-    END_OF_DATA = -1
-    END_OF_PARTITION = -2
-    END_OF_SEQ = (None, None, None)
-    def __init__(self, dataset_args, n_ins):
-
-        # stats
-        self.mean = None
-        self.std = None
-        if 'train_stat' in dataset_args.keys():
-            train_stat = dataset_args['train_stat']
-            featureStats = stats.FeatureStats()
-            featureStats.Load(train_stat)
-            self.mean = featureStats.GetMean()
-            self.std = featureStats.GetInvStd()
-
-        # open lstfile
-        file_path = dataset_args["lst_file"]
-        if file_path.endswith('.gz'):
-            file_read = gzip.open(file_path, 'r')
-        else:
-            file_read = open(file_path, 'r')
-
-        separate_lines = False
-        if "separate_lines" in dataset_args:
-            separate_lines = to_bool(dataset_args["separate_lines"])
-
-        self.has_labels = True
-        if "has_labels" in dataset_args:
-            self.has_labels = to_bool(dataset_args["has_labels"])
-
-        # parse it, file_lst is a list of (featureFile, labelFile) pairs in the input set
-        lines = [ln.strip() for ln in file_read]
-        lines = [ln for ln in lines if ln != "" ]
-
-        if self.has_labels:
-            if separate_lines:
-                if len(lines) % 2 != 0:
-                    print("List has mis-matched number of feature files and label files")
-                    sys.exit(1)
-                self.orig_file_lst = []
-                for i in xrange(0, len(lines), 2):
-                    self.orig_file_lst.append((lines[i], lines[i+1]))
-            else:
-                self.orig_file_lst = []
-                for i in xrange(len(lines)):
-                    pair = re.compile("\s+").split(lines[i])
-                    if len(pair) != 2:
-                        print(lines[i])
-                        print("Each line in the train and eval lists must contain feature file and label file separated by space character")
-                        sys.exit(1)
-                    self.orig_file_lst.append(pair)
-        else:
-            # no labels
-            self.orig_file_lst = []
-            for i in xrange(0, len(lines), 1):
-                self.orig_file_lst.append((lines[i], None))
-
-        # save arguments
-
-        self.n_ins = n_ins
-        self.file_format = dataset_args['file_format']
-
-        self.file_format = "htk"
-        if 'file_format' in dataset_args:
-            self.file_format = dataset_args['file_format']
-
-        self.offsetLabels = False
-        if 'offset_labels' in dataset_args:
-            self.offsetLabels = to_bool(dataset_args['offset_labels'])
-
-        self.chunk_size = 32768
-        if 'gpu_chunk' in dataset_args:
-            self.chunk_size = int(dataset_args['gpu_chunk'])
-
-        self.maxFeats = 0
-        if "max_feats" in dataset_args:
-            self.maxFeats = int(dataset_args["max_feats"])
-        if self.maxFeats == 0:
-            self.maxFeats = sys.maxint
-
-        self.shuffle = True
-        if 'shuffle' in dataset_args:
-            self.shuffle = to_bool(dataset_args['shuffle'])
-
-        self.seed = None
-        if "seed" in dataset_args:
-            self.seed = int(dataset_args["seed"])
-
-        if int("_split_id" in dataset_args) + int("_num_splits" in dataset_args) == 1:
-            raise Exception("_split_id must be used with _num_splits")
-        self.num_splits = 0
-        if "_num_splits" in dataset_args:
-            self.num_splits = int(dataset_Args["_num_splits"])
-            self.split_id = dataset_args["_split_id"]
-
-        # internal state
-        self.split_parts = False
-        self.by_matrix = False
-        self.x = numpy.zeros((self.chunk_size, self.n_ins), dtype=numpy.float32)
-        if self.has_labels:
-            self.y = numpy.zeros((self.chunk_size,), dtype=numpy.int32)
-        else:
-            self.y = None
-        self.numpy_rng = numpy.random.RandomState(self.seed)
-
-        #self.make_shared()
-        self.initialize_read()
-
-    def read_by_part(self):
-        if self.file_format in ["kaldi"]:
-            self.read_by_matrix()
-        else:   # htk
-            self.split_parts = True
-
-    def read_by_matrix(self):
-        self.by_matrix = True
-
-
-    def get_shared(self):
-        return self.shared_x, self.shared_y
-
-    def initialize_read(self):
-        self.file_lst = self.orig_file_lst[:]
-        if self.shuffle:
-            self.numpy_rng.shuffle(self.file_lst)
-        self.fileIndex = 0
-        self.totalFrames = 0
-        self.reader = None
-        self.crossed_part = False
-        self.done = False
-        self.utt_id = None
-        self.queued_feats = None
-        self.queued_tgts = None
-
-    def _end_of_data(self):
-        return self.totalFrames >= self.maxFeats or self.fileIndex >= len(self.file_lst)
-
-    def _queue_get(self, at_most):
-        # if we have frames/labels queued, return at_most of those and queue the rest
-        if self.queued_feats is None:
-            return None
-
-        num_queued = self.queued_feats.shape[0]
-        at_most = min(at_most, num_queued)
-
-        if at_most == num_queued:   # no leftover after the split
-            feats, tgts = self.queued_feats, self.queued_tgts
-            self.queued_feats = None
-            self.queued_tgts = None
-        else:
-            feats, self.queued_feats = numpy.array_split(self.queued_feats, [at_most])
-            if self.queued_tgts is not None:
-                tgts, self.queued_tgts = numpy.array_split(self.queued_tgts, [at_most])
-            else:
-                tgts = None
-
-        return feats, tgts
-
-    def _queue_excess(self, at_most, feats, tgts):
-        assert(self.queued_feats is None)
-        num_supplied = feats.shape[0]
-
-        if num_supplied > at_most:
-            feats, self.queued_feats = numpy.array_split(feats, [at_most])
-            if tgts is not None:
-                tgts, self.queued_tgts = numpy.array_split(tgts, [at_most])
-
-        return feats, tgts
-
-    # Returns frames/labels (if there are any) or None (otherwise) for current partition
-    # Always set the pointers to the next partition
-    def _load_fn(self, at_most):
-        tup = self._queue_get(at_most)
-        if tup is not None:
-            return tup
-
-        if self.reader is None:
-            featureFile, labelFile = self.file_lst[self.fileIndex]
-            self.reader = getReader(self.file_format, featureFile, labelFile)
-
-        if self.reader.IsDone():
-            self.fileIndex += 1
-            self.reader.Cleanup()
-            self.reader = None # cleanup
-            return None
-
-        tup = self.reader.Read()
-        if tup is None:
-            self.fileIndex += 1
-            self.reader.Cleanup()
-            self.reader = None # cleanup
-            return None
-
-        feats, tgts = tup
-
-        # normalize here
-        if self.mean is not None:
-            feats -= self.mean
-        if self.std is not None:
-            feats *= self.std
-
-        self.utt_id = self.reader.GetUttId()
-
-        if feats.shape[1] != self.n_ins:
-            errMs = "Dimension of features read does not match specified dimensions".format(feats.shape[1], self.n_ins)
-
-        if self.has_labels and tgts is not None:
-            if feats.shape[0] != tgts.shape[0]:
-                errMs = "Number of frames in feature ({}) and label ({}) files does not match".format(self.featureFile, self.labelFile)
-                raise FeatureException(errMsg)
-
-            if self.offsetLabels:
-                tgts = numpy.add(tgts, - 1)
-
-        feats, tgts = self._queue_excess(at_most, feats, tgts)
-
-        return feats, tgts
-
-    def current_utt_id(self):
-        assert(self.by_matrix or self.split_parts)
-        return self.utt_id
-
-    def load_next_seq(self):
-        if self.done:
-            return DataReadStream.END_OF_SEQ
-        if self._end_of_data():
-            if self.reader is not None:
-                self.reader.Cleanup()
-            self.reader = None
-            self.done = True
-            return DataReadStream.END_OF_SEQ
-
-        num_feats = 0
-        old_fileIndes = self.fileIndex
-
-        self.utt_id = None
-
-        tup  = self._load_fn(self.chunk_size)
-        if tup is None:
-            return DataReadStream.END_OF_SEQ
-        (loaded_feats, loaded_tgts) = tup
-        return loaded_feats, loaded_tgts, self.utt_id
-
-
-    def load_next_block(self):
-        # if anything left...
-        # set_value
-
-        if self.crossed_part:
-            self.crossed_part = False
-            if not self.by_matrix: #    <--- THERE IS A BUG IN THIS
-                return DataReadStream.END_OF_PARTITION
-        if self.done:
-            return DataReadStream.END_OF_DATA
-        if self._end_of_data():
-            if self.reader is not None:
-                self.reader.Cleanup()
-            self.reader = None # cleanup
-            self.done = True
-            return DataReadStream.END_OF_DATA
-
-        # keep loading features until we pass a partition or EOF
-
-        num_feats = 0
-        old_fileIndex = self.fileIndex
-
-        self.utt_id = None
-
-        while num_feats < self.chunk_size:
-            if self.split_parts:
-                if old_fileIndex != self.fileIndex:
-                    self.crossed_part = True
-                    break
-
-            if self._end_of_data():
-                break
-
-            tup = self._load_fn(self.chunk_size - num_feats)
-            if tup is None:
-                continue
-
-            (loaded_feat, loaded_label) = tup
-
-            if self.has_labels and loaded_label is None:
-                print(sys.stderr, "Missing labels for: ", self.utt_id)
-                continue
-
-            numFrames = loaded_feat.shape[0]
-
-            # limit loaded_feat, loaded_label, and numFrames to maximum allowed
-            allowed = self.maxFeats - self.totalFrames
-            if numFrames > allowed:
-                loaded_feat = loaded_feat[0:allowed]
-                if self.has_labels:
-                    loaded_label = loaded_label[0:allowed]
-                numFrames = allowed
-                assert(numFrames == loaded_feat.shape[0])
-
-            self.totalFrames += numFrames
-            new_num_feats = num_feats + numFrames
-
-            # if the x and y buffers are too small, make bigger ones
-            # not possible any more; buffers are always fixed
-            """
-            if new_num_feats > self.x.shape[0]:
-                newx = numpy.zeros((new_num_feats, self.n_ins), dtype=numpy.float32)
-                newx[0:num_feats] = self.x[0:num_feats]
-                self.x = newx
-
-                if self.has_labels:
-                    newy = numpy.zeros((new_num_feats,), dtype=numpy.int32)
-                    newy[0:num_feats] = self.y[0:num_feats]
-                    self.y = newy
-            """
-
-            # place into [num_feats:num_feats+num_loaded]
-            self.x[num_feats:new_num_feats] = loaded_feat
-            if self.has_labels:
-                self.y[num_feats:new_num_feats] = loaded_label
-
-            num_feats = new_num_feats
-
-            if self.by_matrix:
-                break
-
-        # if we loaded features, shuffle and copy to shared
-        if num_feats != 0:
-
-            if self.shuffle:
-                x = self.x[0:num_feats]
-                state = self.numpy_rng.get_state()
-                self.numpy_rng.shuffle(x)
-                self.x[0:num_feats] = x
-
-                if self.has_labels:
-                    y = self.y[0:num_feats]
-                    self.numpy_rng.set_state(state)
-                    self.numpy_rng.shuffle(y)
-                    self.y[0:num_feats] = y
-
-            assert(self.x.shape == (self.chunk_size, self.n_ins))
-            self.shared_x.set_value(self.x, borrow = True)
-            if self.has_labels:
-                self.shared_y.set_value(self.y, borrow = True)
-
-            #import hashlib
-            #print self.totalFrames, self.x.sum(), hashlib.sha1(self.x.view(numpy.float32)).hexdigest()
-
-            if self.by_matrix:
-                self.crossed_part = True
-
-        return num_feats
-
-    def get_state(self):
-        return self.numpy_rng.get_state()
-
-    def set_state(self, state):
-        self.numpy_rng.set_state(state)
diff --git a/example/speech-demo/io_func/feat_readers/__init__.py b/example/speech-demo/io_func/feat_readers/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/example/speech-demo/io_func/feat_readers/common.py b/example/speech-demo/io_func/feat_readers/common.py
deleted file mode 100644
index 742d3e25a1..0000000000
--- a/example/speech-demo/io_func/feat_readers/common.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy
-import os
-
-class ByteOrder:
-	LittleEndian, BigEndian    = range(2)
-
-class FeatureException(Exception):
-	def __init__(self,msg):
-		self.msg = msg
-	def __str__(self):
-		return repr(self.msg)
-
-def ReadLabel(filename):
-	labels = numpy.loadtxt(filename, ndmin=1)
-	return labels.astype(numpy.int32)
-
-class BaseReader():
-	def __init__(self, featureFile, labelFile, byteOrder=None):
-		self.byteOrder = byteOrder
-		self.featureFile = featureFile
-		self.labelFile = labelFile
-		self.done = False
-
-	def _markDone(self):
-		self.done = True
-
-	def IsDone(self):
-		return self.done
-
-	def Read(self):
-		pass
-
-	def Cleanup(self):
-		pass
-
-	# no slashes or weird characters
-	def GetUttId(self):
-		return os.path.basename(self.featureFile)
-
-def getReader(fileformat, featureFile, labelFile):
-	if fileformat.lower() == 'htk':
-		import reader_htk
-		return reader_htk.htkReader(featureFile, labelFile, ByteOrder.BigEndian)
-	elif fileformat.lower() == 'htk_little':
-		import reader_htk
-		return reader_htk.htkReader(featureFile, labelFile, ByteOrder.LittleEndian)
-	elif fileformat.lower() == 'bvec':
-		import reader_bvec
-		return reader_bvec.bvecReader(featureFile, labelFile)
-	elif fileformat.lower() == 'atrack':
-		import reader_atrack
-		return reader_atrack.atrackReader(featureFile, labelFile)
-	elif fileformat.lower() == 'kaldi':
-		import reader_kaldi
-		return reader_kaldi.kaldiReader(featureFile, labelFile)
-	else:
-		msg = "Error: Specified format '{}' is not supported".format(fileformat)
-		raise Exception(msg)
diff --git a/example/speech-demo/io_func/feat_readers/reader_atrack.py b/example/speech-demo/io_func/feat_readers/reader_atrack.py
deleted file mode 100644
index e8db0fd14d..0000000000
--- a/example/speech-demo/io_func/feat_readers/reader_atrack.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy
-import numpy as num
-import stats
-from common import *
-
-class atrackReader(BaseReader):
-    def __init__(self, featureFile, labelFile, byteOrder=None):
-        BaseReader.__init__(self, featureFile, labelFile, byteOrder)
-
-    def checkHeader(self, header):
-        assert(header[0] == 0x56782)
-        assert(header[1] == header[6]) # and header[1] == frameSize)
-        assert(header[2] == header[5]) # and header[2] >= numSamples)
-        assert(header[3] == 0)
-        assert(header[4] == 24) # size of float + 20
-        assert(header[4])
-
-    def Read(self):
-        # flip both the header and data using >
-        # atrack format...
-        """
-        0.000000 354178 -2107177728
-        0.000000 1845 889651200
-        0.000000 1124588 -332918528
-        0.000000 0 0
-        0.000000 24 402653184
-        0.000000 1124588 -332918528
-        0.000000 1845 889651200
-        -2.395848 -1072081519 -1856693824
-        -1.677172 -1076449904 -1867655489
-        -1.562828 -1077409088 -1073035073
-        """
-
-        f = open(self.featureFile, "rb")
-        header = num.fromfile(f, dtype=num.dtype('>i4'), count=7)
-        self.checkHeader(header)
-
-        frameSize = header[1]
-        numSamples = header[2]
-
-        a = num.fromfile(f, dtype=num.dtype('>f4'), count=numSamples*frameSize)
-        f.close()
-
-        a = a.astype(num.float32)
-        a = a.reshape((numSamples, frameSize))
-
-        self._markDone()
-
-        return a, ReadLabel(self.labelFile)
diff --git a/example/speech-demo/io_func/feat_readers/reader_bvec.py b/example/speech-demo/io_func/feat_readers/reader_bvec.py
deleted file mode 100644
index 3a0f745b92..0000000000
--- a/example/speech-demo/io_func/feat_readers/reader_bvec.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import struct
-import array
-import numpy
-from common import *
-
-class bvecReader(BaseReader):
-
-    def __init__(self, featureFile, labelFile, byteOrder=None):
-        BaseReader.__init__(self, featureFile, labelFile, byteOrder)
-
-    def Read(self):
-
-        with open(self.featureFile,"rb") as f:
-
-            dt = numpy.dtype([('numSamples',(numpy.int32,1)),('dim',(numpy.int32,1))])
-            header =  numpy.fromfile(f,dt.newbyteorder('>'),count=1)
-
-            numSamples = header[0]['numSamples']
-            dim        = header[0]['dim']
-
-            print('Num samples = {}'.format(numSamples))
-            print('dim = {}'.format(dim))
-
-            dt = numpy.dtype([('sample',(numpy.float32,dim))])
-            samples = numpy.fromfile(f,dt.newbyteorder('>'),count=numSamples)
-
-        self._markDone()
-
-        return samples[:]['sample'], ReadLabel(self.labelFile)
diff --git a/example/speech-demo/io_func/feat_readers/reader_htk.py b/example/speech-demo/io_func/feat_readers/reader_htk.py
deleted file mode 100644
index dca24d9bd3..0000000000
--- a/example/speech-demo/io_func/feat_readers/reader_htk.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy
-import stats
-from common import *
-
-class htkReader(BaseReader):
-    def __init__(self, featureFile, labelFile, byteOrder=None):
-        BaseReader.__init__(self, featureFile, labelFile, byteOrder)
-
-    def Read(self):
-
-        #return numpy.ones((256, 819)).astype('float32'), numpy.ones(256).astype('int32')
-
-        with open(self.featureFile,"rb") as f:
-
-            dt = numpy.dtype([('numSamples',(numpy.int32,1)),('sampPeriod',(numpy.int32,1)),('sampSize',(numpy.int16,1)),('sampKind',(numpy.int16,1))])
-            header =  numpy.fromfile(f,dt.newbyteorder('>' if self.byteOrder==ByteOrder.BigEndian else '<'),count=1)
-
-            numSamples = header[0]['numSamples']
-            sampPeriod = header[0]['sampPeriod']
-            sampSize   = header[0]['sampSize']
-            sampKind   = header[0]['sampKind']
-
-            # print 'Num samples = {}'.format(numSamples)
-            # print 'Sample period = {}'.format(sampPeriod)
-            # print 'Sample size = {}'.format(sampSize)
-            # print 'Sample kind = {}'.format(sampKind)
-            dt = numpy.dtype([('sample',(numpy.float32,sampSize/4))])
-            samples = numpy.fromfile(f,dt.newbyteorder('>' if self.byteOrder==ByteOrder.BigEndian else '<'),count=numSamples)
-
-        self._markDone()
-
-        if self.labelFile is None:
-            labels = None
-        else:
-            labels = ReadLabel(self.labelFile)
-
-        return samples[:]['sample'], labels
diff --git a/example/speech-demo/io_func/feat_readers/reader_kaldi.py b/example/speech-demo/io_func/feat_readers/reader_kaldi.py
deleted file mode 100644
index 345934a917..0000000000
--- a/example/speech-demo/io_func/feat_readers/reader_kaldi.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from common import *
-
-import random
-import time
-
-import ctypes
-import numpy
-import sys
-import re
-
-c_float_ptr = ctypes.POINTER(ctypes.c_float)
-c_int_ptr = ctypes.POINTER(ctypes.c_int)
-c_void_p = ctypes.c_void_p
-c_int = ctypes.c_int
-c_char_p = ctypes.c_char_p
-c_float = ctypes.c_float
-
-kaldi = ctypes.cdll.LoadLibrary("libkaldi-python-wrap.so")  # this needs to be in LD_LIBRARY_PATH
-
-def decl(f, restype, argtypes):
-    f.restype = restype
-    if argtypes is not None and len(argtypes) != 0:
-        f.argtypes = argtypes
-
-decl(kaldi.SBFMReader_new,          c_void_p,   [])
-decl(kaldi.SBFMReader_new_char,     c_void_p,   [c_char_p])
-decl(kaldi.SBFMReader_Open,         c_int,      [c_void_p, c_char_p])
-decl(kaldi.SBFMReader_Done,         c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Key,          c_char_p,   [c_void_p])
-decl(kaldi.SBFMReader_FreeCurrent,  None,       [c_void_p])
-decl(kaldi.SBFMReader_Value,        c_void_p,   [c_void_p])
-decl(kaldi.SBFMReader_Next,         None,       [c_void_p])
-decl(kaldi.SBFMReader_IsOpen,       c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Close,        c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Delete,       None,       [c_void_p])
-
-decl(kaldi.MatrixF_NumRows,     c_int,       [c_void_p])
-decl(kaldi.MatrixF_NumCols,     c_int,       [c_void_p])
-decl(kaldi.MatrixF_Stride,      c_int,       [c_void_p])
-decl(kaldi.MatrixF_cpy_to_ptr,  None,        [c_void_p, c_float_ptr, c_int])
-decl(kaldi.MatrixF_SizeInBytes, c_int,       [c_void_p])
-decl(kaldi.MatrixF_Data,        c_float_ptr, [c_void_p])
-
-decl(kaldi.RAPReader_new_char,      c_void_p,   [c_char_p])
-decl(kaldi.RAPReader_HasKey,        c_int,      [c_void_p, c_char_p])
-decl(kaldi.RAPReader_Value,         c_int_ptr,  [c_void_p, c_char_p])
-decl(kaldi.RAPReader_DeleteValue,   None,       [c_void_p, c_int_ptr])
-decl(kaldi.RAPReader_Delete,        None,       [c_void_p])
-
-decl(kaldi.Nnet_new,            c_void_p,   [c_char_p, c_float, c_int])
-decl(kaldi.Nnet_Feedforward,    c_void_p,   [c_void_p, c_void_p])
-decl(kaldi.Nnet_Delete,         None,       [c_void_p])
-
-class kaldiReader(BaseReader):
-    def __init__(self, featureFile, labelFile, byteOrder=None):
-        BaseReader.__init__(self, featureFile, labelFile, byteOrder)
-
-        arr = re.split('\s+', featureFile, maxsplit=1)
-        if len(arr) != 2:
-            raise Exception("two items required in featureFile line: <transform> <rspecifier>")
-        feature_transform, featureFile = arr
-        if feature_transform == "NO_FEATURE_TRANSFORM":
-            feature_transform = None
-
-        self.feature_rspecifier = featureFile
-        self.targets_rspecifier = labelFile
-        self.feature_reader = kaldi.SBFMReader_new_char(self.feature_rspecifier)
-
-        if self.targets_rspecifier is not None:
-            self.targets_reader = kaldi.RAPReader_new_char(self.targets_rspecifier)
-        if feature_transform is not None:
-            self.nnet_transf = kaldi.Nnet_new(feature_transform, ctypes.c_float(1.0), 1)
-        else:
-            self.nnet_transf = None
-
-    def Cleanup(self):
-        kaldi.SBFMReader_Delete(self.feature_reader)
-        if self.targets_rspecifier is not None:
-            kaldi.RAPReader_Delete(self.targets_reader)
-        if self.nnet_transf is not None:
-            kaldi.Nnet_Delete(self.nnet_transf)
-
-    def Read(self):
-        if kaldi.SBFMReader_Done(self.feature_reader):
-            self._markDone()
-            return None
-        utt = kaldi.SBFMReader_Key(self.feature_reader)
-        self.utt_id = utt
-
-        #return numpy.ones((256, 819)).astype('float32'), numpy.ones(256).astype('int32')
-
-        feat_value = kaldi.SBFMReader_Value(self.feature_reader)
-        if self.nnet_transf is not None:
-            feat_value = kaldi.Nnet_Feedforward(self.nnet_transf, feat_value)
-        feat_rows = kaldi.MatrixF_NumRows(feat_value)
-        feat_cols = kaldi.MatrixF_NumCols(feat_value)
-        feat_data = kaldi.MatrixF_Data(feat_value)
-
-        # never use numpy.ndarray(buf=) or numpy.ctypeslib.as_array
-        # because you don't know if Python or C owns buffer
-        # (even if you numpy.copy() resulting array)
-        # http://stackoverflow.com/questions/4355524/getting-data-from-ctypes-array-into-numpy
-        #
-        # Can't use memmove/memcpy because arrays are strided
-        # Use special function -_-
-
-        feats = numpy.empty((feat_rows,feat_cols), dtype=numpy.float32)
-        # MUST: cast Python int to pointer, otherwise C interprets as 32-bit
-        # if you print the pointer value before casting, you might see weird value before seg fault
-        # casting fixes that
-        feats_numpy_ptr = ctypes.cast(feats.ctypes.data, c_float_ptr)
-        kaldi.MatrixF_cpy_to_ptr(feat_value, feats_numpy_ptr, feats.strides[0]/4)
-
-        if self.targets_rspecifier is not None:
-            if kaldi.RAPReader_HasKey(self.targets_reader, utt):
-                tgt_value = kaldi.RAPReader_Value(self.targets_reader, utt)
-
-                tgts = numpy.empty((feat_rows,), dtype=numpy.int32)
-                # ok to use memmove because this is 1-dimensional array I made in C (no stride)
-                tgts_numpy_ptr = ctypes.cast(tgts.ctypes.data, c_int_ptr)
-                ctypes.memmove(tgts_numpy_ptr, tgt_value, 4 * feat_rows)
-
-                kaldi.RAPReader_DeleteValue(self.targets_reader, tgt_value)
-            else:
-                tgts = None
-        else:
-            tgts = None
-
-        kaldi.SBFMReader_Next(self.feature_reader)
-
-        #print "FEATS:", feats[0:5][0:5]
-        #print "TGTS :", tgts[0:5]
-
-        return feats, tgts
-
-    def GetUttId(self):
-        return self.utt_id
diff --git a/example/speech-demo/io_func/feat_readers/stats.py b/example/speech-demo/io_func/feat_readers/stats.py
deleted file mode 100644
index a2c847359d..0000000000
--- a/example/speech-demo/io_func/feat_readers/stats.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import numpy
-
-class _StreamVariance(object):
-
-    def __init__(self,nCols):
-        self.n    = 0;
-        self.mean = numpy.zeros(nCols)
-        self.M2   = numpy.zeros(nCols)
-
-    def AddX(self,value):
-        # do not operate in the same way when the input is an 1
-        # dimension array or a 2 dimension array.  Maybe there is
-        # a better way to handle that
-        if len(value.shape) == 2:
-            for x in value:
-                self.n     = self.n+1
-                delta      = x-self.mean
-                self.mean  = self.mean+delta/self.n
-                self.M2    = self.M2+delta*(x-self.mean)
-        elif len(value.shape) == 1:
-            self.n     = self.n+1
-            delta      = value-self.mean
-            self.mean  = self.mean+delta/self.n
-            self.M2    = self.M2+delta*(value-self.mean)
-        else:
-            msg = 'Only 1D and 2D array are supported'
-            raise Exception(msg)
-
-    def GetMean(self):
-        return self.mean
-
-    def GetVariance(self):
-        return self.M2/(self.n-1)
-
-    def GetInvStandardDeviation(self):
-        return 1.0/(numpy.sqrt(self.M2/(self.n-1)))
-
-    def GetNumberOfSamples(self):
-        return self.n
-
-class FeatureStats(object):
-
-    def __init__(self):
-        self.mean           = numpy.zeros(1,)
-        self.invStd         = numpy.zeros(1,)
-        self.populationSize = 0
-        self.dim            = None
-
-    def GetMean(self):
-        return self.mean
-
-    def GetVariance(self):
-        return numpy.power(self.GetStd(), 2)
-
-    def GetStd(self):
-        return 1.0/self.invStd
-
-    def GetInvStd(self):
-        return self.invStd
-
-    """
-
-    def GetStatsFromList(self,fileList,featureFileHandler):
-        stats = None
-
-        for featureFile,label in featureList.FeatureList(fileList):
-            if stats is None:
-                self.dim = self.getDimFromFile(featureFile,featureFileHandler)
-                stats    = _StreamVariance(self.dim)
-
-            samples = featureFileHandler.Read(featureFile)
-
-            print('Process file : "{}"'.format(featureFile))
-            stats.AddX(samples)
-
-        print('Read {} samples'.format(stats.GetNumberOfSamples()))
-        self.mean           = stats.GetMean()
-        self.invStd         = stats.GetInvStandardDeviation()
-        self.populationSize = stats.GetNumberOfSamples()
-
-        return (self.mean,self.invStd)
-
-    def GetStatsFromFile(self,featureFile,featureFileHandler):
-        self.dim = self.getDimFromFile(featureFile,featureFileHandler)
-        stats = _StreamVariance(self.dim)
-
-        samples = featureFileHandler.Read(featureFile)
-        stats.AddX(samples)
-        self.mean           = stats.GetMean()
-        self.invStd         = stats.GetInvStandardDeviation()
-        self.populationSize = stats.GetNumberOfSamples()
-
-        return (self.mean,self.invStd)
-
-    def getDimFromFile(self,featureFile,featureFileHandler):
-        return featureFileHandler.GetDim(featureFile)
-
-    """
-
-    def Load(self,filename):
-        with open(filename,"rb") as f:
-            dt = numpy.dtype([('magicNumber',(numpy.int32,1)),('numSamples',(numpy.int32,1)),('dim',(numpy.int32,1))])
-            header = numpy.fromfile(f,dt,count=1)
-
-            if header[0]['magicNumber'] != 21812:
-                msg = 'File {} is not a stat file (wrong magic number)'
-                raise Exception(msg)
-
-            self.populationsize = header[0]['numSamples']
-            dim = header[0]['dim']
-
-            dt = numpy.dtype([('stats',(numpy.float32,dim))])
-            self.mean    = numpy.fromfile(f,dt,count=1)[0]['stats']
-            self.invStd  = numpy.fromfile(f,dt,count=1)[0]['stats']
-
-    def Save(self,filename):
-        with open(filename,'wb') as f:
-            dt = numpy.dtype([('magicNumber',(numpy.int32,1)),('numSamples',(numpy.int32,1)),('dim',(numpy.int32,1))])
-            header=numpy.zeros((1,),dtype=dt)
-            header[0]['magicNumber'] = 21812
-            header[0]['numSamples'] = self.populationSize
-            header[0]['dim'] = self.mean.shape[0]
-            header.tofile(f)
-
-            self.mean.astype(numpy.float32).tofile(f)
-            self.invStd.astype(numpy.float32).tofile(f)
-
-if __name__ == '__main__':
-
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Print the mean and standard deviation from a stat file',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('filename', help="Name of the stat file")
-    args = parser.parse_args()
-    featureStats = FeatureStats()
-    featureStats.Load(args.filename)
-
-    numpy.set_printoptions(threshold='nan')
-    print("THIS IS THE MEAN: ")
-    print(featureStats.GetMean())
-    print("THIS IS THE INVERSE STD: ")
-    print(featureStats.GetInvStd())
-
-
diff --git a/example/speech-demo/io_func/feat_readers/writer_kaldi.py b/example/speech-demo/io_func/feat_readers/writer_kaldi.py
deleted file mode 100644
index 0f8fb93808..0000000000
--- a/example/speech-demo/io_func/feat_readers/writer_kaldi.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-import numpy
-import struct
-import subprocess
-import os
-
-# Functions to read and write Kaldi binary-formatted .scp and .ark
-
-class KaldiWriteOut(object):
-
-    def __init__(self, scp_path, ark_path):
-
-        self.ark_path = ark_path
-        self.scp_path = scp_path
-        self.out_ark = None
-        self.out_scp = None
-        if sys.byteorder != 'little':
-            raise Exception("output file needs to be little endian")
-
-    def open(self):
-        self.out_ark = open(self.ark_path, "w")
-        self.out_scp = open(self.scp_path, "w")
-
-    def open_or_fd(self):
-        offset = None
-        if self.ark_path[0] == '|':
-            #self.out_ark = os.popen(sys.stdout, 'wb')
-            self.out_ark = sys.stdout
-        else:
-            self.out_ark = open(self.ark_path, "w")
-    def write(self, uttID, data):
-        assert data.dtype == numpy.float32
-
-        self.out_ark.write(uttID + ' ')
-        if self.out_scp is not None:
-            start_offset = self.out_ark.tell()
-
-        # write out ark
-        num_row, num_col = data.shape
-        self.out_ark.write('\0B')
-        self.out_ark.write('FM ')
-        self.out_ark.write(chr(4))
-        self.out_ark.write(struct.pack('i', num_row))
-        self.out_ark.write(chr(4))
-        self.out_ark.write(struct.pack('i', num_col))
-        data.tofile(self.out_ark)
-        self.out_ark.flush()
-
-        # write out scp
-        if self.out_scp is not None:
-            scp_out = uttID + ' ' + self.ark_path + ':' + str(start_offset)
-            self.out_scp.write(scp_out + '\n')
-
-    def close(self):
-        self.out_ark.close()
-        if self.out_scp is not None:
-            self.out_scp.close()
diff --git a/example/speech-demo/io_func/kaldi_parser.py b/example/speech-demo/io_func/kaldi_parser.py
deleted file mode 100644
index 10a373d713..0000000000
--- a/example/speech-demo/io_func/kaldi_parser.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import struct
-import numpy as num
-import sys
-
-class KaldiParser(object):
-
-    NO_OPEN_BRACKET = "found > before <"
-    ERR_NO_CLOSE_BRACKET = "reached eof before >"
-    ERR_BYTES_BEFORE_TOKEN = "found bytes before <"
-    NO_SPACE_AFTER = "missing space after >"
-
-    def __init__(self, f):
-        self.f = f
-        self.binary = self.f.read(2) == '\0B'
-        assert(self.binary), "text format not supported yet"
-        if not self.binary:
-            self.f.seek(0, 0)
-
-    def is_binary(self):
-        return self.binary
-
-    def try_next_token(self):
-        pos = self.f.tell()
-        err, tok = self.next_token()
-        if err is not None:
-            self.f.seek(pos, 0)
-            print(err, tok)
-            return None
-        return tok.lower()
-
-    def next_token(self):
-        # keep reading until you get a > or at end of file (return None)
-        # consume the space
-        # return substring from < to >
-        # if things before < are not space, return error
-        buf = ""
-        while True:
-            b = self.f.read(1)
-            if b is None:
-                return KaldiParser.ERR_NO_CLOSE_BRACKET, None
-            buf += b
-            if b == ">":
-                break
-
-        try:
-            start = buf.index("<")
-        except ValueError:
-            return KaldiParser.NO_OPEN_BRACKET, None
-
-        b = self.f.read(1)
-        if not (b == " " or b is None):
-            return KaldiParser.NO_SPACE_AFTER, buf[start:]
-
-        if start != 0:
-            return KaldiParser.ERR_BYTES_BEFORE_TOKEN, buf[start:]
-
-        return None, buf
-
-    def read_space(self):
-        b = self.f.read(1)
-        assert(b == " " or b is None)
-
-    # http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
-    def read_basic_type(self, type):
-        if self.binary:
-            size = num.fromfile(self.f, dtype=num.dtype("i1"), count=1)[0]
-
-            if type == "int":
-                dtype = "<i4"
-                dsize = 4
-            elif type == "float":
-                dtype = "<f4"
-                dsize = 4
-            elif type == "char":
-                dtype = 'a'
-                dsize = 1
-            else:
-                print("unrecognized type")
-                return None
-
-            assert(size == dsize)
-            n = num.fromfile(self.f, dtype=num.dtype(dtype), count=1)
-            return n[0]
-
-        else:
-            assert(False), "not supported yet"
-
-    def read_matrix(self):
-        mode = self.f.read(2)
-        #print mode
-        assert(mode == 'FM')
-        self.read_space()
-
-        rows = self.read_basic_type("int")
-        #print "rows", rows
-        cols = self.read_basic_type("int")
-        #print "cols", cols
-
-        n = num.fromfile(self.f, dtype=num.dtype("<f4"), count=rows * cols)
-        n = n.reshape((rows, cols))
-
-        #print n[0][0]
-        #print "-----------"
-        return n
-
-    def read_vector(self):
-        mode = self.f.read(2)
-        #print mode
-        assert(mode == 'FV')
-        self.read_space()
-
-        length = self.read_basic_type("int")
-        #print "length", length
-
-        n = num.fromfile(self.f, dtype=num.dtype("<f4"), count=length)
-        #print n[0]
-        #print "-----------"
-        return n
-
-def fileIsBinary(filename):
-    f = open(filename, "rb")
-    binary = (f.read(2) == '\0B')
-    f.seek(0, 0)
-    return binary
-
-def file2nnet_binary(filename):
-    f = open(filename, "rb")
-    parser = KaldiParser(f)
-
-    net = []
-    layer = None
-    while True:
-        tok = parser.try_next_token()
-        if tok is None:
-            print("error")
-            break
-        if tok == "<nnet>":
-            continue
-        elif tok == "<affinetransform>":
-            if layer is not None:
-                net += [layer]
-            layer = {}
-            layer["outdim"] = parser.read_basic_type("int")
-            layer["indim"] = parser.read_basic_type("int")
-        elif tok == "<learnratecoef>":
-            parser.read_basic_type("float")
-        elif tok == "<biaslearnratecoef>":
-            parser.read_basic_type("float")
-        elif tok == "<maxnorm>":
-            parser.read_basic_type("float")
-            layer["weights"] = parser.read_matrix().transpose()        # kaldi writes the transpose!!!!
-            layer["bias"] = parser.read_vector()
-        elif tok == "<sigmoid>" or tok == "<softmax>":
-            layer["type"] = tok[1:-1]
-            outdim1 = parser.read_basic_type("int")
-            outdim2 = parser.read_basic_type("int")
-            assert(outdim1 == outdim2 and outdim2 == layer["outdim"])
-        elif tok == "</nnet>":
-            #print "Done!"
-            break
-        else:
-            print("unrecognized token", tok)
-            break
-
-    if layer is not None:
-        net += [layer]
-
-    #for layer in net:
-    #    print layer.keys()
-
-    return net
-
-if __name__ == '__main__':
-    filename = "exp/dnn4_pretrain-dbn_dnn/nnet_6.dbn_dnn.init"
-    #filename = "/usr/users/leoliu/s5/exp/dnn4_pretrain-dbn_dnn/final.feature_transform"
-    print(filename)
-
-    print("isBinary:", fileIsBinary(filename))
-    a = file2nnet_binary(filename)
-
-
-
-    """
-    while True:
-        err, tok = parser.next_token()
-        if err != KaldiParser.NO_SPACE_AFTER and tok is not None:
-            print(err, tok)
-    """
-
-"""
-        fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
-        fout.write('[' + '\n')
-        for x in xrange(output_size):
-            fout.write(W_layer[x].strip() + '\n')
-        fout.write(']' + '\n')
-        fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
-        if maxout:
-            fout.write('<maxout> ' + str(int(layers[i + 1])) + ' ' + str(output_size) + '\n')
-        else:
-            fout.write('<sigmoid> ' + str(output_size) + ' ' + str(output_size) + '\n')
-"""
diff --git a/example/speech-demo/io_func/model_io.py b/example/speech-demo/io_func/model_io.py
deleted file mode 100755
index 8b6e0436c2..0000000000
--- a/example/speech-demo/io_func/model_io.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import numpy as np
-import os
-import sys
-import logging
-
-from StringIO import StringIO
-import json
-
-
-from datetime import datetime
-
-from kaldi_parser import *
-import utils.utils as utils
-
-# nicer interface for file2nnet, nnet2file
-
-def load(model, filename, gradients, num_hidden_layers=-1, with_final=True, factors=None):
-    _file2nnet(model.sigmoid_layers, set_layer_num = num_hidden_layers,
-        filename=filename, activation="sigmoid", withfinal=with_final, factor=1.0, gradients=gradients, factors=factors)
-
-def save(model, filename):
-    _nnet2file(model.sigmoid_layers, set_layer_num = -1, filename=filename,
-        activation="sigmoid", start_layer = 0, withfinal=True)
-
-# convert an array to a string
-def array_2_string(array):
-    return array.astype('float32')
-
-# convert a string to an array
-def string_2_array(string):
-    if isinstance(string, str) or isinstance(string, unicode):
-        str_in = StringIO(string)
-        return np.loadtxt(str_in)
-    else:
-        return string
-
-def _nnet2file(layers, set_layer_num = -1, filename='nnet.out', activation='sigmoid', start_layer = 0, withfinal=True, input_factor = 0.0, factor=[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]):
-    logger = logging.getLogger(__name__)
-    logger.info("Saving network "+filename)
-
-    n_layers = len(layers)
-    nnet_dict = {}
-    if set_layer_num == -1:
-        set_layer_num = n_layers - 1
-
-    for i in range(start_layer, set_layer_num):
-        logger.info("Saving hidden layer "+str(i))
-        dict_a = str(i) + ' ' + activation + ' W'
-        if i == 0:
-            nnet_dict[dict_a] = array_2_string((1.0 - input_factor) * layers[i].params[0].get_value())
-        else:
-            nnet_dict[dict_a] = array_2_string((1.0 - factor[i-1]) * layers[i].params[0].get_value())
-        dict_a = str(i) + ' ' + activation + ' b'
-        nnet_dict[dict_a] = array_2_string(layers[i].params[1].get_value())
-
-        # gradients
-        dict_a = str(i) + ' ' + activation + ' dW'
-        nnet_dict[dict_a] = array_2_string(layers[i].delta_params[0].get_value())
-        dict_a = str(i) + ' ' + activation + ' db'
-        nnet_dict[dict_a] = array_2_string(layers[i].delta_params[1].get_value())
-
-        if layers[i].kahan:
-            logger.info("Loading hidden kahan")
-            dict_a = str(i) + ' ' + activation + ' W_carry'
-            nnet_dict[dict_a] = array_2_string(layers[i].params_carry[0].get_value())
-            dict_a = str(i) + ' ' + activation + ' b_carry'
-            nnet_dict[dict_a] = array_2_string(layers[i].params_carry[1].get_value())
-            #dict_a = str(i) + ' ' + activation + ' dW_carry'
-            #nnet_dict[dict_a] = array_2_string(layers[i].delta_params_carry[0].get_value())
-            #dict_a = str(i) + ' ' + activation + ' db_carry'
-            #nnet_dict[dict_a] = array_2_string(layers[i].delta_params_carry[1].get_value())
-
-    if withfinal:
-        logger.info("Saving final layer ")
-
-        dict_a = 'logreg W'
-        nnet_dict[dict_a] = array_2_string((1.0 - factor[-1]) * layers[-1].params[0].get_value())
-        dict_a = 'logreg b'
-        nnet_dict[dict_a] = array_2_string(layers[-1].params[1].get_value())
-
-        #gradients
-        dict_a = 'logreg dW'
-        nnet_dict[dict_a] = array_2_string(layers[-1].delta_params[0].get_value())
-        dict_a = 'logreg db'
-        nnet_dict[dict_a] = array_2_string(layers[-1].delta_params[1].get_value())
-
-        if layers[-1].kahan:
-            logger.info("Loading softmax kahan")
-            dict_a = 'logreg W_carry'
-            nnet_dict[dict_a] = array_2_string(layers[-1].params_carry[0].get_value())
-            dict_a = 'logreg b_carry'
-            nnet_dict[dict_a] = array_2_string(layers[-1].params_carry[1].get_value())
-            #dict_a = 'logreg dW_carry'
-            #nnet_dict[dict_a] = array_2_string(layers[-1].delta_params_carry[0].get_value())
-            #dict_a = 'logreg db_carry'
-            #nnet_dict[dict_a] = array_2_string(layers[-1].delta_params_carry[1].get_value())
-
-    utils.pickle_save(nnet_dict, filename)
-
-def zero(x):
-    x.set_value(np.zeros_like(x.get_value(borrow=True), dtype=theano.config.floatX))
-
-def _file2nnet(layers, set_layer_num = -1, filename='nnet.in', activation='sigmoid', withfinal=True, factor=1.0, gradients=False, factors=None):
-    logger = logging.getLogger(__name__)
-    logger.info("Loading "+filename)
-
-    # if is KALDI binary
-    if fileIsBinary(filename):
-        print("Warning dropout factors ignored here")
-
-        nnet = file2nnet_binary(filename)
-
-        n_layers = len(nnet)
-        if set_layer_num == -1:
-            set_layer_num = n_layers - 1
-
-        for i in xrange(set_layer_num):
-            layers[i].params[0].set_value(factor * nnet[i]["weights"].astype(dtype=theano.config.floatX))
-            layers[i].params[1].set_value(nnet[i]["bias"].astype(dtype=theano.config.floatX))
-
-        if withfinal:
-            #print(nnet[-1]["weights"][0][0:10])
-            layers[-1].params[0].set_value(nnet[-1]["weights"].astype(dtype=theano.config.floatX))
-            layers[-1].params[1].set_value(nnet[-1]["bias"].astype(dtype=theano.config.floatX))
-
-        return
-
-    # else, it's pdnn format
-
-    n_layers = len(layers)
-
-    if factors is None:
-        factors = [1.0 for l in layers]
-
-    if len(factors) != n_layers:
-        raise Exception("number of factors does not equal number of hidden + softmax")
-
-    nnet_dict = {}
-    if set_layer_num == -1:
-        set_layer_num = n_layers - 1
-
-    nnet_dict = utils.pickle_load(filename)
-
-    for i in xrange(set_layer_num):
-        logger.info("Loading hidden layer "+str(i))
-
-        dict_key = str(i) + ' ' + activation + ' W'
-        layers[i].params[0].set_value(factors[i] * factor * np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-        dict_key = str(i) + ' ' + activation + ' b'
-        layers[i].params[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-
-        if gradients:
-            dict_key = str(i) + ' ' + activation + ' dW'
-            layers[i].delta_params[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            dict_key = str(i) + ' ' + activation + ' db'
-            layers[i].delta_params[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-        else:
-            zero(layers[i].delta_params[0])
-            zero(layers[i].delta_params[1])
-
-        dict_key = str(i) + ' ' + activation + ' W_carry'
-        if layers[i].kahan and dict_key in nnet_dict:
-            logger.info("Loading hidden kahan")
-            dict_key = str(i) + ' ' + activation + ' W_carry'
-            layers[i].params_carry[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            dict_key = str(i) + ' ' + activation + ' b_carry'
-            layers[i].params_carry[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            #dict_key = str(i) + ' ' + activation + ' dW_carry'
-            #layers[i].delta_params_carry[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            #dict_key = str(i) + ' ' + activation + ' db_carry'
-            #layers[i].delta_params_carry[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-
-        if layers[i].sync:
-            layers[i].params_sync[0].set_value(layers[i].params[0].get_value().astype('float32'))
-            layers[i].params_sync[1].set_value(layers[i].params[1].get_value().astype('float32'))
-            logger.info("Copy params to sync")
-
-    if withfinal:
-        logger.info("Loading final layer ")
-
-        dict_key = 'logreg W'
-        layers[-1].params[0].set_value(factors[-1] * np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-        dict_key = 'logreg b'
-        layers[-1].params[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-        if gradients:
-            dict_key = 'logreg dW'
-            layers[-1].delta_params[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            dict_key = 'logreg db'
-            layers[-1].delta_params[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-        else:
-            zero(layers[-1].delta_params[0])
-            zero(layers[-1].delta_params[1])
-
-        dict_key = 'logreg W_carry'
-        if layers[-1].kahan and dict_key in nnet_dict:
-            logger.info("Loading softmax kahan")
-            dict_key = 'logreg W_carry'
-            layers[-1].params_carry[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            dict_key = 'logreg b_carry'
-            layers[-1].params_carry[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            #dict_key = 'logreg dW_carry'
-            #layers[-1].delta_params_carry[0].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-            #dict_key = 'logreg db_carry'
-            #layers[-1].delta_params_carry[1].set_value(np.asarray(string_2_array(nnet_dict[dict_key]), dtype=theano.config.floatX))
-
-        if layers[-1].sync:
-            layers[-1].params_sync[0].set_value(layers[-1].params[0].get_value().astype('float32'))
-            layers[-1].params_sync[1].set_value(layers[-1].params[1].get_value().astype('float32'))
-            logger.info("Copy softmax params to sync")
-
-    if gradients:
-        logger.info("Loading gradients")
-    else:
-        logger.info("Zero-ing gradients")
-
-def _cnn2file(conv_layers, filename='nnet.out', activation='sigmoid', withfinal=True, input_factor = 1.0, factor=1.0):
-    n_layers = len(conv_layers)
-    nnet_dict = {}
-    for i in xrange(n_layers):
-       conv_layer = conv_layers[i]
-       filter_shape = conv_layer.filter_shape
-
-       for next_X in xrange(filter_shape[0]):
-           for this_X in xrange(filter_shape[1]):
-               dict_a = 'W ' + str(i) + ' ' + str(next_X) + ' ' + str(this_X)
-               if i == 0:
-                   nnet_dict[dict_a] = array_2_string(input_factor * (conv_layer.W.get_value())[next_X, this_X])
-               else:
-                   nnet_dict[dict_a] = array_2_string(factor * (conv_layer.W.get_value())[next_X, this_X])
-
-       dict_a = 'b ' + str(i)
-       nnet_dict[dict_a] = array_2_string(conv_layer.b.get_value())
-
-    with open(filename, 'wb') as fp:
-        json.dump(nnet_dict, fp, indent=2, sort_keys = True)
-        fp.flush()
-
-def _file2cnn(conv_layers, filename='nnet.in', activation='sigmoid', withfinal=True, factor=1.0):
-    n_layers = len(conv_layers)
-    nnet_dict = {}
-
-    with open(filename, 'rb') as fp:
-        nnet_dict = json.load(fp)
-    for i in xrange(n_layers):
-        conv_layer = conv_layers[i]
-        filter_shape = conv_layer.filter_shape
-        W_array = conv_layer.W.get_value()
-
-        for next_X in xrange(filter_shape[0]):
-            for this_X in xrange(filter_shape[1]):
-                dict_a = 'W ' + str(i) + ' ' + str(next_X) + ' ' + str(this_X)
-                W_array[next_X, this_X, :, :] = factor * np.asarray(string_2_array(nnet_dict[dict_a]))
-
-        conv_layer.W.set_value(W_array)
-
-        dict_a = 'b ' + str(i)
-        conv_layer.b.set_value(np.asarray(string_2_array(nnet_dict[dict_a]), dtype=theano.config.floatX))
diff --git a/example/speech-demo/io_func/regr_feat_io.py b/example/speech-demo/io_func/regr_feat_io.py
deleted file mode 100644
index a1737bf9ab..0000000000
--- a/example/speech-demo/io_func/regr_feat_io.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import sys
-import random
-import shlex
-import time
-import re
-
-from utils.utils import to_bool
-from feat_readers.common import *
-from feat_readers import stats
-from feat_io import DataReadStream
-
-class RegrDataReadStream(object):
-
-    def __init__(self, dataset_args, n_ins):
-        dataset_args["has_labels"] = False
-        assert("seed" in dataset_args)
-
-        args1 = dict(dataset_args)
-        args2 = dict(dataset_args)
-
-        args1["lst_file"] = dataset_args["input_lst_file"]
-        args2["lst_file"] = dataset_args["output_lst_file"]
-
-        self.input = DataReadStream(args1, n_ins)
-        self.output = DataReadStream(args2, n_ins)
-
-    def read_by_part(self):
-        self.input.read_by_part()
-        self.output.read_by_part()
-
-    def read_by_matrix(self):
-        self.input.read_by_matrix()
-        self.output.read_by_matrix()
-
-    def make_shared(self):
-        self.input.make_shared()
-        self.output.make_shared()
-
-    def get_shared(self):
-        iret = self.input.get_shared()
-        oret = self.output.get_shared()
-        assert(iret[1] is None)
-        assert(oret[1] is None)
-        return iret[0], oret[0]
-
-    def initialize_read(self):
-        self.input.initialize_read()
-        self.output.initialize_read()
-
-    def current_utt_id(self):
-        a = self.input.current_utt_id()
-        b = self.output.current_utt_id()
-        assert(a == b)
-        return a
-
-    def load_next_block(self):
-        a = self.input.load_next_block()
-        b = self.output.load_next_block()
-        assert(a == b)
-        return a
-
-    def get_state(self):
-        a = self.input.get_state()
-        b = self.output.get_state()
-        assert(a[0] == b[0])
-        assert(a[2] == b[2])
-        assert(a[3] == b[3])
-        assert(a[4] == b[4])
-        assert(numpy.array_equal(a[1], b[1]))
-        return a
-
-    def set_state(self, state):
-        self.input.set_state(state)
-        self.output.set_state(state)
diff --git a/example/speech-demo/io_func/utils.py b/example/speech-demo/io_func/utils.py
deleted file mode 100644
index 4ba8496c7f..0000000000
--- a/example/speech-demo/io_func/utils.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys, subprocess, pickle, os, json, logging, socket
-import logging.config
-import datetime
-
-from . import info
-
-def getRunDir():
-    return os.path.dirname(os.path.realpath(sys.argv[0]))
-
-def setup_logger(logging_ini):
-    if logging_ini is not None:
-        print("Using custom logger")
-    else:
-        logging_ini = os.path.join(info.CONFIGS, 'logging.ini')
-
-    logging.config.fileConfig(logging_ini)
-    logger = logging.getLogger(__name__)
-    logger.info("**************************************************")
-    logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
-    logger.info("Host:   " + str(socket.gethostname()))
-    logger.info("Screen: " + os.getenv("STY", "unknown"))
-    logger.info("PWD:    " + os.getenv("PWD", "unknown"))
-    logger.info("Cmd:    " + str(sys.argv))
-    logger.info("**************************************************")
-
-def to_bool(obj):
-    if str(obj).lower() in ["true", "1"]:
-        return True
-    elif str(obj).lower() in ["false", "0"]:
-        return False
-    else:
-        raise Exception("to_bool: cannot convert to bool")
-
-def line_with_arg(line):
-    line = line.strip()
-    return line is not "" and not line.startswith("#")
-
-def parse_conv_spec(conv_spec, batch_size):
-    # "1x29x29:100,5x5,p2x2:200,4x4,p2x2,f"
-    conv_spec = conv_spec.replace('X', 'x')
-    structure = conv_spec.split(':')
-    conv_layer_configs = []
-    for i in range(1, len(structure)):
-        config = {}
-        elements = structure[i].split(',')
-        if i == 1:
-            input_dims = structure[i - 1].split('x')
-            prev_map_number = int(input_dims[0])
-            prev_feat_dim_x = int(input_dims[1])
-            prev_feat_dim_y = int(input_dims[2])
-        else:
-            prev_map_number = conv_layer_configs[-1]['output_shape'][1]
-            prev_feat_dim_x = conv_layer_configs[-1]['output_shape'][2]
-            prev_feat_dim_y = conv_layer_configs[-1]['output_shape'][3]
-
-        current_map_number = int(elements[0])
-        filter_xy = elements[1].split('x')
-        filter_size_x = int(filter_xy[0])
-        filter_size_y = int(filter_xy[1])
-        pool_xy = elements[2].replace('p','').replace('P','').split('x')
-        pool_size_x = int(pool_xy[0])
-        pool_size_y = int(pool_xy[1])
-        output_dim_x = (prev_feat_dim_x - filter_size_x + 1) / pool_size_x
-        output_dim_y = (prev_feat_dim_y - filter_size_y + 1) / pool_size_y
-
-        config['input_shape'] = (batch_size, prev_map_number, prev_feat_dim_x, prev_feat_dim_y)
-        config['filter_shape'] = (current_map_number, prev_map_number, filter_size_x, filter_size_y)
-        config['poolsize'] = (pool_size_x, pool_size_y)
-        config['output_shape'] = (batch_size, current_map_number, output_dim_x, output_dim_y)
-        if len(elements) == 4 and elements[3] == 'f':
-            config['flatten'] = True
-        else:
-            config['flatten'] = False
-
-        conv_layer_configs.append(config)
-    return conv_layer_configs
-
-def _relu(x):
-    return x * (x > 0)
-
-def _capped_relu(x):
-    return T.minimum(x * (x > 0), 6)
-
-def _linear(x):
-    return x * 1.0
-
-def parse_activation(act_str):
-    print("***", act_str)
-    if act_str == 'sigmoid':
-        return T.nnet.sigmoid
-    elif act_str == 'tanh':
-        return T.tanh
-    elif act_str == 'relu':
-        return _relu
-    elif act_str == 'capped_relu':
-        return _capped_relu
-    elif act_str == 'linear':
-        return _linear
-    return T.nnet.sigmoid
-
-def activation_to_txt(act_func):
-    if act_func == T.nnet.sigmoid:
-        return 'sigmoid'
-    if act_func == T.tanh:
-        return 'tanh'
-
-def parse_two_integers(argument_str):
-    elements = argument_str.split(":")
-    int_strs = elements[1].split(",")
-    return int(int_strs[0]), int(int_strs[1])
-
-"""
-Usage:
-    command = 'mysqladmin create test -uroot -pmysqladmin12'
-    for line in run_command(command):
-        print(line)
-"""
-def run_command(command):
-    fnull = open(os.devnull, 'w')
-    p = subprocess.Popen(command,
-                         stdout=subprocess.PIPE,
-                         stderr=fnull,
-                         shell=True)
-    return p, iter(p.stdout.readline, b'')
-
-def pickle_load(filename):
-    f = open(filename, "rb")
-    try:
-        obj = pickle.load(f)
-    except Exception:
-        f.close()
-        f = open(filename, "rb")
-        print("Not a pickled file... try to load as text format: " + filename)
-        obj = json.load(f)
-    f.close()
-    return obj
-
-def pickle_save(obj, filename):
-    f = open(filename + ".new", "wb")
-    pickle.dump(obj, f)
-    f.close()
-    os.rename(filename + ".new", filename)
-
-def makedirs(path):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-def kahan_add(total, carry, inc):
-    cs = T.add_no_assoc(carry, inc)
-    s = T.add_no_assoc(total, cs)
-    update_carry = T.sub(cs, T.sub(s, total))
-    update_total = s
-    return update_total, update_carry
diff --git a/example/speech-demo/io_util.py b/example/speech-demo/io_util.py
deleted file mode 100644
index e5bd74cb6f..0000000000
--- a/example/speech-demo/io_util.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import mxnet as mx
-import numpy as np
-import sys
-from io_func.feat_io import DataReadStream
-
-# The interface of a data iter that works for bucketing
-#
-# DataIter
-#   - default_bucket_key: the bucket key for the default symbol.
-#
-# DataBatch
-#   - provide_data: same as DataIter, but specific to this batch
-#   - provide_label: same as DataIter, but specific to this batch
-#   - bucket_key: the key for the bucket that should be used for this batch
-
-
-def read_content(path):
-    with open(path) as input:
-        content = input.read()
-        content = content.replace('\n', ' <eos> ').replace('. ', ' <eos> ')
-        return content
-
-
-class SimpleBatch(object):
-    def __init__(self, data_names, data, label_names, label, bucket_key,
-                 utt_id=None, utt_len=0, effective_sample_count=None):
-        self.data = data
-        self.label = label
-        self.data_names = data_names
-        self.label_names = label_names
-        self.bucket_key = bucket_key
-        self.utt_id = utt_id
-        self.utt_len = utt_len
-        self.effective_sample_count = effective_sample_count
-
-        self.pad = 0
-        self.index = None  # TODO: what is index?
-
-    @property
-    def provide_data(self):
-        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
-
-    @property
-    def provide_label(self):
-        if len(self.label_names):
-            return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
-        else:
-            return None
-
-class SimpleIter(mx.io.DataIter):
-    """DataIter used in Calculate Statistics (in progress).
-
-    Parameters
-    ----------
-    pad_zeros : bool
-        Default `False`. Control the behavior of padding when we run
-        out of the whole dataset. When true, we will pad with all-zeros.
-        When false, will pad with a random sentence in the dataset.
-        Usually, for training we would like to use `False`, but
-        for testing use `True` so that the evaluation metric can
-        choose to ignore the padding by detecting the zero-labels.
-    """
-    def __init__(self, train_sets, batch_size,
-            init_states, delay=5, feat_dim=40, label_dim=1955,
-            label_mean_sets=None, data_name='data',
-            label_name='softmax_label', has_label=True, load_label_mean=True):
-
-        self.train_sets = train_sets
-        self.label_mean_sets = label_mean_sets
-        self.train_sets.initialize_read()
-
-        self.data_name = data_name
-        if has_label:
-            self.label_name = label_name
-
-        features = []
-        labels = []
-        utt_lens = []
-        utt_ids = []
-        buckets = []
-        self.has_label = has_label
-
-        if label_mean_sets is not None:
-            self.label_mean_sets.initialize_read()
-            (feats, tgts, utt_id) = self.label_mean_sets.load_next_seq()
-
-            self.label_mean = feats/np.sum(feats)
-            for i, v in enumerate(feats):
-                if v <= 1.0:
-                    self.label_mean[i] = 1
-
-        sys.stderr.write("Loading data...\n")
-        buckets_map = {}
-        n = 0
-        while True:
-            (feats, tgts, utt_id) = self.train_sets.load_next_seq()
-            if utt_id is None:
-                break
-            if tgts is None and self.has_label:
-                continue
-            if feats.shape[0] == 0:
-                continue
-            features.append(feats)
-            utt_lens.append(feats.shape[0])
-            utt_ids.append(utt_id)
-            if self.has_label:
-                labels.append(tgts+1)
-            if feats.shape[0] not in buckets:
-                buckets_map[feats.shape[0]] = feats.shape[0]
-
-        for k, v in buckets_map.iteritems():
-            buckets.append(k)
-
-        buckets.sort()
-        i_max_bucket = len(buckets)-1
-        max_bucket = buckets[i_max_bucket]
-        self.buckets = buckets
-        self.data = [[] for k in buckets]
-        self.utt_id = [[] for k in buckets]
-        self.utt_lens = [[] for k in buckets]
-        self.feat_dim = feat_dim
-        self.default_bucket_key = max(buckets)
-
-        for i, feats in enumerate(features):
-            if has_label:
-                tgts = labels[i]
-            utt_len = utt_lens[i]
-            utt_id = utt_ids[i]
-
-            for i, bkt in enumerate(buckets):
-                if bkt >= utt_len:
-                    i_bucket = i
-                    break
-
-            if self.has_label:
-                self.data[i_bucket].append((feats, tgts))
-            else:
-                self.data[i_bucket].append(feats)
-            self.utt_id[i_bucket].append(utt_id)
-            self.utt_lens[i_bucket].append(utt_len)
-
-        # Get the size of each bucket, so that we could sample
-        # uniformly from the bucket
-        bucket_sizes = [len(x) for x in self.data]
-
-        self.batch_size = batch_size
-        # convert data into ndarrays for better speed during training
-
-        data = [np.zeros((len(x), buckets[i], self.feat_dim))
-                if len(x) % self.batch_size == 0
-                else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i], self.feat_dim))
-                for i, x in enumerate(self.data)]
-
-        label = [np.zeros((len(x), buckets[i]))
-                 if len(x) % self.batch_size == 0
-                 else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i]))
-                 for i, x in enumerate(self.data)]
-
-        utt_id = [[] for k in buckets]
-        for i, x in enumerate(data):
-            utt_id[i] = ["GAP_UTT"] * len(x)
-        utt_lens = [[] for k in buckets]
-        for i, x in enumerate(data):
-            utt_lens[i] = [0] * len(x)
-
-
-        for i_bucket in range(len(self.buckets)):
-            for j in range(len(self.data[i_bucket])):
-                sentence = self.data[i_bucket][j]
-                if self.has_label:
-                    sentence[1][delay:] = sentence[1][:-delay]
-                    sentence[1][:delay] = sentence[1][0] # broadcast assignment
-                    data[i_bucket][j, :len(sentence[0])] = sentence[0]
-                    label[i_bucket][j, :len(sentence[1])] = sentence[1]
-                else:
-                    data[i_bucket][j, :len(sentence)] = sentence
-                    # borrow this place to pass in sentence length. TODO: use a less hacky way.
-                    label[i_bucket][j, :len(sentence)] += len(sentence)
-
-                utt_id[i_bucket][j] = self.utt_id[i_bucket][j]
-                utt_lens[i_bucket][j] = self.utt_lens[i_bucket][j]
-
-        self.data = data
-        self.label = label
-        self.utt_id = utt_id
-        self.utt_lens = utt_lens
-
-
-        # Get the size of each bucket, so that we could sample
-        # uniformly from the bucket
-        bucket_sizes = [len(x) for x in self.data]
-
-        sys.stderr.write("Summary of dataset ==================\n")
-        for bkt, sz in zip(buckets, bucket_sizes):
-            sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))
-
-        bucket_size_tot = float(sum(bucket_sizes))
-
-        self.bucket_sizes = bucket_sizes
-        self.make_data_iter_plan()
-
-        self.init_states = init_states
-        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
-
-        self.provide_data = [(data_name, (batch_size, self.default_bucket_key, self.feat_dim))] + init_states
-        self.provide_label = None
-        if has_label:
-            self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]
-
-    def make_data_iter_plan(self):
-        "make a random data iteration plan"
-        # truncate each bucket into multiple of batch-size
-        bucket_n_batches = []
-        for i in range(len(self.data)):
-            bucket_n_batches.append(len(self.data[i]) / self.batch_size)
-            self.data[i] = self.data[i][:int(bucket_n_batches[i]*self.batch_size),:]
-            self.label[i] = self.label[i][:int(bucket_n_batches[i]*self.batch_size)]
-
-        bucket_plan = np.hstack([np.zeros(n, int)+i for i, n in enumerate(bucket_n_batches)])
-        np.random.shuffle(bucket_plan)
-
-        bucket_idx_all = [np.random.permutation(len(x)) for x in self.data]
-
-        self.bucket_plan = bucket_plan
-        self.bucket_idx_all = bucket_idx_all
-        self.bucket_curr_idx = [0 for x in self.data]
-
-        self.data_buffer = []
-        self.label_buffer = []
-        for i_bucket in range(len(self.data)):
-            data = mx.nd.zeros((self.batch_size, self.buckets[i_bucket], self.feat_dim))
-            label = mx.nd.zeros((self.batch_size, self.buckets[i_bucket]))
-            self.data_buffer.append(data)
-            self.label_buffer.append(label)
-
-    def __iter__(self):
-        init_state_names = [x[0] for x in self.init_states]
-        data_names = [self.data_name] + init_state_names
-        label_names = []
-        if self.has_label:
-            label_names = [self.label_name]
-
-        for i_bucket in self.bucket_plan:
-            data = self.data_buffer[i_bucket]
-            label = self.label_buffer[i_bucket]
-
-            i_idx = self.bucket_curr_idx[i_bucket]
-            idx = self.bucket_idx_all[i_bucket][i_idx:i_idx+self.batch_size]
-            self.bucket_curr_idx[i_bucket] += self.batch_size
-            data[:] = self.data[i_bucket][idx]
-            label[:] = self.label[i_bucket][idx]
-            data_all = [data] + self.init_state_arrays
-            label_all = [label]
-            utt_id = np.array(self.utt_id[i_bucket])[idx]
-            utt_len = np.array(self.utt_lens[i_bucket])[idx]
-            effective_sample_count = mx.nd.sum(label)
-            data_batch = SimpleBatch(data_names, data_all, label_names, label_all,
-                                     self.buckets[i_bucket], utt_id, utt_len,
-                                     effective_sample_count=effective_sample_count)
-            yield data_batch
-
-    def reset(self):
-        self.bucket_curr_idx = [0 for x in self.data]
-
-class TruncatedSentenceIter(mx.io.DataIter):
-    """DataIter used in Truncated-BPTT.
-
-    Each sentence is split into chunks of fixed lengths. The states are
-    forwarded during forward, but the backward is only computed within
-    chunks. This mechanism does not require bucketing, and it sometimes
-    avoid gradient exploding problems in very long sequences.
-
-    Parameters
-    ----------
-    pad_zeros : bool
-        Default `False`. Control the behavior of padding when we run
-        out of the whole dataset. When true, we will pad with all-zeros.
-        When false, will pad with a random sentence in the dataset.
-        Usually, for training we would like to use `False`, but
-        for testing use `True` so that the evaluation metric can
-        choose to ignore the padding by detecting the zero-labels.
-    """
-    def __init__(self, train_sets, batch_size, init_states, truncate_len=20, delay=5,
-                 feat_dim=40, data_name='data', label_name='softmax_label',
-                 has_label=True, do_shuffling=True, pad_zeros=False, time_major=False):
-
-        self.train_sets = train_sets
-        self.train_sets.initialize_read()
-
-        self.data_name = data_name
-        self.label_name = label_name
-
-        self.feat_dim = feat_dim
-        self.has_label = has_label
-        self.batch_size = batch_size
-        self.truncate_len = truncate_len
-        self.delay = delay
-
-        self.do_shuffling = do_shuffling
-        self.pad_zeros = pad_zeros
-
-        self.time_major = time_major
-
-        self.label = None
-        if self.time_major:
-            self.data = [mx.nd.zeros((truncate_len, batch_size, feat_dim))]
-            if has_label:
-                self.label = [mx.nd.zeros((truncate_len, batch_size))]
-        else:
-            self.data = [mx.nd.zeros((batch_size, truncate_len, feat_dim))]
-            if has_label:
-                self.label = [mx.nd.zeros((batch_size, truncate_len))]
-
-        self.init_state_names = [x[0] for x in init_states]
-        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
-
-        self.provide_data = [(data_name, self.data[0].shape)] + init_states
-        self.provide_label = None
-        if has_label:
-            self.provide_label = [(label_name, self.label[0].shape)]
-
-        self._load_data()
-        self._make_data_plan()
-
-    def _load_data(self):
-        sys.stderr.write('Loading data into memory...\n')
-        self.features = []
-        self.labels = []
-        self.utt_ids = []
-
-        seq_len_tot = 0.0
-        while True:
-            (feats, tgs, utt_id) = self.train_sets.load_next_seq()
-            if utt_id is None:
-                break
-            if tgs is None and self.has_label:
-                continue
-            if feats.shape[0] == 0:
-                continue
-
-            if self.has_label and self.delay > 0:
-                # delay the labels
-                tgs[self.delay:] = tgs[:-self.delay]
-                tgs[:self.delay] = tgs[0]  # boradcast assign
-            self.features.append(feats)
-            if self.has_label:
-                self.labels.append(tgs+1)
-            self.utt_ids.append(utt_id)
-            seq_len_tot += feats.shape[0]
-
-        sys.stderr.write('    %d utterances loaded...\n' % len(self.utt_ids))
-        sys.stderr.write('    avg-sequence-len = %.0f\n' % (seq_len_tot/len(self.utt_ids)))
-
-    def _make_data_plan(self):
-        if self.do_shuffling:
-            # TODO: should we group utterances of similar length together?
-            self._data_plan = np.random.permutation(len(self.features))
-        else:
-            # we might not want to do shuffling for testing for example
-            self._data_plan = np.arange(len(self.features))
-
-    def __iter__(self):
-        assert len(self._data_plan) >= self.batch_size, \
-            "Total number of sentences smaller than batch size, consider using smaller batch size"
-        utt_idx = self._data_plan[:self.batch_size]
-        utt_inside_idx = [0] * self.batch_size
-
-        next_utt_idx = self.batch_size
-        is_pad = [False] * self.batch_size
-        pad = 0
-
-        if self.time_major:
-            np_data_buffer = np.zeros((self.truncate_len, self.batch_size, self.feat_dim))
-            np_label_buffer = np.zeros((self.truncate_len, self.batch_size))
-        else:
-            np_data_buffer = np.zeros((self.batch_size, self.truncate_len, self.feat_dim))
-            np_label_buffer = np.zeros((self.batch_size, self.truncate_len))
-
-        utt_id_buffer = [None] * self.batch_size
-
-        data_names = [self.data_name] + self.init_state_names
-        label_names = [self.label_name]
-
-        # reset states
-        for state in self.init_state_arrays:
-            state[:] = 0.1
-
-        while True:
-            effective_sample_count = self.batch_size * self.truncate_len
-            for i, idx in enumerate(utt_idx):
-                fea_utt = self.features[idx]
-                if utt_inside_idx[i] >= fea_utt.shape[0]:
-                    # we have consumed this sentence
-
-                    # reset the states
-                    for state in self.init_state_arrays:
-                        if self.time_major:
-                            state[:, i:i+1, :] = 0.1
-                        else:
-                            state[i:i+1] = 0.1
-                    # load new sentence
-                    if is_pad[i]:
-                        # I am already a padded sentence, just rewind to the
-                        # beginning of the sentece
-                        utt_inside_idx[i] = 0
-                    elif next_utt_idx >= len(self.features):
-                        # we consumed the whole dataset, simply repeat this sentence
-                        # and set pad
-                        pad += 1
-                        is_pad[i] = True
-                        utt_inside_idx[i] = 0
-                    else:
-                        # move to the next sentence
-                        utt_idx[i] = self._data_plan[next_utt_idx]
-                        idx = utt_idx[i]
-                        fea_utt = self.features[idx]
-                        utt_inside_idx[i] = 0
-                        next_utt_idx += 1
-
-                if is_pad[i] and self.pad_zeros:
-                    np_data_buffer[i] = 0
-                    np_label_buffer[i] = 0
-                    effective_sample_count -= self.truncate_len
-                else:
-                    idx_take = slice(utt_inside_idx[i],
-                                     min(utt_inside_idx[i]+self.truncate_len,
-                                         fea_utt.shape[0]))
-                    n_take = idx_take.stop - idx_take.start
-                    if self.time_major:
-                        np_data_buffer[:n_take, i, :] = fea_utt[idx_take]
-                        np_label_buffer[:n_take, i] = self.labels[idx][idx_take]
-                    else:
-                        np_data_buffer[i, :n_take, :] = fea_utt[idx_take]
-                        np_label_buffer[i, :n_take] = self.labels[idx][idx_take]
-
-                    if n_take < self.truncate_len:
-                        if self.time_major:
-                            np_data_buffer[n_take:, i, :] = 0
-                            np_label_buffer[n_take:, i] = 0
-                        else:
-                            np_data_buffer[i, n_take:, :] = 0
-                            np_label_buffer[i, n_take:] = 0
-
-                        effective_sample_count -= self.truncate_len - n_take
-
-                    utt_inside_idx[i] += n_take
-
-                utt_id_buffer[i] = self.utt_ids[idx]
-
-            if pad == self.batch_size:
-                # finished all the senteces
-                break
-
-            self.data[0][:] = np_data_buffer
-            self.label[0][:] = np_label_buffer
-
-            data_batch = SimpleBatch(data_names,
-                                     self.data + self.init_state_arrays,
-                                     label_names, self.label, bucket_key=None,
-                                     utt_id=utt_id_buffer,
-                                     effective_sample_count=effective_sample_count)
-
-            # Instead of using the 'pad' property, we use an array 'is_pad'. Because
-            # our padded sentence could be in the middle of a batch. A sample is pad
-            # if we are running out of the data set and they are just some previously
-            # seen data to be filled for a whole batch. In prediction, those data
-            # should be ignored
-            data_batch.is_pad = is_pad
-
-            yield data_batch
-
-    def reset(self):
-        self._make_data_plan()
-
-
-class BucketSentenceIter(mx.io.DataIter):
-    def __init__(self, train_sets, buckets, batch_size,
-                 init_states, delay=5, feat_dim=40,
-                 data_name='data', label_name='softmax_label', has_label=True):
-
-        self.train_sets = train_sets
-        self.train_sets.initialize_read()
-
-        self.data_name = data_name
-        self.label_name = label_name
-
-        buckets.sort()
-        i_max_bucket = len(buckets)-1
-        max_bucket = buckets[i_max_bucket]
-
-        if has_label != True:
-            buckets = [i for i in range(1, max_bucket)]
-            i_max_bucket = len(buckets)-1
-            max_bucket = buckets[i_max_bucket]
-
-        self.buckets = buckets
-        self.data = [[] for k in buckets]
-        self.utt_id = [[] for k in buckets]
-        self.feat_dim = feat_dim
-        self.default_bucket_key = max(buckets)
-        self.has_label = has_label
-
-        sys.stderr.write("Loading data...\n")
-        T_OVERLAP = buckets[0]/2
-        n = 0
-        while True:
-            (feats, tgts, utt_id) = self.train_sets.load_next_seq()
-            if utt_id is None:
-                break
-            if tgts is None and self.has_label:
-                continue
-            if feats.shape[0] == 0:
-                continue
-
-            # we split sentence into overlapping segments if it is
-            # longer than the largest bucket
-            t_start = 0
-            t_end = feats.shape[0]
-            while t_start < t_end:
-                if t_end - t_start > max_bucket:
-                    t_take = max_bucket
-                    i_bucket = i_max_bucket
-                else:
-                    for i, bkt in enumerate(buckets):
-                        if bkt >= t_end-t_start:
-                            t_take = t_end-t_start
-                            i_bucket = i
-                            break
-
-                n += 1
-                if self.has_label:
-                    self.data[i_bucket].append((feats[t_start:t_start+t_take],
-                                                tgts[t_start:t_start+t_take]+1))
-                else:
-                    self.data[i_bucket].append(feats[t_start:t_start+t_take])
-
-                self.utt_id[i_bucket].append(utt_id)
-                t_start += t_take
-                if t_start >= t_end:
-                    # this sentence is consumed
-                    break
-                t_start -= T_OVERLAP
-
-        # Get the size of each bucket, so that we could sample
-        # uniformly from the bucket
-        bucket_sizes = [len(x) for x in self.data]
-
-        self.batch_size = batch_size
-        # convert data into ndarrays for better speed during training
-
-        data = [np.zeros((len(x), buckets[i], self.feat_dim))
-                if len(x) % self.batch_size == 0
-                else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i],
-                               self.feat_dim))
-                for i, x in enumerate(self.data)]
-
-        label = [np.zeros((len(x), buckets[i]))
-                 if len(x) % self.batch_size == 0
-                 else np.zeros(((len(x)/self.batch_size + 1) * self.batch_size, buckets[i]))
-                 for i, x in enumerate(self.data)]
-
-        utt_id = [[] for k in buckets]
-        for i, x in enumerate(data):
-            utt_id[i] = ["GAP_UTT"] * len(x)
-
-        for i_bucket in range(len(self.buckets)):
-            for j in range(len(self.data[i_bucket])):
-                sentence = self.data[i_bucket][j]
-                if self.has_label:
-                    sentence[1][delay:] = sentence[1][:-delay]
-                    sentence[1][:delay] = sentence[1][0]  # broadcast assignment
-                    data[i_bucket][j, :len(sentence[0])] = sentence[0]
-                    label[i_bucket][j, :len(sentence[1])] = sentence[1]
-                else:
-                    data[i_bucket][j, :len(sentence)] = sentence
-                    # borrow this place to pass in sentence length. TODO: use a less hacky way.
-                    label[i_bucket][j, :len(sentence)] += len(sentence)
-
-                utt_id[i_bucket][j] = self.utt_id[i_bucket][j]
-
-        self.data = data
-        self.label = label
-        self.utt_id = utt_id
-
-        # Get the size of each bucket, so that we could sample
-        # uniformly from the bucket
-        bucket_sizes = [len(x) for x in self.data]
-
-        sys.stderr.write("Summary of dataset ==================\n")
-        for bkt, sz in zip(buckets, bucket_sizes):
-            sys.stderr.write("bucket of len %3d : %d samples\n" % (bkt, sz))
-
-        self.bucket_sizes = bucket_sizes
-        self.make_data_iter_plan()
-
-        self.init_states = init_states
-        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
-
-        self.provide_data = [(data_name, (batch_size, self.default_bucket_key, self.feat_dim))] + \
-            init_states
-        self.provide_label = [(label_name, (self.batch_size, self.default_bucket_key))]
-
-    def make_data_iter_plan(self):
-        "make a random data iteration plan"
-        # truncate each bucket into multiple of batch-size
-        bucket_n_batches = []
-        for i in range(len(self.data)):
-            bucket_n_batches.append(len(self.data[i]) / self.batch_size)
-            self.data[i] = self.data[i][:int(bucket_n_batches[i]*self.batch_size), :]
-            self.label[i] = self.label[i][:int(bucket_n_batches[i]*self.batch_size)]
-
-        bucket_plan = np.hstack([np.zeros(n, int)+i for i, n in enumerate(bucket_n_batches)])
-        np.random.shuffle(bucket_plan)
-
-        bucket_idx_all = [np.random.permutation(len(x)) for x in self.data]
-
-        self.bucket_plan = bucket_plan
-        self.bucket_idx_all = bucket_idx_all
-        self.bucket_curr_idx = [0 for x in self.data]
-
-        self.data_buffer = []
-        self.label_buffer = []
-        for i_bucket in range(len(self.data)):
-            data = mx.nd.zeros((self.batch_size, self.buckets[i_bucket], self.feat_dim))
-            label = mx.nd.zeros((self.batch_size, self.buckets[i_bucket]))
-            self.data_buffer.append(data)
-            self.label_buffer.append(label)
-
-    def __iter__(self):
-        init_state_names = [x[0] for x in self.init_states]
-        data_names = [self.data_name] + init_state_names
-        label_names = [self.label_name]
-
-        for i_bucket in self.bucket_plan:
-            data = self.data_buffer[i_bucket]
-            label = self.label_buffer[i_bucket]
-
-            i_idx = self.bucket_curr_idx[i_bucket]
-            idx = self.bucket_idx_all[i_bucket][i_idx:i_idx+self.batch_size]
-            self.bucket_curr_idx[i_bucket] += self.batch_size
-            data[:] = self.data[i_bucket][idx]
-            label[:] = self.label[i_bucket][idx]
-            data_all = [data] + self.init_state_arrays
-            label_all = [label]
-            utt_id = np.array(self.utt_id[i_bucket])[idx]
-            effective_sample_count = mx.nd.sum(label)
-            data_batch = SimpleBatch(data_names, data_all, label_names, label_all,
-                                     self.buckets[i_bucket], utt_id,
-                                     effective_sample_count=effective_sample_count)
-            yield data_batch
-
-    def reset(self):
-        self.bucket_curr_idx = [0 for x in self.data]
-
diff --git a/example/speech-demo/lstm_proj.py b/example/speech-demo/lstm_proj.py
deleted file mode 100644
index a27518c604..0000000000
--- a/example/speech-demo/lstm_proj.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint:skip-file
-import mxnet as mx
-import numpy as np
-from collections import namedtuple
-
-LSTMState = namedtuple("LSTMState", ["c", "h"])
-LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias",
-                                     "h2h_weight", "h2h_bias",
-                                     "ph2h_weight",
-                                     "c2i_bias", "c2f_bias", "c2o_bias"])
-LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol",
-                                     "init_states", "last_states",
-                                     "seq_data", "seq_labels", "seq_outputs",
-                                     "param_blocks"])
-
-def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0., num_hidden_proj=0):
-    """LSTM Cell symbol"""
-    if dropout > 0.:
-        indata = mx.sym.Dropout(data=indata, p=dropout)
-
-    i2h = mx.sym.FullyConnected(data=indata,
-                                weight=param.i2h_weight,
-                                bias=param.i2h_bias,
-                                num_hidden=num_hidden * 4,
-                                name="t%d_l%d_i2h" % (seqidx, layeridx))
-    h2h = mx.sym.FullyConnected(data=prev_state.h,
-                                weight=param.h2h_weight,
-                                #bias=param.h2h_bias,
-                                no_bias=True,
-                                num_hidden=num_hidden * 4,
-                                name="t%d_l%d_h2h" % (seqidx, layeridx))
-    gates = i2h + h2h
-    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4,
-                                      name="t%d_l%d_slice" % (seqidx, layeridx))
-
-    Wcidc = mx.sym.broadcast_mul(param.c2i_bias,  prev_state.c) + slice_gates[0]
-    in_gate = mx.sym.Activation(Wcidc, act_type="sigmoid")
-    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
-
-    Wcfdc = mx.sym.broadcast_mul(param.c2f_bias, prev_state.c) + slice_gates[2]
-    forget_gate = mx.sym.Activation(Wcfdc, act_type="sigmoid")
-    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
-
-    Wcoct = mx.sym.broadcast_mul(param.c2o_bias, next_c) + slice_gates[3]
-    out_gate = mx.sym.Activation(Wcoct, act_type="sigmoid")
-
-    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")
-
-    if num_hidden_proj > 0:
-        proj_next_h = mx.sym.FullyConnected(data=next_h,
-                                            weight=param.ph2h_weight,
-                                            no_bias=True,
-                                            num_hidden=num_hidden_proj,
-                                            name="t%d_l%d_ph2h" % (seqidx, layeridx))
-
-        return LSTMState(c=next_c, h=proj_next_h)
-    else:
-        return LSTMState(c=next_c, h=next_h)
-
-def lstm_unroll(num_lstm_layer, seq_len, input_size,
-                num_hidden, num_label, dropout=0., output_states=False, take_softmax=True, num_hidden_proj=0):
-
-    cls_weight = mx.sym.Variable("cls_weight")
-    cls_bias = mx.sym.Variable("cls_bias")
-    param_cells = []
-    last_states = []
-    for i in range(num_lstm_layer):
-        param_cells.append(LSTMParam(i2h_weight = mx.sym.Variable("l%d_i2h_weight" % i),
-                                     i2h_bias = mx.sym.Variable("l%d_i2h_bias" % i),
-                                     h2h_weight = mx.sym.Variable("l%d_h2h_weight" % i),
-                                     h2h_bias = mx.sym.Variable("l%d_h2h_bias" % i),
-                                     ph2h_weight = mx.sym.Variable("l%d_ph2h_weight" % i),
-                                     c2i_bias = mx.sym.Variable("l%d_c2i_bias" % i, shape=(1,num_hidden)),
-                                     c2f_bias = mx.sym.Variable("l%d_c2f_bias" % i, shape=(1,num_hidden)),
-                                     c2o_bias = mx.sym.Variable("l%d_c2o_bias" % i, shape=(1, num_hidden))
-                                     ))
-        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
-                          h=mx.sym.Variable("l%d_init_h" % i))
-        last_states.append(state)
-    assert(len(last_states) == num_lstm_layer)
-
-    data = mx.sym.Variable('data')
-    label = mx.sym.Variable('softmax_label')
-
-    dataSlice = mx.sym.SliceChannel(data=data, num_outputs=seq_len, squeeze_axis=1)
-
-    hidden_all = []
-    for seqidx in range(seq_len):
-        hidden = dataSlice[seqidx]
-
-        # stack LSTM
-        for i in range(num_lstm_layer):
-            if i == 0:
-                dp = 0.
-            else:
-                dp = dropout
-            next_state = lstm(num_hidden, indata=hidden,
-                              prev_state=last_states[i],
-                              param=param_cells[i],
-                              seqidx=seqidx, layeridx=i, dropout=dp, num_hidden_proj=num_hidden_proj)
-            hidden = next_state.h
-            last_states[i] = next_state
-        # decoder
-        if dropout > 0.:
-            hidden = mx.sym.Dropout(data=hidden, p=dropout)
-        hidden_all.append(hidden)
-
-    hidden_concat = mx.sym.Concat(*hidden_all, dim=1)
-    if num_hidden_proj > 0:
-        hidden_final = mx.sym.Reshape(hidden_concat, target_shape=(0, num_hidden_proj))
-    else:
-        hidden_final = mx.sym.Reshape(hidden_concat, target_shape=(0, num_hidden))
-    pred = mx.sym.FullyConnected(data=hidden_final, num_hidden=num_label,
-                                 weight=cls_weight, bias=cls_bias, name='pred')
-    pred = mx.sym.Reshape(pred, shape=(-1, num_label))
-    label = mx.sym.Reshape(label, shape=(-1,))
-    if take_softmax:
-        sm = mx.sym.SoftmaxOutput(data=pred, label=label, ignore_label=0,
-                                  use_ignore=True, name='softmax')
-    else:
-        sm = pred
-
-    if output_states:
-        # block the gradients of output states
-        for i in range(num_lstm_layer):
-            state = last_states[i]
-            state = LSTMState(c=mx.sym.BlockGrad(state.c, name="l%d_last_c" % i),
-                              h=mx.sym.BlockGrad(state.h, name="l%d_last_h" % i))
-            last_states[i] = state
-
-        # also output states, used in truncated-bptt to copy over states
-        unpack_c = [state.c for state in last_states]
-        unpack_h = [state.h for state in last_states]
-        sm = mx.sym.Group([sm] + unpack_c + unpack_h)
-
-    return sm
diff --git a/example/speech-demo/make_stats.py b/example/speech-demo/make_stats.py
deleted file mode 100644
index 64991db20a..0000000000
--- a/example/speech-demo/make_stats.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import sys
-sys.path.insert(0, "../../python")
-import time
-import logging
-import os.path
-
-import mxnet as mx
-import numpy as np
-
-from lstm_proj import lstm_unroll
-from io_util import BucketSentenceIter, TruncatedSentenceIter, SimpleIter, DataReadStream
-from config_util import parse_args, get_checkpoint_path, parse_contexts
-
-from io_func.feat_readers.writer_kaldi import KaldiWriteOut
-
-# some constants
-METHOD_BUCKETING = 'bucketing'
-METHOD_TBPTT = 'truncated-bptt'
-METHOD_SIMPLE = 'simple'
-
-
-def prepare_data(args):
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-
-    init_c = [('l%d_init_c' % l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-    init_h = [('l%d_init_h' % l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-
-    init_states = init_c + init_h
-
-    file_test = args.config.get('data', 'train')
-
-    file_format = args.config.get('data', 'format')
-    feat_dim = args.config.getint('data', 'xdim')
-
-    test_data_args = {
-            "gpu_chunk": 32768,
-            "lst_file": file_test,
-            "file_format": file_format,
-            "separate_lines": True,
-            "has_labels": True
-            }
-
-    test_sets = DataReadStream(test_data_args, feat_dim)
-
-    return (init_states, test_sets)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    args.config.write(sys.stderr)
-
-    decoding_method = args.config.get('train', 'method')
-    contexts = parse_contexts(args)
-
-    init_states, test_sets = prepare_data(args)
-    state_names = [x[0] for x in init_states]
-
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-    feat_dim = args.config.getint('data', 'xdim')
-    label_dim = args.config.getint('data', 'ydim')
-    out_file = args.config.get('data', 'out_file')
-    num_epoch = args.config.getint('train', 'num_epoch')
-    model_name = get_checkpoint_path(args)
-    logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')
-
-    # load the model
-    label_mean = np.zeros((label_dim,1), dtype='float32')
-    data_test = TruncatedSentenceIter(test_sets, batch_size, init_states,
-                                         20, feat_dim=feat_dim,
-                                         do_shuffling=False, pad_zeros=True, has_label=True)
-
-    for i, batch in enumerate(data_test.labels):
-        hist, edges = np.histogram(batch.flat, bins=range(0,label_dim+1))
-        label_mean += hist.reshape(label_dim,1)
-
-    kaldiWriter = KaldiWriteOut(None, out_file)
-    kaldiWriter.open_or_fd()
-    kaldiWriter.write("label_mean", label_mean)
-
-
-    args.config.write(sys.stderr)
diff --git a/example/speech-demo/python_wrap/Makefile b/example/speech-demo/python_wrap/Makefile
deleted file mode 100644
index 2c020b0d87..0000000000
--- a/example/speech-demo/python_wrap/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-all:
-
-include ../kaldi.mk
-
-OBJFILES = ctypes.o
-
-LIBNAME = kaldi-python-wrap
-
-ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a  ../hmm/kaldi-hmm.a ../cudamatrix/kaldi-cudamatrix.a ../nnet/kaldi-nnet.a ../thread/kaldi-thread.a
-
-include ../makefiles/default_rules.mk
diff --git a/example/speech-demo/python_wrap/ctypes.cc b/example/speech-demo/python_wrap/ctypes.cc
deleted file mode 100644
index a2c79468ed..0000000000
--- a/example/speech-demo/python_wrap/ctypes.cc
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <iostream>
-
-#include "util/table-types.h"
-#include "hmm/posterior.h"
-#include "nnet/nnet-nnet.h"
-#include "cudamatrix/cu-device.h"
-
-class Foo{
-    public:
-        Foo() {
-            x[0] = 0.5f;
-            x[1] = 1.5f;
-            x[2] = 2.5f;
-            x[3] = 3.5f;
-            x[4] = 4.5f;
-        }
-        void bar(){
-            std::cout << "Hello" << std::endl;
-        }
-        float * getx() {
-            return x;
-        }
-        int sizex() {
-            return sizeof(x) / sizeof(float);
-        }
-    private:
-        float x[5];
-};
-
-namespace kaldi {
-  typedef SequentialBaseFloatMatrixReader SBFMReader;
-  typedef Matrix<BaseFloat> MatrixF;
-  typedef RandomAccessPosteriorReader RAPReader;
-
-  namespace nnet1 {
-    typedef class Nnet_t_ {
-    public:
-      Nnet nnet_transf;
-      CuMatrix<BaseFloat> feats_transf;
-      MatrixF buf;
-    } Nnet_t;
-  }
-}
-
-extern "C" {
-
-  Foo* Foo_new(){ return new Foo(); }
-  void Foo_bar(Foo* foo){ foo->bar(); }
-  float * Foo_getx(Foo* foo) { return foo->getx(); }
-  int Foo_sizex(Foo* foo) { return foo->sizex(); }
-
-  using namespace kaldi;
-  using namespace kaldi::nnet1;
-
-  /****************************** SBFMReader ******************************/
-
-  //SequentialTableReader(): impl_(NULL) { }
-  SBFMReader* SBFMReader_new() {
-    return new SBFMReader();
-  }
-  //SequentialTableReader(const std::string &rspecifier);
-  SBFMReader* SBFMReader_new_char(char * rspecifier) {
-    return new SBFMReader(rspecifier);
-  }
-  //bool Open(const std::string &rspecifier);
-  int SBFMReader_Open(SBFMReader* r, char * rspecifier) {
-    return r->Open(rspecifier);
-  }
-  //inline bool Done();
-  int SBFMReader_Done(SBFMReader* r) {
-    return r->Done();
-  }
-  //inline std::string Key();
-  const char * SBFMReader_Key(SBFMReader* r) {
-    return r->Key().c_str();
-  }
-  //void FreeCurrent();
-  void SBFMReader_FreeCurrent(SBFMReader* r) {
-    r->FreeCurrent();
-  }
-  //const T &Value();
-  const MatrixF * SBFMReader_Value(SBFMReader* r) {
-    return &r->Value(); //despite how dangerous this looks, this is safe because holder maintains object (it's not stack allocated)
-  }
-  //void Next();
-  void SBFMReader_Next(SBFMReader* r) {
-    r->Next();
-  }
-  //bool IsOpen() const;
-  int SBFMReader_IsOpen(SBFMReader* r) {
-    return r->IsOpen();
-  }
-  //bool Close();
-  int SBFMReader_Close(SBFMReader* r) {
-    return r->Close();
-  }
-  //~SequentialTableReader();
-  void SBFMReader_Delete(SBFMReader* r) {
-    delete r;
-  }
-
-  /****************************** MatrixF ******************************/
-
-  //NumRows ()
-  int MatrixF_NumRows(MatrixF *m) {
-    return m->NumRows();
-  }
-  //NumCols ()
-  int MatrixF_NumCols(MatrixF *m) {
-    return m->NumCols();
-  }
-
-  //Stride ()
-  int MatrixF_Stride(MatrixF *m) {
-    return m->Stride();
-  }
-
-  void MatrixF_cpy_to_ptr(MatrixF *m, float * dst, int dst_stride) {
-    int num_rows = m->NumRows();
-    int num_cols = m->NumCols();
-    int src_stride = m->Stride();
-    int bytes_per_row = num_cols * sizeof(float);
-
-    float * src = m->Data();
-
-    for (int r=0; r<num_rows; r++) {
-      memcpy(dst, src, bytes_per_row);
-      src += src_stride;
-      dst += dst_stride;
-    }
-  }
-
-  //SizeInBytes ()
-  int MatrixF_SizeInBytes(MatrixF *m) {
-    return m->SizeInBytes();
-  }
-  //Data (), Real is usually float32
-  const float * MatrixF_Data(MatrixF *m) {
-    return m->Data();
-  }
-
-  /****************************** RAPReader ******************************/
-
-  RAPReader* RAPReader_new_char(char * rspecifier) {
-    return new RAPReader(rspecifier);
-  }
-
-  //bool  HasKey (const std::string &key)
-  int RAPReader_HasKey(RAPReader* r, char * key) {
-    return r->HasKey(key);
-  }
-
-  //const T &   Value (const std::string &key)
-  int * RAPReader_Value(RAPReader* r, char * key) {
-    //return &r->Value(key);
-    const Posterior p = r->Value(key);
-    int num_rows = p.size();
-    if (num_rows == 0) {
-      return NULL;
-    }
-
-    //std::cout << "num_rows " << num_rows << std::endl;
-
-    int * vals = new int[num_rows];
-
-    for (int row=0; row<num_rows; row++) {
-      int num_cols = p.at(row).size();
-      if (num_cols != 1) {
-        std::cout << "num_cols != 1: " << num_cols << std::endl;
-        delete vals;
-        return NULL;
-      }
-      std::pair<int32, BaseFloat> pair = p.at(row).at(0);
-      if (pair.second != 1) {
-        std::cout << "pair.second != 1: " << pair.second << std::endl;
-        delete vals;
-        return NULL;
-      }
-      vals[row] = pair.first;
-    }
-
-    return vals;
-  }
-
-  void RAPReader_DeleteValue(RAPReader* r, int * vals) {
-    delete vals;
-  }
-
-  //~RandomAccessTableReader ()
-  void RAPReader_Delete(RAPReader* r) {
-    delete r;
-  }
-
-  /****************************** Nnet_t ******************************/
-
-  Nnet_t* Nnet_new(char * filename, float dropout_retention, int crossvalidate) {
-    //std::cout << "dropout_retention " << dropout_retention << " crossvalidate " << crossvalidate << std::endl;
-
-    Nnet_t * nnet = new Nnet_t();
-
-    if(strcmp(filename, "") != 0) {
-      nnet->nnet_transf.Read(filename);
-    }
-
-    if (dropout_retention > 0.0) {
-      nnet->nnet_transf.SetDropoutRate(dropout_retention);
-    }
-    if (crossvalidate) {
-      nnet->nnet_transf.SetDropoutRate(1.0);
-    }
-
-    return nnet;
-  }
-
-  const MatrixF * Nnet_Feedforward(Nnet_t* nnet, MatrixF * inputs) {
-    nnet->nnet_transf.Feedforward(CuMatrix<BaseFloat>(*inputs), &nnet->feats_transf);
-    nnet->buf.Resize(nnet->feats_transf.NumRows(), nnet->feats_transf.NumCols());
-    nnet->feats_transf.CopyToMat(&nnet->buf);
-    return &nnet->buf;
-  }
-
-  void Nnet_Delete(Nnet_t* nnet) {
-    delete nnet;
-  }
-}
diff --git a/example/speech-demo/python_wrap/example_usage/README.txt b/example/speech-demo/python_wrap/example_usage/README.txt
deleted file mode 100644
index 23fbb3d035..0000000000
--- a/example/speech-demo/python_wrap/example_usage/README.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-# If not already done, make sure kaldi/src is compiled as shared libraries
-cd kaldi/src
-./configure --shared
-make depend
-make
-
-# Copy python_wrap/ to kaldi/src and compile it
-cd python_wrap/
-make
-
-cd example_usage/
-# Add kaldi/src/lib to LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=../../lib:$LD_LIBRARY_PATH
-python example.py
\ No newline at end of file
diff --git a/example/speech-demo/python_wrap/example_usage/data.ark b/example/speech-demo/python_wrap/example_usage/data.ark
deleted file mode 100644
index d4939db527..0000000000
Binary files a/example/speech-demo/python_wrap/example_usage/data.ark and /dev/null differ
diff --git a/example/speech-demo/python_wrap/example_usage/data.scp b/example/speech-demo/python_wrap/example_usage/data.scp
deleted file mode 100644
index 10589e8bc7..0000000000
--- a/example/speech-demo/python_wrap/example_usage/data.scp
+++ /dev/null
@@ -1 +0,0 @@
-test_feat data.ark:10
diff --git a/example/speech-demo/python_wrap/example_usage/data.txt b/example/speech-demo/python_wrap/example_usage/data.txt
deleted file mode 100644
index de5b46e1d0..0000000000
--- a/example/speech-demo/python_wrap/example_usage/data.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-test_feat  [
-  1.2345 6.789
-  -9.876 0.0001 ]
diff --git a/example/speech-demo/python_wrap/example_usage/example.py b/example/speech-demo/python_wrap/example_usage/example.py
deleted file mode 100644
index d930327f19..0000000000
--- a/example/speech-demo/python_wrap/example_usage/example.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import ctypes
-import numpy
-
-c_float_ptr = ctypes.POINTER(ctypes.c_float)
-c_int_ptr = ctypes.POINTER(ctypes.c_int)
-c_void_p = ctypes.c_void_p
-c_int = ctypes.c_int
-c_char_p = ctypes.c_char_p
-c_float = ctypes.c_float
-
-kaldi = ctypes.cdll.LoadLibrary("libkaldi-python-wrap.so")  # this needs to be in LD_LIBRARY_PATH
-
-def decl(f, restype, argtypes):
-    f.restype = restype
-    if argtypes is not None and len(argtypes) != 0:
-        f.argtypes = argtypes
-
-decl(kaldi.Foo_new, c_void_p, [])
-decl(kaldi.Foo_bar, None, [c_void_p])
-decl(kaldi.Foo_getx, c_float_ptr, [c_void_p])
-decl(kaldi.Foo_sizex, c_int, [c_void_p])
-
-decl(kaldi.SBFMReader_new,          c_void_p,   [])
-decl(kaldi.SBFMReader_new_char,     c_void_p,   [c_char_p])
-decl(kaldi.SBFMReader_Open,         c_int,      [c_void_p, c_char_p])
-decl(kaldi.SBFMReader_Done,         c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Key,          c_char_p,   [c_void_p])
-decl(kaldi.SBFMReader_FreeCurrent,  None,       [c_void_p])
-decl(kaldi.SBFMReader_Value,        c_void_p,   [c_void_p])
-decl(kaldi.SBFMReader_Next,         None,       [c_void_p])
-decl(kaldi.SBFMReader_IsOpen,       c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Close,        c_int,      [c_void_p])
-decl(kaldi.SBFMReader_Delete,       None,       [c_void_p])
-
-decl(kaldi.MatrixF_NumRows,     c_int,       [c_void_p])
-decl(kaldi.MatrixF_NumCols,     c_int,       [c_void_p])
-decl(kaldi.MatrixF_Stride,      c_int,       [c_void_p])
-decl(kaldi.MatrixF_cpy_to_ptr,  None,        [c_void_p, c_float_ptr, c_int])
-decl(kaldi.MatrixF_SizeInBytes, c_int,       [c_void_p])
-decl(kaldi.MatrixF_Data,        c_float_ptr, [c_void_p])
-
-if __name__ == "__main__":
-    print("-------- Foo class example --------")
-    a = kaldi.Foo_new()
-    print("Calling Foo_bar(): ",)
-    kaldi.Foo_bar(a)
-    print()
-    print("Result of Foo_getx(): ", kaldi.Foo_getx(a))
-    print("Result of Foo_sizex(): ", kaldi.Foo_sizex(a))
-
-    print()
-    print("-------- Kaldi SBFMReader and MatrixF class example --------")
-
-    reader = kaldi.SBFMReader_new_char("scp:data.scp")
-
-    # data.scp has exactly one utterance, assert it's there
-    assert(not kaldi.SBFMReader_Done(reader))
-
-    utt_id = kaldi.SBFMReader_Key(reader)
-
-    feat_value = kaldi.SBFMReader_Value(reader)
-    feat_rows = kaldi.MatrixF_NumRows(feat_value)
-    feat_cols = kaldi.MatrixF_NumCols(feat_value)
-    feat_data = kaldi.MatrixF_Data(feat_value)
-
-    # never use numpy.ndarray(buf=) or numpy.ctypeslib.as_array
-    # because you don't know if Python or C owns buffer
-    # (even if you numpy.copy() resulting array)
-    # http://stackoverflow.com/questions/4355524/getting-data-from-ctypes-array-into-numpy
-    #
-    # Can't use memmove/memcpy because arrays are strided
-    # Use cpy_to_ptr
-    feats = numpy.empty((feat_rows,feat_cols), dtype=numpy.float32)
-
-    # MUST: cast Python int to pointer, otherwise C interprets as 32-bit
-    # if you print the pointer value before casting, you might see weird value before seg fault
-    # casting fixes that
-    feats_numpy_ptr = ctypes.cast(feats.ctypes.data, c_float_ptr)
-    kaldi.MatrixF_cpy_to_ptr(feat_value, feats_numpy_ptr, feats.strides[0]/4)
-
-    print("Read utterance:")
-    print("  ID: ", utt_id)
-    print("  Rows: ", feat_rows)
-    print("  Cols: ", feat_cols)
-    print("  Value: ", feat_data)
-    print(feats)
-    print("  This should match data.txt")
-
-    # assert no more utterances left
-    kaldi.SBFMReader_Next(reader)
-    assert(kaldi.SBFMReader_Done(reader))
-
-    kaldi.SBFMReader_Delete(reader)
diff --git a/example/speech-demo/run_ami.sh b/example/speech-demo/run_ami.sh
deleted file mode 100755
index 0103fd1832..0000000000
--- a/example/speech-demo/run_ami.sh
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# This script trains and evaluate LSTM models. There is no
-# discriminative training yet.
-# In this recipe, MXNet directly read Kaldi features and labels,
-# which makes the whole pipline much simpler.
-
-set -e           #Exit on non-zero return code from any command
-set -o pipefail  #Exit if any of the commands in the pipeline will
-                 #return non-zero return code
-set -u           #Fail on an undefined variable
-
-. ./cmd.sh
-. ./path.sh
-
-cmd=hostd3.pl
-# root folder,
-expdir=exp_mxnet
-
-##################################################
-# Kaldi generated folder
-##################################################
-
-# alignment folder
-ali_src=exp_cntk/sdm1/dnn_120fbank_ali
-
-# decoding graph
-graph_src=exp/sdm1/tri3a/graph_ami_fsh.o3g.kn.pr1-7/
-
-# features
-train_src=data/sdm1/train_fbank_gcmvn
-dev_src=data/sdm1/eval_fbank_gcmvn
-
-# config file
-config=ami_local_bptt.cfg
-
-# optional settings,
-njdec=128
-scoring="--min-lmwt 5 --max-lmwt 19"
-
-# The device number to run the training
-# change to AUTO to select the card automatically
-deviceNumber=gpu1
-
-# decoding method
-method=simple
-modelName=
-# model
-prefix=
-num_epoch=
-acwt=0.1
-#smbr training variables
-num_utts_per_iter=40
-smooth_factor=0.1
-use_one_sil=true
-
-stage=0
-. utils/parse_options.sh || exit 1;
-
-
-###############################################
-# Training
-###############################################
-
-mkdir -p $expdir
-dir=$expdir/data-for-mxnet
-
-# prepare listing data
-if [ $stage -le 0 ] ; then
-    mkdir -p $dir
-    mkdir -p $dir/log
-    mkdir -p $dir/rawpost
-
-    # for compressed ali
-    #$cmd JOB=1:$njdec $dir/log/gen_post.JOB.log \
-    #    ali-to-pdf $ali_src/final.mdl "ark:gunzip -c $ali_src/ali.JOB.gz |" \
-    #        ark:- | ali-to-post ark:- ark,scp:$dir/rawpost/post.JOB.ark,$dir/rawpost/post.JOB.scp || exit 1;
-    num=`cat $ali_src/num_jobs`
-    $cmd JOB=1:$num $dir/log/gen_post.JOB.log \
-        ali-to-pdf $ali_src/final.mdl ark:$ali_src/ali.JOB.ark \
-            ark:- \| ali-to-post ark:- ark,scp:$dir/rawpost/post.JOB.ark,$dir/rawpost/post.JOB.scp || exit 1;
-
-
-    for n in $(seq $num); do
-        cat $dir/rawpost/post.${n}.scp || exit 1;
-    done > $dir/post.scp
-fi
-
-if [ $stage -le 1 ] ; then
-    # split the data : 90% train and 10% held-out
-    [ ! -e ${train_src}_tr90 ] && utils/subset_data_dir_tr_cv.sh $train_src ${train_src}_tr90 ${train_src}_cv10
-
-    # generate dataset list
-    echo NO_FEATURE_TRANSFORM scp:${train_src}_tr90/feats.scp > $dir/train.feats
-    echo scp:$dir/post.scp >> $dir/train.feats
-
-    echo NO_FEATURE_TRANSFORM scp:${train_src}_cv10/feats.scp > $dir/dev.feats
-    echo scp:$dir/post.scp >> $dir/dev.feats
-
-    echo NO_FEATURE_TRANSFORM scp:${dev_src}/feats.scp > $dir/test.feats
-fi
-
-# generate label counts
-if [ $stage -le 2 ] ; then
-    $cmd JOB=1:1 $dir/log/gen_label_mean.JOB.log \
-        python make_stats.py --configfile $config --data_train $dir/train.feats \| copy-feats ark:- ark:$dir/label_mean.ark
-    echo NO_FEATURE_TRANSFORM ark:$dir/label_mean.ark > $dir/label_mean.feats
-fi
-
-
-# training, note that weight decay is for the whole batch (0.00001 * 20 (minibatch) * 40 (batch_size))
-if [ $stage -le 3 ] ; then
-    python train_lstm_proj.py --configfile $config --data_train $dir/train.feats --data_dev $dir/dev.feats --train_prefix $PWD/$expdir/$prefix --train_optimizer speechSGD --train_learning_rate 1 --train_context $deviceNumber --train_weight_decay 0.008 --train_show_every 1000
-fi
-
-# decoding
-if [ $stage -le 4 ] ; then
-  cp $ali_src/final.mdl $expdir
-  mxnet_string="OMP_NUM_THREADS=1 python decode_mxnet.py --config $config --data_test $dir/test.feats --data_label_mean $dir/label_mean.feats --train_method $method --train_prefix $PWD/$expdir/$prefix --train_num_epoch $num_epoch --train_context cpu0 --train_batch_size 1"
-  ./decode_mxnet.sh --nj $njdec --cmd $decode_cmd --acwt $acwt --scoring-opts "$scoring" \
-    $graph_src $dev_src $expdir/decode_${prefix}_$(basename $dev_src) "$mxnet_string" || exit 1;
-
-fi
diff --git a/example/speech-demo/run_timit.sh b/example/speech-demo/run_timit.sh
deleted file mode 100755
index 023ae6f229..0000000000
--- a/example/speech-demo/run_timit.sh
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# This script trains and evaluate LSTM models. There is no
-# discriminative training yet.
-# In this recipe, MXNet directly read Kaldi features and labels,
-# which makes the whole pipline much simpler.
-
-set -e           #Exit on non-zero return code from any command
-set -o pipefail  #Exit if any of the commands in the pipeline will
-                 #return non-zero return code
-set -u           #Fail on an undefined variable
-
-. ./cmd.sh
-. ./path.sh
-
-cmd=run.pl
-# root folder,
-expdir=exp_timit
-
-##################################################
-# Kaldi generated folder
-##################################################
-
-# alignment folder
-ali_src=/home/sooda/speech/kaldi/egs/timit/s5/exp/tri3_ali
-
-# decoding graph
-graph_src=/home/sooda/speech/kaldi/egs/timit/s5/exp/tri3/graph
-
-# features
-train_src=/home/sooda/speech/kaldi/egs/timit/s5/data/train
-dev_src=/home/sooda/speech/kaldi/egs/timit/s5/data/dev
-
-# config file
-config=default_timit.cfg
-# optional settings,
-njdec=8
-scoring="--min-lmwt 5 --max-lmwt 19"
-
-# The device number to run the training
-# change to AUTO to select the card automatically
-deviceNumber=gpu0
-
-# decoding method
-method=simple
-modelName=
-# model
-prefix=timit
-num_epoch=12
-acwt=0.1
-#smbr training variables
-num_utts_per_iter=40
-smooth_factor=0.1
-use_one_sil=true
-
-stage=4
-. utils/parse_options.sh || exit 1;
-
-
-###############################################
-# Training
-###############################################
-
-mkdir -p $expdir
-dir=$expdir/data-for-mxnet
-
-# prepare listing data
-if [ $stage -le 0 ] ; then
-    mkdir -p $dir
-    mkdir -p $dir/log
-    mkdir -p $dir/rawpost
-
-    # for compressed ali
-    num=`cat $ali_src/num_jobs`
-    $cmd JOB=1:$num $dir/log/gen_post.JOB.log \
-        ali-to-pdf $ali_src/final.mdl "ark:gunzip -c $ali_src/ali.JOB.gz |" \
-            ark:- \| ali-to-post ark:- ark,scp:$dir/rawpost/post.JOB.ark,$dir/rawpost/post.JOB.scp || exit 1;
-    #num=`cat $ali_src/num_jobs`
-    #$cmd JOB=1:$num $dir/log/gen_post.JOB.log \
-    #    ali-to-pdf $ali_src/final.mdl ark:$ali_src/ali.JOB.ark \
-    #        ark:- \| ali-to-post ark:- ark,scp:$dir/rawpost/post.JOB.ark,$dir/rawpost/post.JOB.scp || exit 1;
-
-
-    for n in $(seq $num); do
-        cat $dir/rawpost/post.${n}.scp || exit 1;
-    done > $dir/post.scp
-fi
-
-if [ $stage -le 1 ] ; then
-    # split the data : 90% train and 10% held-out
-    [ ! -e ${train_src}_tr90 ] && utils/subset_data_dir_tr_cv.sh $train_src ${train_src}_tr90 ${train_src}_cv10
-
-    # generate dataset list
-    echo NO_FEATURE_TRANSFORM scp:${train_src}_tr90/feats.scp > $dir/train.feats
-    echo scp:$dir/post.scp >> $dir/train.feats
-
-    echo NO_FEATURE_TRANSFORM scp:${train_src}_cv10/feats.scp > $dir/dev.feats
-    echo scp:$dir/post.scp >> $dir/dev.feats
-
-    echo NO_FEATURE_TRANSFORM scp:${dev_src}/feats.scp > $dir/test.feats
-fi
-
-# generate label counts
-if [ $stage -le 2 ] ; then
-    $cmd JOB=1:1 $dir/log/gen_label_mean.JOB.log \
-        python make_stats.py --configfile $config --data_train $dir/train.feats \| copy-feats ark:- ark:$dir/label_mean.ark
-    echo NO_FEATURE_TRANSFORM ark:$dir/label_mean.ark > $dir/label_mean.feats
-fi
-
-
-# training, note that weight decay is for the whole batch (0.00001 * 20 (minibatch) * 40 (batch_size))
-if [ $stage -le 3 ] ; then
-    python train_lstm_proj.py --configfile $config --data_train $dir/train.feats --data_dev $dir/dev.feats --train_prefix $PWD/$expdir/$prefix --train_optimizer speechSGD --train_learning_rate 1 --train_context $deviceNumber --train_weight_decay 0.008 --train_show_every 1000
-fi
-
-# decoding
-if [ $stage -le 4 ] ; then
-  cp $ali_src/final.mdl $expdir
-  mxnet_string="OMP_NUM_THREADS=1 python decode_mxnet.py --config $config --data_test $dir/test.feats --data_label_mean $dir/label_mean.feats --train_method $method --train_prefix $PWD/$expdir/$prefix --train_num_epoch $num_epoch --train_context cpu0 --train_batch_size 1"
-  ./decode_mxnet.sh --nj $njdec --cmd $cmd --acwt $acwt --scoring-opts "$scoring" \
-    $graph_src $dev_src $expdir/decode_${prefix}_$(basename $dev_src) "$mxnet_string" || exit 1;
-
-fi
diff --git a/example/speech-demo/speechSGD.py b/example/speech-demo/speechSGD.py
deleted file mode 100644
index 931f40afc0..0000000000
--- a/example/speech-demo/speechSGD.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import mxnet as mx
-
-from mxnet.ndarray import NDArray, zeros, clip, sqrt
-from mxnet.random import normal
-
-@mx.optimizer.register
-class speechSGD(mx.optimizer.Optimizer):
-    """A very simple SGD optimizer with momentum and weight regularization.
-
-    Parameters
-    ----------
-    learning_rate : float, optional
-        learning_rate of SGD
-
-    momentum : float, optional
-       momentum value
-
-    wd : float, optional
-        L2 regularization coefficient add to all the weights
-
-    rescale_grad : float, optional
-        rescaling factor of gradient.
-
-    clip_gradient : float, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-
-    param_idx2name : dict of string/int to float, optional
-        special treat weight decay in parameter ends with bias, gamma, and beta
-    """
-    def __init__(self, momentum=0.0, **kwargs):
-        super(speechSGD, self).__init__(**kwargs)
-        self.momentum = momentum
-
-    def create_state(self, index, weight):
-        """Create additional optimizer state such as momentum.
-
-        Parameters
-        ----------
-        weight : NDArray
-            The weight data
-
-        """
-        if self.momentum == 0.0:
-            return None
-        else:
-            return zeros(weight.shape, weight.context, dtype=weight.dtype)
-
-    def _get_lr(self, index):
-        """get learning rate for index.
-
-        Parameters
-        ----------
-        index : int
-            The index for weight
-
-        Returns
-        -------
-        lr : float
-            learning rate for this index
-        """
-        mom = 0.0
-        if self.lr_scheduler is not None:
-            (lr, mom) = self.lr_scheduler(self.num_update)
-        else:
-            lr = self.lr
-
-        if index in self.lr_mult:
-            lr *= self.lr_mult[index]
-        elif index in self.idx2name:
-            lr *= self.lr_mult.get(self.idx2name[index], 1.0)
-        return lr, mom
-
-    def update(self, index, weight, grad, state):
-        """Update the parameters.
-
-        Parameters
-        ----------
-        index : int
-            An unique integer key used to index the parameters
-
-        weight : NDArray
-            weight ndarray
-
-        grad : NDArray
-            grad ndarray
-
-        state : NDArray or other objects returned by init_state
-            The auxiliary state used in optimization.
-        """
-        assert(isinstance(weight, NDArray))
-        assert(isinstance(grad, NDArray))
-        (lr, momentum) = self._get_lr(index)
-        wd = self._get_wd(index)
-        self._update_count(index)
-
-        grad = grad * self.rescale_grad
-        if self.clip_gradient is not None:
-            grad = clip(grad, -self.clip_gradient, self.clip_gradient)
-
-        if state:
-            mom = state
-            mom[:] *= momentum
-            mom[:] += -lr * (1.0 - momentum) * (grad + wd * weight)
-            weight[:] += mom
-        else:
-            assert self.momentum == 0.0
-            weight[:] += -lr * (grad + self.wd * weight)
-
-
-
diff --git a/example/speech-demo/tests/test_nothing.py b/example/speech-demo/tests/test_nothing.py
deleted file mode 100644
index d6e810f6e9..0000000000
--- a/example/speech-demo/tests/test_nothing.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-def test_nothing():
-	pass
diff --git a/example/speech-demo/tests/test_system.py b/example/speech-demo/tests/test_system.py
deleted file mode 100644
index a64879ae44..0000000000
--- a/example/speech-demo/tests/test_system.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-from pdnn.run_DNN import run_DNN
-from pdnn.run_RBM import run_RBM
-from pdnn.run_SDA import run_SDA
-from pdnn.eval_DNN import eval_DNN
-import json
-from utils.utils import setup_logger
-
-MNIST_CONF = json.load(open("configs/unittest_mnist.json"))
-MAX_ITERS = 2
-setup_logger(None)
-
-def banner(s):
-    print("***********************" + s + "*************************")
-
-def test_hi():
-    print("hi")
-
-def test_rbm_dnn():
-    banner("rbm dnn")
-    mnist_conf = MNIST_CONF.copy()
-
-    mnist_conf["train_rbm"]["max_iters"] = MAX_ITERS
-    run_RBM(mnist_conf)
-
-    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
-    mnist_conf["init_dnn"] = {
-        "filename": "temp/rbm/final.nnet",
-        "num_hidden_layers": -1,
-        "with_final": 1
-    }
-    run_DNN(mnist_conf)
-
-    mnist_conf["init_rbm"] = {
-        "filename": "temp/dnn/final.nnet",
-        "num_hidden_layers": -1,
-        "with_final": 1
-    }
-    mnist_conf["train_rbm"]["max_iters"] = 0
-    run_RBM(mnist_conf)
-
-def test_sda_dnn():
-    banner("sda dnn")
-    mnist_conf = MNIST_CONF.copy()
-
-    mnist_conf["train_sda"]["max_iters"] = MAX_ITERS
-    run_SDA(mnist_conf)
-
-    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
-    mnist_conf["init_dnn"] = {
-        "filename": "temp/sda/final.nnet",
-        "num_hidden_layers": -1,
-        "with_final": 1
-    }
-    run_DNN(mnist_conf)
-
-    mnist_conf["init_sda"] = {
-        "filename": "temp/dnn/final.nnet",
-        "num_hidden_layers": -1,
-        "with_final": 1
-    }
-    mnist_conf["train_sda"]["max_iters"] = 1
-    run_SDA(mnist_conf)
-
-def test_dnn_eval():
-    banner("dnn cv")
-    mnist_conf = MNIST_CONF.copy()
-
-    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
-    run_DNN(mnist_conf)
-
-    mnist_conf["init_dnn"] = {
-        "filename": "temp/dnn/final.nnet",
-        "num_hidden_layers": -1,
-        "with_final": 1
-    }
-
-    # per-part
-    eval_DNN(mnist_conf)
-
-    mnist_conf["eval_dnn"] = {"mode": "cv", "batch_size": 1024}
-    eval_DNN(mnist_conf)
-
-    mnist_conf["eval_dnn"] = {"mode": "per-feat", "batch_size": 1024}
-    eval_DNN(mnist_conf)
-
-def test_dropout():
-    banner("dropout")
-    mnist_conf = MNIST_CONF.copy()
-    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
-    mnist_conf["model"]["dropout_factor"] = "0.4"
-    run_DNN(mnist_conf)
diff --git a/example/speech-demo/train_lstm_proj.py b/example/speech-demo/train_lstm_proj.py
deleted file mode 100644
index 5749b0c39d..0000000000
--- a/example/speech-demo/train_lstm_proj.py
+++ /dev/null
@@ -1,327 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import re
-import sys
-sys.path.insert(0, "../../python")
-import time
-import logging
-import os.path
-
-import mxnet as mx
-import numpy as np
-from speechSGD import speechSGD
-from lstm_proj import lstm_unroll
-from io_util import BucketSentenceIter, TruncatedSentenceIter, DataReadStream
-from config_util import parse_args, get_checkpoint_path, parse_contexts
-
-
-# some constants
-METHOD_BUCKETING = 'bucketing'
-METHOD_TBPTT = 'truncated-bptt'
-
-def prepare_data(args):
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_hidden_proj = args.config.getint('arch', 'num_hidden_proj')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-
-    init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-    if num_hidden_proj > 0:
-        init_h = [('l%d_init_h'%l, (batch_size, num_hidden_proj)) for l in range(num_lstm_layer)]
-    else:
-        init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
-
-    init_states = init_c + init_h
-
-    file_train = args.config.get('data', 'train')
-    file_dev = args.config.get('data', 'dev')
-    file_format = args.config.get('data', 'format')
-    feat_dim = args.config.getint('data', 'xdim')
-
-    train_data_args = {
-            "gpu_chunk": 32768,
-            "lst_file": file_train,
-            "file_format": file_format,
-            "separate_lines": True
-            }
-
-    dev_data_args = {
-            "gpu_chunk": 32768,
-            "lst_file": file_dev,
-            "file_format": file_format,
-            "separate_lines": True
-            }
-
-    train_sets = DataReadStream(train_data_args, feat_dim)
-    dev_sets = DataReadStream(dev_data_args, feat_dim)
-
-    return (init_states, train_sets, dev_sets)
-
-def CrossEntropy(labels, preds):
-    labels = labels.reshape((-1,))
-    preds = preds.reshape((-1, preds.shape[1]))
-    loss = 0.
-    num_inst = 0
-    for i in range(preds.shape[0]):
-        label = labels[i]
-
-        if label > 0:
-            loss += -np.log(max(1e-10, preds[i][int(label)]))
-            num_inst += 1
-    return loss , num_inst
-
-def Acc_exclude_padding(labels, preds):
-    labels = labels.reshape((-1,))
-    preds = preds.reshape((-1, preds.shape[1]))
-    sum_metric = 0
-    num_inst = 0
-    for i in range(preds.shape[0]):
-        pred_label = np.argmax(preds[i], axis=0)
-        label = labels[i]
-
-        ind = np.nonzero(label.flat)
-        pred_label_real = pred_label.flat[ind]
-        label_real = label.flat[ind]
-        sum_metric += (pred_label_real == label_real).sum()
-        num_inst += len(pred_label_real)
-    return sum_metric, num_inst
-
-class SimpleLRScheduler(mx.lr_scheduler.LRScheduler):
-    """A simple lr schedule that simply return `dynamic_lr`. We will set `dynamic_lr`
-    dynamically based on performance on the validation set.
-    """
-    def __init__(self, dynamic_lr, effective_sample_count=1, momentum=0.9, optimizer="sgd"):
-        super(SimpleLRScheduler, self).__init__()
-        self.dynamic_lr = dynamic_lr
-        self.effective_sample_count = effective_sample_count
-        self.momentum = momentum
-        self.optimizer = optimizer
-
-    def __call__(self, num_update):
-        if self.optimizer == "speechSGD":
-            return self.dynamic_lr / self.effective_sample_count, self.momentum
-        else:
-            return self.dynamic_lr / self.effective_sample_count
-
-def score_with_state_forwarding(module, eval_data, eval_metric):
-    eval_data.reset()
-    eval_metric.reset()
-
-    for eval_batch in eval_data:
-        module.forward(eval_batch, is_train=False)
-        module.update_metric(eval_metric, eval_batch.label)
-
-        # copy over states
-        outputs = module.get_outputs()
-        # outputs[0] is softmax, 1:end are states
-        for i in range(1, len(outputs)):
-            outputs[i].copyto(eval_data.init_state_arrays[i-1])
-
-
-def get_initializer(args):
-    init_type = getattr(mx.initializer, args.config.get('train', 'initializer'))
-    init_scale = args.config.getfloat('train', 'init_scale')
-    if init_type is mx.initializer.Xavier:
-        return mx.initializer.Xavier(magnitude=init_scale)
-    return init_type(init_scale)
-
-
-def do_training(training_method, args, module, data_train, data_val):
-    from distutils.dir_util import mkpath
-    mkpath(os.path.dirname(get_checkpoint_path(args)))
-
-    batch_size = data_train.batch_size
-    batch_end_callbacks = [mx.callback.Speedometer(batch_size,
-                                                   args.config.getint('train', 'show_every'))]
-    eval_allow_extra = True if training_method == METHOD_TBPTT else False
-    eval_metric = [mx.metric.np(CrossEntropy, allow_extra_outputs=eval_allow_extra),
-                   mx.metric.np(Acc_exclude_padding, allow_extra_outputs=eval_allow_extra)]
-    eval_metric = mx.metric.create(eval_metric)
-    optimizer = args.config.get('train', 'optimizer')
-    momentum = args.config.getfloat('train', 'momentum')
-    learning_rate = args.config.getfloat('train', 'learning_rate')
-    lr_scheduler = SimpleLRScheduler(learning_rate, momentum=momentum, optimizer=optimizer)
-
-    if training_method == METHOD_TBPTT:
-        lr_scheduler.seq_len = data_train.truncate_len
-
-    n_epoch = 0
-    num_epoch = args.config.getint('train', 'num_epoch')
-    learning_rate = args.config.getfloat('train', 'learning_rate')
-    decay_factor = args.config.getfloat('train', 'decay_factor')
-    decay_bound = args.config.getfloat('train', 'decay_lower_bound')
-    clip_gradient = args.config.getfloat('train', 'clip_gradient')
-    weight_decay = args.config.getfloat('train', 'weight_decay')
-    if clip_gradient == 0:
-        clip_gradient = None
-
-    last_acc = -float("Inf")
-    last_params = None
-
-    module.bind(data_shapes=data_train.provide_data,
-                label_shapes=data_train.provide_label,
-                for_training=True)
-    module.init_params(initializer=get_initializer(args))
-
-    def reset_optimizer():
-        if optimizer == "sgd" or optimizer == "speechSGD":
-            module.init_optimizer(kvstore='device',
-                              optimizer=args.config.get('train', 'optimizer'),
-                              optimizer_params={'lr_scheduler': lr_scheduler,
-                                                'momentum': momentum,
-                                                'rescale_grad': 1.0,
-                                                'clip_gradient': clip_gradient,
-                                                'wd': weight_decay},
-                              force_init=True)
-        else:
-            module.init_optimizer(kvstore='device',
-                              optimizer=args.config.get('train', 'optimizer'),
-                              optimizer_params={'lr_scheduler': lr_scheduler,
-                                                'rescale_grad': 1.0,
-                                                'clip_gradient': clip_gradient,
-                                                'wd': weight_decay},
-                              force_init=True)
-    reset_optimizer()
-
-    while True:
-        tic = time.time()
-        eval_metric.reset()
-
-        for nbatch, data_batch in enumerate(data_train):
-            if training_method == METHOD_TBPTT:
-                lr_scheduler.effective_sample_count = data_train.batch_size * truncate_len
-                lr_scheduler.momentum = np.power(np.power(momentum, 1.0/(data_train.batch_size * truncate_len)), data_batch.effective_sample_count)
-            else:
-                if data_batch.effective_sample_count is not None:
-                    lr_scheduler.effective_sample_count = 1#data_batch.effective_sample_count
-
-            module.forward_backward(data_batch)
-            module.update()
-            module.update_metric(eval_metric, data_batch.label)
-
-            batch_end_params = mx.model.BatchEndParam(epoch=n_epoch, nbatch=nbatch,
-                                                      eval_metric=eval_metric,
-                                                      locals=None)
-            for callback in batch_end_callbacks:
-                callback(batch_end_params)
-
-            if training_method == METHOD_TBPTT:
-                # copy over states
-                outputs = module.get_outputs()
-                # outputs[0] is softmax, 1:end are states
-                for i in range(1, len(outputs)):
-                    outputs[i].copyto(data_train.init_state_arrays[i-1])
-
-        for name, val in eval_metric.get_name_value():
-            logging.info('Epoch[%d] Train-%s=%f', n_epoch, name, val)
-        toc = time.time()
-        logging.info('Epoch[%d] Time cost=%.3f', n_epoch, toc-tic)
-
-        data_train.reset()
-
-        # test on eval data
-        score_with_state_forwarding(module, data_val, eval_metric)
-
-        # test whether we should decay learning rate
-        curr_acc = None
-        for name, val in eval_metric.get_name_value():
-            logging.info("Epoch[%d] Dev-%s=%f", n_epoch, name, val)
-            if name == 'CrossEntropy':
-                curr_acc = val
-        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'
-
-        if n_epoch > 0 and lr_scheduler.dynamic_lr > decay_bound and curr_acc > last_acc:
-            logging.info('Epoch[%d] !!! Dev set performance drops, reverting this epoch',
-                         n_epoch)
-            logging.info('Epoch[%d] !!! LR decay: %g => %g', n_epoch,
-                         lr_scheduler.dynamic_lr, lr_scheduler.dynamic_lr / float(decay_factor))
-
-            lr_scheduler.dynamic_lr /= decay_factor
-            # we reset the optimizer because the internal states (e.g. momentum)
-            # might already be exploded, so we want to start from fresh
-            reset_optimizer()
-            module.set_params(*last_params)
-        else:
-            last_params = module.get_params()
-            last_acc = curr_acc
-            n_epoch += 1
-
-            # save checkpoints
-            mx.model.save_checkpoint(get_checkpoint_path(args), n_epoch,
-                                     module.symbol, *last_params)
-
-        if n_epoch == num_epoch:
-            break
-
-if __name__ == '__main__':
-    args = parse_args()
-    args.config.write(sys.stdout)
-
-    training_method = args.config.get('train', 'method')
-    contexts = parse_contexts(args)
-
-    init_states, train_sets, dev_sets = prepare_data(args)
-    state_names = [x[0] for x in init_states]
-
-    batch_size = args.config.getint('train', 'batch_size')
-    num_hidden = args.config.getint('arch', 'num_hidden')
-    num_hidden_proj = args.config.getint('arch', 'num_hidden_proj')
-    num_lstm_layer = args.config.getint('arch', 'num_lstm_layer')
-    feat_dim = args.config.getint('data', 'xdim')
-    label_dim = args.config.getint('data', 'ydim')
-
-    logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')
-
-    if training_method == METHOD_BUCKETING:
-        buckets = args.config.get('train', 'buckets')
-        buckets = list(map(int, re.split(r'\W+', buckets)))
-        data_train = BucketSentenceIter(train_sets, buckets, batch_size, init_states, feat_dim=feat_dim)
-        data_val   = BucketSentenceIter(dev_sets, buckets, batch_size, init_states, feat_dim=feat_dim)
-
-        def sym_gen(seq_len):
-            sym = lstm_unroll(num_lstm_layer, seq_len, feat_dim, num_hidden=num_hidden,
-                              num_label=label_dim, num_hidden_proj=num_hidden_proj)
-            data_names = ['data'] + state_names
-            label_names = ['softmax_label']
-            return (sym, data_names, label_names)
-
-        module = mx.mod.BucketingModule(sym_gen,
-                                        default_bucket_key=data_train.default_bucket_key,
-                                        context=contexts)
-        do_training(training_method, args, module, data_train, data_val)
-    elif training_method == METHOD_TBPTT:
-        truncate_len = args.config.getint('train', 'truncate_len')
-        data_train = TruncatedSentenceIter(train_sets, batch_size, init_states,
-                                           truncate_len=truncate_len, feat_dim=feat_dim)
-        data_val = TruncatedSentenceIter(dev_sets, batch_size, init_states,
-                                         truncate_len=truncate_len, feat_dim=feat_dim,
-                                         do_shuffling=False, pad_zeros=True)
-        sym = lstm_unroll(num_lstm_layer, truncate_len, feat_dim, num_hidden=num_hidden,
-                          num_label=label_dim, output_states=True, num_hidden_proj=num_hidden_proj)
-        data_names = [x[0] for x in data_train.provide_data]
-        label_names = [x[0] for x in data_train.provide_label]
-        module = mx.mod.Module(sym, context=contexts, data_names=data_names,
-                               label_names=label_names)
-        do_training(training_method, args, module, data_train, data_val)
-    else:
-        raise RuntimeError('Unknown training method: %s' % training_method)
-
-    print("="*80)
-    print("Finished Training")
-    print("="*80)
-    args.config.write(sys.stdout)
diff --git a/example/ssd/symbol/symbol_factory.py b/example/ssd/symbol/symbol_factory.py
index 1b1419f04e..3a4364a570 100644
--- a/example/ssd/symbol/symbol_factory.py
+++ b/example/ssd/symbol/symbol_factory.py
@@ -86,7 +86,7 @@ def get_config(network, data_shape, **kwargs):
         num_layers = 101
         image_shape = '3,224,224'
         network = 'resnet'
-        from_layers = ['_plus12', '_plus15', '', '', '', '']
+        from_layers = ['_plus29', '_plus32', '', '', '', '']
         num_filters = [-1, -1, 512, 256, 256, 128]
         strides = [-1, -1, 2, 2, 2, 2]
         pads = [-1, -1, 1, 1, 1, 1]
diff --git a/example/ssd/train.py b/example/ssd/train.py
index f08aafb97b..1648c826c7 100644
--- a/example/ssd/train.py
+++ b/example/ssd/train.py
@@ -72,7 +72,7 @@ def parse_args():
                         help='blue mean value')
     parser.add_argument('--lr-steps', dest='lr_refactor_step', type=str, default='80, 160',
                         help='refactor learning rate at specified epochs')
-    parser.add_argument('--lr-factor', dest='lr_refactor_ratio', type=str, default=0.1,
+    parser.add_argument('--lr-factor', dest='lr_refactor_ratio', type=float, default=0.1,
                         help='ratio to refactor learning rate')
     parser.add_argument('--freeze', dest='freeze_pattern', type=str, default="^(conv1_|conv2_).*",
                         help='freeze layer pattern')
diff --git a/example/stochastic-depth/README.md b/example/stochastic-depth/README.md
new file mode 100644
index 0000000000..08c466eb8b
--- /dev/null
+++ b/example/stochastic-depth/README.md
@@ -0,0 +1,29 @@
+Stochastic Depth
+================
+
+This folder contains examples showing implementation of the stochastic depth algorithm described in the paper
+Huang, Gao, et al. ["Deep networks with stochastic depth."](https://arxiv.org/abs/1603.09382)
+arXiv preprint arXiv:1603.09382 (2016). This paper introduces a new way to perturb networks during training
+in order to improve their performance. Stochastic Depth (SD) is a method for residual networks,
+which randomly removes/deactivates residual blocks during training.
+
+The paper talks about constructing the network of residual blocks which are basically a set of
+convolution layers and a bypass that passes the information from the previous layer through without any change.
+With stochastic depth, the convolution block is sometimes switched off allowing the information
+to flow through the layer without being changed, effectively removing the layer from the network.
+During testing, all layers are left in and the weights are modified by their survival probability.
+This is very similar to how dropout works, except instead of dropping a single node in a layer
+the entire layer is dropped!
+
+The main idea behind stochastic depth is relatively simple, but the results are surprisingly good.
+The authors demonstrated the new architecture on CIFAR-10, CIFAR-100, and the Street View House Number dataset (SVHN).
+They achieve the lowest published error on CIFAR-10 and CIFAR-100, and second lowest for SVHN.
+
+Files in this example folder:
+
+- `sd_mnist.py` example shows sample implementation of the algorithm just for the sanity check.
+
+- **sd_cifar10.py** shows the algorithm implementation for 500 epochs on cifar_10 dataset. After 500 epochs, ~9.4% error
+was achieved for cifar10, it can be further improved by some more careful hyper parameters tuning to achieve
+the reported numbers in the paper.
+You can see the sample result log in the top section of sd_cifar10.py file.
diff --git a/example/stochastic-depth/sd_cifar10.py b/example/stochastic-depth/sd_cifar10.py
index c123562cf7..7eb3202870 100644
--- a/example/stochastic-depth/sd_cifar10.py
+++ b/example/stochastic-depth/sd_cifar10.py
@@ -214,4 +214,3 @@ def get_death_rate(i_res_block):
             num_epoch=num_epochs, batch_end_callback=batch_end_callbacks,
             epoch_end_callback=epoch_end_callbacks,
             initializer=initializer)
-
diff --git a/example/torch/torch_module.py b/example/torch/torch_module.py
deleted file mode 100644
index e2f7821362..0000000000
--- a/example/torch/torch_module.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: skip-file
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, "../../tests/python/common"))
-from get_data import MNISTIterator
-import mxnet as mx
-import numpy as np
-import logging
-
-# define mlp
-
-use_torch_criterion = False
-
-data = mx.symbol.Variable('data')
-fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')
-act1 = mx.symbol.TorchModule(data_0=fc1, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu1')
-fc2 = mx.symbol.TorchModule(data_0=act1, lua_string='nn.Linear(128, 64)', num_data=1, num_params=2, num_outputs=1, name='fc2')
-act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1, num_params=0, num_outputs=1, name='relu2')
-fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3')
-
-if use_torch_criterion:
-    logsoftmax = mx.symbol.TorchModule(data_0=fc3, lua_string='nn.LogSoftMax()', num_data=1, num_params=0, num_outputs=1, name='logsoftmax')
-    # Torch's label starts from 1
-    label = mx.symbol.Variable('softmax_label') + 1
-    mlp = mx.symbol.TorchCriterion(data=logsoftmax, label=label, lua_string='nn.ClassNLLCriterion()', name='softmax')
-else:
-    mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
-
-# data
-
-train, val = MNISTIterator(batch_size=100, input_shape = (784,))
-
-# train
-
-logging.basicConfig(level=logging.DEBUG)
-
-model = mx.model.FeedForward(
-    ctx = mx.cpu(0), symbol = mlp, num_epoch = 20,
-    learning_rate = 0.1, momentum = 0.9, wd = 0.00001)
-
-if use_torch_criterion:
-    model.fit(X=train, eval_data=val, eval_metric=mx.metric.Torch())
-else:
-    model.fit(X=train, eval_data=val)
diff --git a/example/vae/README.md b/example/vae/README.md
new file mode 100644
index 0000000000..c6e68d54c4
--- /dev/null
+++ b/example/vae/README.md
@@ -0,0 +1,21 @@
+Variational Auto Encoder(VAE)
+=============================
+
+This folder contains a tutorial which implements the Variational Auto Encoder in MXNet using the MNIST handwritten digit
+recognition dataset. The model built here is based on the [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/)
+paper. This paper introduces a stochastic variational inference and learning algorithm that scales to large datasets.
+
+Prerequisites:
+To run this example, you need:
+- [Jupyter Notebook](http://jupyter.org/index.html)
+- Matplotlib
+
+Files in this folder:
+- **VAE_example.ipynb** : Jupyter notebook which explains concept of VAE step by step and also shows how to use
+MXNet-based VAE class(from VAE.py) to do the training directly.
+
+- **VAE.py** : Contains class which implements the Variational Auto Encoder. This is used in the above tutorial.
+
+In VAE, the encoder becomes a variational inference network that maps the data to a distribution
+for the hidden variables, and the decoder becomes a generative network that maps the latent variables back to the data.
+The network architecture shown in the tutorial uses Gaussian MLP as an encoder and Bernoulli MLP as a decoder.
diff --git a/example/vae/VAE.py b/example/vae/VAE.py
index 9de1abf07a..ba0673331d 100644
--- a/example/vae/VAE.py
+++ b/example/vae/VAE.py
@@ -21,86 +21,90 @@
 import os
 import logging
 
-
 class VAE:
-    '''This class implements the Variational Auto Encoder'''
+    """This class implements the Variational Auto Encoder"""
     
     def Bernoulli(x_hat,loss_label):
-        return(-mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(x_hat)) + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-x_hat)),axis=1))
-
-    
-    def __init__(self,n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None, initializer = mx.init.Normal(0.01),likelihood=Bernoulli):
-        
-
-        self.n_latent = n_latent                            #dimension of the latent space Z
-        self.num_hidden_ecoder = num_hidden_ecoder          #number of hidden units in the encoder
-        self.num_hidden_decoder = num_hidden_decoder        #number of hidden units in the decoder
-        self.batch_size = batch_size                        #mini batch size
-        self.learning_rate = learning_rate                  #learning rate during training
-        self.weight_decay = weight_decay                    #weight decay during training, for regulariization of parameters
-        self.num_epoch = num_epoch                          #total number of training epoch
-        self.optimizer = optimizer
-
-
-
-        #train the model
-        self.model, self.training_loss = VAE.train_vae(x_train,x_valid,batch_size,n_latent,num_hidden_ecoder,num_hidden_decoder,learning_rate,weight_decay,num_epoch,optimizer,model_prefix,likelihood,initializer)
-        #save model parameters (i.e. weights and biases)
+        return(-mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(x_hat))
+                              + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-x_hat)), axis=1))
+
+    def __init__(self, n_latent=5, num_hidden_ecoder=400, num_hidden_decoder=400, x_train=None, x_valid=None,
+                 batch_size=100, learning_rate=0.001, weight_decay=0.01, num_epoch=100, optimizer='sgd',
+                 model_prefix=None, initializer=mx.init.Normal(0.01), likelihood=Bernoulli):
+        self.n_latent = n_latent                      # dimension of the latent space Z
+        self.num_hidden_ecoder = num_hidden_ecoder    # number of hidden units in the encoder
+        self.num_hidden_decoder = num_hidden_decoder  # number of hidden units in the decoder
+        self.batch_size = batch_size                  # mini batch size
+        self.learning_rate = learning_rate            # learning rate during training
+        self.weight_decay = weight_decay              # weight decay during training, for regularization of parameters
+        self.num_epoch = num_epoch                    # total number of training epoch
+        self.optimizer = optimizer                    # 'sgd' optimizer by default
+
+        # train the model
+        self.model, self.training_loss = VAE.train_vae(x_train, x_valid, batch_size, n_latent, num_hidden_ecoder,
+                                                       num_hidden_decoder, learning_rate, weight_decay,
+                                                       num_epoch,optimizer, model_prefix, likelihood, initializer)
+
+        # save model parameters (i.e. weights and biases)
         self.arg_params = self.model.get_params()[0]
-        #save loss(ELBO) for the training set 
-        nd_iter = mx.io.NDArrayIter(data={'data':x_train},label={'loss_label':x_train},batch_size = batch_size)     
 
-        #if saved parameters, can access them at specific iteration e.g. last epoch using
+        # save loss(ELBO) for the training set
+        nd_iter = mx.io.NDArrayIter(data={'data':x_train}, label={'loss_label':x_train}, batch_size=batch_size)
+
+        # if saved parameters, can access them at specific iteration e.g. last epoch using
         #   sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, self.num_epoch)
         #   assert sym.tojson() == output.tojson()
-        #   self.arg_params = arg_params 
-    def train_vae(x_train,x_valid,batch_size,n_latent,num_hidden_ecoder,num_hidden_decoder,learning_rate,weight_decay,num_epoch,optimizer,model_prefix,likelihood,initializer):
-        [N,features] = np.shape(x_train)          #number of examples and features
+        #   self.arg_params = arg_params
+
+    @staticmethod
+    def train_vae(x_train, x_valid, batch_size, n_latent, num_hidden_ecoder, num_hidden_decoder, learning_rate,
+                  weight_decay, num_epoch, optimizer, model_prefix, likelihood, initializer):
+        [N,features] = np.shape(x_train)          # number of examples and features
+
+        # create data iterator to feed into NN
+        nd_iter = mx.io.NDArrayIter(data={'data':x_train}, label={'loss_label':x_train}, batch_size=batch_size)
 
-        #create data iterator to feed into NN
-        nd_iter = mx.io.NDArrayIter(data={'data':x_train},label={'loss_label':x_train},batch_size = batch_size)
         if x_valid is not None:
-            nd_iter_val = mx.io.NDArrayIter(data={'data':x_valid},label={'loss_label':x_valid},batch_size = batch_size)
+            nd_iter_val = mx.io.NDArrayIter(data={'data':x_valid}, label={'loss_label':x_valid}, batch_size=batch_size)
         else:
             nd_iter_val = None
+
         data = mx.sym.var('data')
         loss_label = mx.sym.var('loss_label')
 
+        # build network architecture
+        encoder_h = mx.sym.FullyConnected(data=data, name="encoder_h", num_hidden=num_hidden_ecoder)
+        act_h = mx.sym.Activation(data=encoder_h, act_type="tanh", name="activation_h")
 
-        #build network architucture
-        encoder_h  = mx.sym.FullyConnected(data=data, name="encoder_h",num_hidden=num_hidden_ecoder)
-        act_h = mx.sym.Activation(data=encoder_h, act_type="tanh",name="activation_h")
+        mu = mx.sym.FullyConnected(data=act_h, name="mu", num_hidden=n_latent)
+        logvar = mx.sym.FullyConnected(data=act_h, name="logvar", num_hidden=n_latent)
 
-        
-        mu  = mx.sym.FullyConnected(data=act_h, name="mu",num_hidden = n_latent)
-        logvar  = mx.sym.FullyConnected(data=act_h, name="logvar",num_hidden = n_latent)
-        #latent manifold
-        z = mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5*logvar),mx.symbol.random_normal(loc=0, scale=1,shape=(batch_size,n_latent))) 
-        decoder_z = mx.sym.FullyConnected(data=z, name="decoder_z",num_hidden=num_hidden_decoder)
-        act_z = mx.sym.Activation(data=decoder_z, act_type="tanh",name="actication_z")
+        # latent manifold
+        z = mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5*logvar),
+                                         mx.symbol.random_normal(loc=0, scale=1, shape=(batch_size, n_latent)))
+        decoder_z = mx.sym.FullyConnected(data=z, name="decoder_z", num_hidden=num_hidden_decoder)
+        act_z = mx.sym.Activation(data=decoder_z, act_type="tanh", name="actication_z")
 
-        decoder_x = mx.sym.FullyConnected(data=act_z, name="decoder_x",num_hidden=features)
-        act_x = mx.sym.Activation(data=decoder_x, act_type="sigmoid",name='activation_x')
+        decoder_x = mx.sym.FullyConnected(data=act_z, name="decoder_x", num_hidden=features)
+        act_x = mx.sym.Activation(data=decoder_x, act_type="sigmoid", name='activation_x')
 
-        KL = -0.5*mx.symbol.sum(1+logvar-pow( mu,2)-mx.symbol.exp(logvar),axis=1)
+        KL = -0.5 * mx.symbol.sum(1+logvar-pow(mu,2)-mx.symbol.exp(logvar), axis=1)
 
-        #compute minus ELBO to minimize 
-        loss = likelihood(act_x,loss_label)+KL
-        output = mx.symbol.MakeLoss(sum(loss),name='loss')
+        # compute minus ELBO to minimize
+        loss = likelihood(act_x, loss_label)+KL
+        output = mx.symbol.MakeLoss(sum(loss), name='loss')
 
-        #train the model
+        # train the model
         nd_iter.reset()
         logging.getLogger().setLevel(logging.DEBUG)  # logging to stdout
 
         model = mx.mod.Module(
-            symbol = output ,
+            symbol=output ,
             data_names=['data'],
-            label_names = ['loss_label'])
-
-             #initialize the weights and bias 
-
+            label_names=['loss_label'])
 
         training_loss = list()
+
         def log_to_list(period, lst):
                 def _callback(param):
                         """The checkpoint function."""
@@ -110,37 +114,40 @@ def _callback(param):
                 return _callback
 
         model.fit(nd_iter,  # train data
-                    initializer = initializer,
-                    eval_data = nd_iter_val,
-                    optimizer = optimizer,  # use SGD to train
-                    optimizer_params = {'learning_rate':learning_rate,'wd':weight_decay},  
-                    epoch_end_callback  = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),   #save parameters for each epoch if model_prefix is supplied
-                    batch_end_callback = log_to_list(int(N/batch_size),training_loss),  #this can save the training loss
-                    num_epoch = num_epoch,
-                    eval_metric = 'Loss')
+                  initializer=initializer, # initialize the weights and bias
+                  eval_data=nd_iter_val,
+                  optimizer=optimizer,  # use SGD to train
+                  optimizer_params={'learning_rate':learning_rate, 'wd':weight_decay},
+                  # save parameters for each epoch if model_prefix is supplied
+                  epoch_end_callback=None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),
+                  batch_end_callback=log_to_list(int(N/batch_size), training_loss),  # this can save the training loss
+                  num_epoch=num_epoch,
+                  eval_metric='Loss')
 
         return model,training_loss
 
-
-    def encoder(model,x):
+    @staticmethod
+    def encoder(model, x):
         params = model.arg_params
         encoder_n = np.shape(params['encoder_h_bias'].asnumpy())[0]
-        encoder_h = np.dot(params['encoder_h_weight'].asnumpy(),np.transpose(x)) + np.reshape(params['encoder_h_bias'].asnumpy(),(encoder_n,1))
+        encoder_h = np.dot(params['encoder_h_weight'].asnumpy(), np.transpose(x)) \
+                    + np.reshape(params['encoder_h_bias'].asnumpy(), (encoder_n,1))
         act_h = np.tanh(encoder_h)
         mu = np.transpose(np.dot(params['mu_weight'].asnumpy(),act_h)) + params['mu_bias'].asnumpy()
         logvar = np.transpose(np.dot(params['logvar_weight'].asnumpy(),act_h)) + params['logvar_bias'].asnumpy()
         return mu,logvar
 
-    def sampler(mu,logvar):
-        z = mu + np.multiply(np.exp(0.5*logvar),np.random.normal(loc=0, scale=1,size=np.shape(logvar))) 
+    @staticmethod
+    def sampler(mu, logvar):
+        z = mu + np.multiply(np.exp(0.5*logvar), np.random.normal(loc=0, scale=1,size=np.shape(logvar)))
         return z
 
-
-
-    def decoder(model,z):
+    @staticmethod
+    def decoder(model, z):
         params = model.arg_params
         decoder_n = np.shape(params['decoder_z_bias'].asnumpy())[0]
-        decoder_z = np.dot(params['decoder_z_weight'].asnumpy(),np.transpose(z)) + np.reshape(params['decoder_z_bias'].asnumpy(),(decoder_n,1))
+        decoder_z = np.dot(params['decoder_z_weight'].asnumpy(),np.transpose(z)) \
+                    + np.reshape(params['decoder_z_bias'].asnumpy(),(decoder_n,1))
         act_z = np.tanh(decoder_z)
         decoder_x = np.transpose(np.dot(params['decoder_x_weight'].asnumpy(),act_z)) + params['decoder_x_bias'].asnumpy()
         reconstructed_x = 1/(1+np.exp(-decoder_x))
diff --git a/example/vae/VAE_example.ipynb b/example/vae/VAE_example.ipynb
old mode 100644
new mode 100755
index c29348a58d..e7ec03afdb
--- a/example/vae/VAE_example.ipynb
+++ b/example/vae/VAE_example.ipynb
@@ -24,11 +24,28 @@
     "\n",
     "#### Xiaoyu Lu,  July 5th, 2017\n",
     "\n",
-    "This tutorial guides you through the process of building a variational encoder in MXNet. in this notebook we'll focus on an example unsing the MNIST handwritten digit recognition dataset. Refer to [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/) for more details on the model description.\n",
+    "This tutorial guides you through the process of building a variational encoder in MXNet. In this notebook we'll focus on an example using the MNIST handwritten digit recognition dataset. Refer to [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/) for more details on the model description.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
+    "To complete this tutorial, we need following python packages:\n",
     "\n",
+    "- numpy, matplotlib "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## 1. Loading the Data\n",
     "\n",
-    "We first load the MNIST dataset, which contains 60000 trainings and 10000 test examples. The following code import required modules and load the data. These images are stored in a 4-D matrix with shape (`batch_size, num_channels, width, height`). For the MNIST dataset, there is only one color channel, and both width and height are 28, so we reshape each image as a 28x28 array. See below for a visualization.\n"
+    "We first load the MNIST dataset, which contains 60000 training and 10000 test examples. The following code imports required modules and loads the data. These images are stored in a 4-D matrix with shape (`batch_size, num_channels, width, height`). For the MNIST dataset, there is only one color channel, and both width and height are 28, so we reshape each image as a 28x28 array. See below for a visualization:\n"
    ]
   },
   {
@@ -50,7 +67,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -103,7 +122,7 @@
     "## 2.  Building the Network Architecture\n",
     "\n",
     "### 2.1 Gaussian MLP as encoder\n",
-    "Next we constuct the neural network, as in the paper, we use *Multilayer Perceptron (MLP)* for both the encoder and decoder. For encoder, a Gaussian MLP is used:\n",
+    "Next we constuct the neural network, as in the [paper](https://arxiv.org/abs/1312.6114/), we use *Multilayer Perceptron (MLP)* for both the encoder and decoder. For encoder, a Gaussian MLP is used as follows:\n",
     "\n",
     "\\begin{align}\n",
     "\\log q_{\\phi}(z|x) &= \\log \\mathcal{N}(z:\\mu,\\sigma^2I) \\\\\n",
@@ -112,7 +131,7 @@
     "\\end{align}\n",
     "\n",
     "where $\\{W_1,W_2,W_3,b_1,b_2,b_3\\}$ are the weights and biases of the MLP.\n",
-    "Note below that `encoder_mu` and `encoder_logvar` are symbols, can use `get_internals()` to get the values of them, after which we can sample the latent variable $z$.\n",
+    "Note below that `encoder_mu`(`mu`) and `encoder_logvar`(`logvar`) are symbols. So, we can use `get_internals()` to get the values of them, after which we can sample the latent variable $z$.\n",
     "\n",
     "\n",
     "\n"
@@ -139,7 +158,8 @@
     "logvar  = mx.sym.FullyConnected(data=act_h, name=\"logvar\",num_hidden = 5)\n",
     "\n",
     "## sample the latent variables z according to Normal(mu,var)\n",
-    "z = mu + np.multiply(mx.symbol.exp(0.5*logvar),mx.symbol.random_normal(loc=0, scale=1,shape=np.shape(logvar.get_internals()[\"logvar_output\"])))"
+    "z = mu + np.multiply(mx.symbol.exp(0.5 * logvar), \n",
+    "                     mx.symbol.random_normal(loc=0, scale=1, shape=np.shape(logvar.get_internals()[\"logvar_output\"])))"
    ]
   },
   {
@@ -181,13 +201,13 @@
    "source": [
     "### 2.3 Joint Loss Function for the Encoder and the Decoder\n",
     "\n",
-    "The variational lower bound can be estimated as:\n",
+    "The variational lower bound also called evidence lower bound (ELBO) can be estimated as:\n",
     "\n",
     "\\begin{align}\n",
     "\\mathcal{L}(\\theta,\\phi;x_{(i)}) \\approx \\frac{1}{2}\\left(1+\\log ((\\sigma_j^{(i)})^2)-(\\mu_j^{(i)})^2-(\\sigma_j^{(i)})^2\\right) + \\log p_\\theta(x^{(i)}|z^{(i)})\n",
     "\\end{align}\n",
     "\n",
-    "where the first term is the KL divergence of the approximate posterior from the prior, and the second term is an expected negative reconstruction error. We would like to maximize this lower bound, so we can define the loss to be $-\\mathcal{L}$ for MXNet to minimize."
+    "where the first term is the KL divergence of the approximate posterior from the prior, and the second term is an expected negative reconstruction error. We would like to maximize this lower bound, so we can define the loss to be $-\\mathcal{L}$(minus ELBO) for MXNet to minimize."
    ]
   },
   {
@@ -200,7 +220,8 @@
    "source": [
     "# define the objective loss function that needs to be minimized\n",
     "KL = 0.5*mx.symbol.sum(1+logvar-pow( mu,2)-mx.symbol.exp(logvar),axis=1)\n",
-    "loss = -mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(y)) + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-y)),axis=1)-KL\n",
+    "loss = -mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(y)) \n",
+    "                      + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-y)),axis=1)-KL\n",
     "output = mx.symbol.MakeLoss(sum(loss),name='loss')"
    ]
   },
@@ -209,7 +230,10 @@
    "metadata": {},
    "source": [
     "## 3. Training the model\n",
-    "Now we can define the model and train it, we initilize the weights and the biases to be Gaussian(0,0.01), and use stochastic gradient descent for optimization. To warm start the training, one may initilize with pre-trainined parameters `arg_params` using `init=mx.initializer.Load(arg_params)`. To save intermediate results, we can optionally use `epoch_end_callback  = mx.callback.do_checkpoint(model_prefix, 1)` which saves the parameters to the path given by model_prefix, and with period every $1$ epoch. To assess the performance, we output minus ELBO $-\\mathcal{L}$ after each epoch, with the command `eval_metric = 'Loss'`. We can plot the training loss for mini batches by accessing the log and save it to a list, then parse it to the argument `batch_end_callback`."
+    "\n",
+    "Now, we can define the model and train it. First we will initilize the weights and the biases to be Gaussian(0,0.01), and then use stochastic gradient descent for optimization. To warm start the training, one may also initilize with pre-trainined parameters `arg_params` using `init=mx.initializer.Load(arg_params)`. \n",
+    "\n",
+    "To save intermediate results, we can optionally use `epoch_end_callback = mx.callback.do_checkpoint(model_prefix, 1)` which saves the parameters to the path given by model_prefix, and with period every $1$ epoch. To assess the performance, we output $-\\mathcal{L}$(minus ELBO) after each epoch, with the command `eval_metric = 'Loss'` which is defined above. We will also plot the training loss for mini batches by accessing the log and saving it to a list, and then parsing it to the argument `batch_end_callback`."
    ]
   },
   {
@@ -224,7 +248,7 @@
     "nd_iter.reset()\n",
     "logging.getLogger().setLevel(logging.DEBUG)  \n",
     "\n",
-    "#define function to trave back training loss\n",
+    "# define function to trave back training loss\n",
     "def log_to_list(period, lst):\n",
     "    def _callback(param):\n",
     "        \"\"\"The checkpoint function.\"\"\"\n",
@@ -243,7 +267,9 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -459,21 +485,24 @@
     "# initilize the parameters for training using Normal.\n",
     "init = mx.init.Normal(0.01)\n",
     "model.fit(nd_iter,  # train data\n",
-    "              initializer=init,\n",
-    "              #if eval_data is supplied, test loss will also be reported\n",
-    "              #eval_data = nd_iter_test,\n",
-    "              optimizer='sgd',  # use SGD to train\n",
-    "              optimizer_params={'learning_rate':1e-3,'wd':1e-2},  \n",
-    "              epoch_end_callback  = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),   #save parameters for each epoch if model_prefix is supplied\n",
-    "              batch_end_callback = log_to_list(N/batch_size,training_loss), \n",
-    "              num_epoch=100,\n",
-    "              eval_metric = 'Loss')"
+    "          initializer=init,\n",
+    "          # if eval_data is supplied, test loss will also be reported\n",
+    "          # eval_data = nd_iter_test,\n",
+    "          optimizer='sgd',  # use SGD to train\n",
+    "          optimizer_params={'learning_rate':1e-3,'wd':1e-2},  \n",
+    "          # save parameters for each epoch if model_prefix is supplied\n",
+    "          epoch_end_callback = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),\n",
+    "          batch_end_callback = log_to_list(N/batch_size,training_loss), \n",
+    "          num_epoch=100,\n",
+    "          eval_metric = 'Loss')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 23,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -497,7 +526,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As expected, the ELBO is monotonically increasing over epoch, and we reproduced the resutls given in the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/). Now we can extract/load the parameters and then feed the network forward to calculate $y$ which is the reconstructed image, and we can also calculate the ELBO for the test set. "
+    "As expected, the ELBO is monotonically increasing over epoch, and we reproduced the results given in the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/). Now we can extract/load the parameters and then feed the network forward to calculate $y$ which is the reconstructed image, and we can also calculate the ELBO for the test set. "
    ]
   },
   {
@@ -510,9 +539,9 @@
    "source": [
     "arg_params = model.get_params()[0]\n",
     "\n",
-    "#if saved the parameters, can load them at e.g. 100th epoch\n",
-    "#sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 100)\n",
-    "#assert sym.tojson() == output.tojson()\n",
+    "# if saved the parameters, can load them using `load_checkpoint` method at e.g. 100th epoch\n",
+    "# sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 100)\n",
+    "# assert sym.tojson() == output.tojson()\n",
     "\n",
     "e = y.bind(mx.cpu(), {'data': nd_iter_test.data[0][1],\n",
     "                     'encoder_h_weight': arg_params['encoder_h_weight'],\n",
@@ -535,6 +564,7 @@
    "cell_type": "code",
    "execution_count": 78,
    "metadata": {
+    "collapsed": false,
     "scrolled": true
    },
    "outputs": [
@@ -566,7 +596,9 @@
   {
    "cell_type": "code",
    "execution_count": 37,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -580,7 +612,7 @@
     }
    ],
    "source": [
-    "#calculate the ELBO which is minus the loss for test set\n",
+    "# calculate the ELBO which is minus the loss for test set\n",
     "metric = mx.metric.Loss()\n",
     "model.score(nd_iter_test, metric)"
    ]
@@ -607,14 +639,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "One can directly call the class `VAE` to do the training. The outputs are the learned model and training loss.\n",
-    "```VAE(n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None, initializer = mx.init.Normal(0.01),likelihood=Bernoulli)```"
+    "One can directly call the class `VAE` to do the training:\n",
+    "\n",
+    "```VAE(n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,\n",
+    "batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None,\n",
+    "initializer = mx.init.Normal(0.01),likelihood=Bernoulli)```\n",
+    "\n",
+    "The outputs are the learned model and training loss."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -830,13 +869,7 @@
       "INFO:root:Epoch[103] Time cost=10.267\n",
       "INFO:root:Epoch[104] Train-loss=168.181174\n",
       "INFO:root:Epoch[104] Time cost=11.132\n",
-      "INFO:root:Epoch[105] Train-loss=168.021498\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "INFO:root:Epoch[105] Train-loss=168.021498\n",
       "INFO:root:Epoch[105] Time cost=10.187\n",
       "INFO:root:Epoch[106] Train-loss=167.858251\n",
       "INFO:root:Epoch[106] Time cost=10.676\n",
@@ -1030,11 +1063,11 @@
     }
    ],
    "source": [
-    "# can initilize weights and biases with the learned parameters \n",
-    "#init = mx.initializer.Load(params)\n",
+    "# can initilize weights and biases with the learned parameters as follows: \n",
+    "# init = mx.initializer.Load(params)\n",
     "\n",
-    "# call the VAE , output model contains the learned model and training loss\n",
-    "out = VAE(n_latent=2,x_train=image,x_valid=None,num_epoch=200) "
+    "# call the VAE, output model contains the learned model and training loss\n",
+    "out = VAE(n_latent=2, x_train=image, x_valid=None, num_epoch=200) "
    ]
   },
   {
@@ -1047,7 +1080,7 @@
    "source": [
     "# encode test images to obtain mu and logvar which are used for sampling\n",
     "[mu,logvar] = VAE.encoder(out,image_test)\n",
-    "#sample in the latent space\n",
+    "# sample in the latent space\n",
     "z = VAE.sampler(mu,logvar)\n",
     "# decode from the latent space to obtain reconstructed images\n",
     "x_construction = VAE.decoder(out,z)\n"
@@ -1056,7 +1089,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -1085,7 +1120,9 @@
   {
    "cell_type": "code",
    "execution_count": 78,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -1145,21 +1182,21 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [Root]",
    "language": "python",
-   "name": "python3"
+   "name": "Python [Root]"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 3
+    "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.1"
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 7c136a6470..84b2fea712 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -109,11 +109,11 @@
 #endif
 
 /*! \brief major version */
-#define MXNET_MAJOR 0
+#define MXNET_MAJOR 1
 /*! \brief minor version */
-#define MXNET_MINOR 12
+#define MXNET_MINOR 0
 /*! \brief patch version */
-#define MXNET_PATCH 1
+#define MXNET_PATCH 0
 /*! \brief mxnet version */
 #define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH)
 /*! \brief helper for making version number */
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index faa453529e..d34b194554 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1051,6 +1051,16 @@ MXNET_DLL int MXSymbolListArguments(SymbolHandle symbol,
 MXNET_DLL int MXSymbolListOutputs(SymbolHandle symbol,
                                   mx_uint *out_size,
                                   const char ***out_str_array);
+
+/*!
+ * \brief Get number of outputs of the symbol.
+ * \param symbol The symbol
+ * \param out_size number of outputs
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXSymbolGetNumOutputs(SymbolHandle symbol,
+                                     mx_uint *output_count);
+
 /*!
  * \brief Get a symbol that contains all the internals.
  * \param symbol The symbol
@@ -1077,6 +1087,7 @@ MXNET_DLL int MXSymbolGetChildren(SymbolHandle symbol,
 MXNET_DLL int MXSymbolGetOutput(SymbolHandle symbol,
                                 mx_uint index,
                                 SymbolHandle *out);
+
 /*!
  * \brief List auxiliary states in the symbol.
  * \param symbol the symbol
diff --git a/make/config.mk b/make/config.mk
index 6db22df0c8..9f7564b88f 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -33,7 +33,7 @@ DEBUG = 0
 # whether compile with profiler
 USE_PROFILER =
 
-# whether to turn on signal handler (e.g. segfault logger)
+# whether to turn on segfault signal handler to log the stack trace
 USE_SIGNAL_HANDLER =
 
 # the additional link flags you want to add
@@ -193,11 +193,6 @@ USE_CPP_PACKAGE = 0
 # CAFFE_PATH = $(HOME)/caffe
 # MXNET_PLUGINS += plugin/caffe/caffe.mk
 
-# whether to use torch integration. This requires installing torch.
-# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
-# TORCH_PATH = $(HOME)/torch
-# MXNET_PLUGINS += plugin/torch/torch.mk
-
 # WARPCTC_PATH = $(HOME)/warp-ctc
 # MXNET_PLUGINS += plugin/warpctc/warpctc.mk
 
diff --git a/mshadow b/mshadow
index 2d7780c3f2..984a3a7c25 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 2d7780c3f2eefe4453fa419862d1b2089bedb8d5
+Subproject commit 984a3a7c253a9b590c17206f8d926bdaafdea997
diff --git a/nnvm b/nnvm
index 8d79cfd0b4..7a052d6784 160000
--- a/nnvm
+++ b/nnvm
@@ -1 +1 @@
-Subproject commit 8d79cfd0b42fbe9f6ad75886d495065d5500b9dd
+Subproject commit 7a052d678455f1c96538c1cc5a25f11115363558
diff --git a/perl-package/test.sh b/perl-package/test.sh
index 1a4bd7227a..417e00a03c 100755
--- a/perl-package/test.sh
+++ b/perl-package/test.sh
@@ -29,4 +29,4 @@ make install || exit -1
 
 cd ${MXNET_HOME}/perl-package/AI-MXNet/
 perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5
-make test || exit -1
+make test TEST_VERBOSE=1 || exit -1 # Add debug output to test log
diff --git a/python/README.md b/python/README.md
index 3a0b25c1cf..c1aaa580af 100644
--- a/python/README.md
+++ b/python/README.md
@@ -1,22 +1,21 @@
 MXNet Python Package
 ====================
-MXNet is a deep learning framework designed for both *efficiency* and *flexibility*.
-It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity.
+This directory and nested files contain MXNet Python package and language binding.
 
+## Installation
+To install MXNet Python package, visit MXNet [Install Instruction](http://mxnet.incubator.apache.org/install/index.html)
 
-Installation
-------------
-To install, check [Build Instruction](http://mxnet.io/get_started/install.html)
 
+## Running the unit tests
 
-Running the tests
------------------
-
-For running the tests you can do the following in the parent directory.
+For running unit tests, you will need the [nose PyPi package](https://pypi.python.org/pypi/nose). To install:
+```bash
+pip install --upgrade nose
+```
 
+Once ```nose``` is installed, run the following from MXNet root directory:
 ```
 nosetests tests/python/unittest
 nosetests tests/python/train
 
-```
-
+```
\ No newline at end of file
diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py
index 340a9e66f4..cc9cad86da 100644
--- a/python/mxnet/autograd.py
+++ b/python/mxnet/autograd.py
@@ -372,7 +372,7 @@ class Function(object):
 
     For example, a stable sigmoid function can be defined as::
 
-        class sigmoid(Function):
+        class sigmoid(mx.autograd.Function):
             def forward(self, x):
                 y = 1 / (1 + mx.nd.exp(-x))
                 self.save_for_backward(y)
@@ -383,6 +383,18 @@ def backward(self, dy):
                 # and returns as many NDArrays as forward's arguments.
                 y, = self.saved_tensors
                 return y * (1-y)
+
+    Then, the function can be used in the following way::
+
+        func = sigmoid()
+        x = mx.nd.random.uniform(shape=(10,))
+        x.attach_grad()
+
+        with mx.autograd.record():
+            m = func(x)
+            m.backward()
+        dx = x.grad.asnumpy()
+
     """
     _bwd_functype = CFUNCTYPE(c_int, c_int, c_int, POINTER(c_void_p),
                               POINTER(c_int), c_int, c_void_p)
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 80fc9011db..fbdf15f4f9 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-# pylint: disable=invalid-name, no-member
+# pylint: disable=invalid-name, no-member, trailing-comma-tuple
 """ctypes library of mxnet and helper functions."""
 from __future__ import absolute_import
 
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index beb228ec24..02aa5c0414 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -25,9 +25,7 @@
 from multiprocessing.reduction import ForkingPickler
 import pickle
 import io
-import os
 import sys
-import warnings
 import numpy as np
 
 from . import sampler as _sampler
@@ -52,7 +50,7 @@ class ConnectionWrapper(object):
     NDArray via shared memory."""
 
     def __init__(self, conn):
-        self.conn = conn
+        self._conn = conn
 
     def send(self, obj):
         """Send object"""
@@ -67,7 +65,8 @@ def recv(self):
 
     def __getattr__(self, name):
         """Emmulate conn"""
-        return getattr(self.conn, name)
+        attr = self.__dict__.get('_conn', None)
+        return getattr(attr, name)
 
 
 class Queue(multiprocessing.queues.Queue):
@@ -188,9 +187,6 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
                              "not be specified if batch_sampler is specified.")
 
         self._batch_sampler = batch_sampler
-        if num_workers > 0 and os.name == 'nt':
-            warnings.warn("DataLoader does not support num_workers > 0 on Windows yet.")
-            num_workers = 0
         self._num_workers = num_workers
         if batchify_fn is None:
             if num_workers > 0:
@@ -220,10 +216,11 @@ def __iter__(self):
 
         for idx, batch in enumerate(self._batch_sampler):
             key_queue.put((idx, batch))
+        num_batches = idx + 1
 
         data_buffer = {}
         curr_idx = 0
-        for _ in range(len(self._batch_sampler)):
+        for _ in range(num_batches):
             idx, batch = data_queue.get()
             data_buffer[idx] = batch
             while curr_idx in data_buffer:
diff --git a/python/mxnet/gluon/data/dataset.py b/python/mxnet/gluon/data/dataset.py
index 059c2a61c7..2c46f1ebee 100644
--- a/python/mxnet/gluon/data/dataset.py
+++ b/python/mxnet/gluon/data/dataset.py
@@ -55,8 +55,8 @@ def __init__(self, *args):
         self._data = []
         for i, data in enumerate(args):
             assert len(data) == self._length, \
-                "All arrays must have the same length. But the first has %s " \
-                "while the %d-th has %d."%(length, i+1, len(data))
+                "All arrays must have the same length; array[0] has length %d " \
+                "while array[%d] has %d." % (self._length, i+1, len(data))
             if isinstance(data, ndarray.NDArray) and len(data.shape) == 1:
                 data = data.asnumpy()
             self._data.append(data)
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 537d6365a4..fa38285911 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -117,7 +117,7 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t,
 
     def __repr__(self):
         s = 'Parameter {name} (shape={shape}, dtype={dtype})'
-        return s.format(**self.__dict__)
+        return s.format(name=self.name, shape=self.shape, dtype=self.dtype)
 
     @property
     def grad_req(self):
diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index 6ed9c3a974..fbcb4355a6 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -20,9 +20,11 @@
 
 from __future__ import absolute_import, print_function
 
-import random
-import logging
 import json
+import logging
+import random
+import warnings
+
 import numpy as np
 
 from ..base import numeric_types
@@ -193,10 +195,10 @@ def __init__(self, min_object_covered=0.1, aspect_ratio_range=(0.75, 1.33),
         self.area_range = area_range
         self.enabled = False
         if (area_range[1] <= 0 or area_range[0] > area_range[1]):
-            logging.warn('Skip DetRandomCropAug due to invalid area_range: %s', area_range)
+            warnings.warn('Skip DetRandomCropAug due to invalid area_range: %s', area_range)
         elif (aspect_ratio_range[0] > aspect_ratio_range[1] or aspect_ratio_range[0] <= 0):
-            logging.warn('Skip DetRandomCropAug due to invalid aspect_ratio_range: %s',
-                         aspect_ratio_range)
+            warnings.warn('Skip DetRandomCropAug due to invalid aspect_ratio_range: %s',
+                          aspect_ratio_range)
         else:
             self.enabled = True
 
@@ -359,10 +361,10 @@ def __init__(self, aspect_ratio_range=(0.75, 1.33), area_range=(1.0, 3.0),
         self.max_attempts = max_attempts
         self.enabled = False
         if (area_range[1] <= 1.0 or area_range[0] > area_range[1]):
-            logging.warn('Skip DetRandomPadAug due to invalid parameters: %s', area_range)
+            warnings.warn('Skip DetRandomPadAug due to invalid parameters: %s', area_range)
         elif (aspect_ratio_range[0] <= 0 or aspect_ratio_range[0] > aspect_ratio_range[1]):
-            logging.warn('Skip DetRandomPadAug due to invalid aspect_ratio_range: %s',
-                         aspect_ratio_range)
+            warnings.warn('Skip DetRandomPadAug due to invalid aspect_ratio_range: %s',
+                          aspect_ratio_range)
         else:
             self.enabled = True
 
@@ -840,7 +842,7 @@ def draw_next(self, color=None, thickness=2, mean=None, std=None, clip=True,
         try:
             import cv2
         except ImportError as e:
-            logging.warn('Unable to import cv2, skip drawing: %s', str(e))
+            warnings.warn('Unable to import cv2, skip drawing: %s', str(e))
             raise StopIteration
         count = 0
         try:
diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index ad598d032d..b2a4beaf93 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -408,10 +408,13 @@ def set_gradient_compression(self, compression_params):
             Other keys in this dictionary are optional and specific to the type
             of gradient compression.
         """
-        ckeys, cvals = _ctype_dict(compression_params)
-        check_call(_LIB.MXKVStoreSetGradientCompression(self.handle,
-                                                        mx_uint(len(compression_params)),
-                                                        ckeys, cvals))
+        if ('device' in self.type) or ('dist' in self.type):
+            ckeys, cvals = _ctype_dict(compression_params)
+            check_call(_LIB.MXKVStoreSetGradientCompression(self.handle,
+                                                            mx_uint(len(compression_params)),
+                                                            ckeys, cvals))
+        else:
+            raise Exception('Gradient compression is not supported for this type of kvstore')
 
     def set_optimizer(self, optimizer):
         """ Registers an optimizer with the kvstore.
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 551c786eff..26e39c93c0 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -61,4 +61,4 @@ def find_lib_path():
 
 
 # current version
-__version__ = "0.12.1"
+__version__ = "1.0.0"
diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index 229044e289..700dee0b07 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -658,8 +658,7 @@ def __setitem__(self, key, value):
                 if value.handle is not self.handle:
                     value.copyto(self)
             elif isinstance(value, numeric_types):
-                raise ValueError("Assigning numeric types to RowSparseNDArray " \
-                                 "is not implemented yet.")
+                _internal._set_value(float(value), out=self)
             elif isinstance(value, (np.ndarray, np.generic)):
                 warnings.warn('Assigning non-NDArray object to RowSparseNDArray is not efficient',
                               RuntimeWarning)
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 013455614f..7e8e7c2937 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -20,7 +20,6 @@
 """Weight updating functions."""
 import math
 import pickle
-import logging
 import warnings
 import numpy
 from .base import py_str
@@ -129,11 +128,10 @@ def register(klass):
         assert(isinstance(klass, type))
         name = klass.__name__.lower()
         if name in Optimizer.opt_registry:
-            logging.warning('WARNING: New optimizer %s.%s is overriding '
-                            'existing optimizer %s.%s',
-                            klass.__module__, klass.__name__,
-                            Optimizer.opt_registry[name].__module__,
-                            Optimizer.opt_registry[name].__name__)
+            warnings.warn('WARNING: New optimizer %s.%s is overriding existing '
+                          'optimizer %s.%s', klass.__module__, klass.__name__,
+                          Optimizer.opt_registry[name].__module__,
+                          Optimizer.opt_registry[name].__name__)
         Optimizer.opt_registry[name] = klass
         return klass
 
@@ -1100,7 +1098,7 @@ def update(self, index, weight, grad, state):
         t = self._index_update_count[index]
 
         # preprocess grad
-        grad *= self.rescale_grad + wd * weight
+        grad = grad * self.rescale_grad + wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
 
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index ce7776d948..22212b0bdb 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -491,14 +491,16 @@ def __getitem__(self, index):
             Indexing key
 
         """
-        output_names = self.list_outputs()
+        output_count = len(self)
         if isinstance(index, py_slice):
             start = 0 if index.start is None else index.start
-            stop = len(output_names) if index.stop is None else index.stop
+            stop = output_count if index.stop is None else index.stop
             step = 1 if index.step is None else index.step
             return Group([self[i] for i in range(start, stop, step)])
 
         if isinstance(index, string_types):
+            # Returning this list of names is expensive. Some symbols may have hundreds of outputs
+            output_names = self.list_outputs()
             idx = None
             for i, name in enumerate(output_names):
                 if name == index:
@@ -511,7 +513,7 @@ def __getitem__(self, index):
 
         if not isinstance(index, int):
             raise TypeError('Symbol only support integer index to fetch i-th output')
-        if index >= len(output_names):
+        if index >= output_count:
             # Important, python determines the end by this exception
             raise IndexError
         handle = SymbolHandle()
@@ -745,6 +747,25 @@ def list_outputs(self):
             self.handle, ctypes.byref(size), ctypes.byref(sarr)))
         return [py_str(sarr[i]) for i in range(size.value)]
 
+    def __len__(self):
+        """Get number of outputs for the symbol.
+
+        Example
+        -------
+        >>> a = mx.sym.var('a')
+        >>> b = mx.sym.var('b')
+        >>> c = a + b
+        >>> len(c)
+
+        Returns
+        -------
+        len(self): Number of outputs
+            Number of outputs
+        """
+        output_count = mx_uint()
+        check_call(_LIB.MXSymbolGetNumOutputs(self.handle, ctypes.byref(output_count)))
+        return output_count.value
+
     def list_auxiliary_states(self):
         """Lists all the auxiliary states in the symbol.
 
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 3e667364cd..53814b766f 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -29,6 +29,7 @@
 import errno
 import logging
 import bz2
+import zipfile
 from contextlib import contextmanager
 import numpy as np
 import numpy.testing as npt
@@ -1440,6 +1441,31 @@ def read_data(label_url, image_url):
     return {'train_data':train_img, 'train_label':train_lbl,
             'test_data':test_img, 'test_label':test_lbl}
 
+def get_zip_data(data_dir, url, data_origin_name):
+    """Download and extract zip data.
+
+    Parameters
+    ----------
+
+    data_dir : str
+        Absolute or relative path of the directory name to store zip files
+    url : str
+        URL to download data from
+    data_origin_name : str
+        Name of the downloaded zip file
+
+    Examples
+    --------
+    >>> get_zip_data("data_dir",
+                     "http://files.grouplens.org/datasets/movielens/ml-10m.zip",
+                     "ml-10m.zip")
+    """
+    data_origin_name = os.path.join(data_dir, data_origin_name)
+    if not os.path.exists(data_origin_name):
+        download(url, dirname=data_dir, overwrite=False)
+        zip_file = zipfile.ZipFile(data_origin_name)
+        zip_file.extractall(path=data_dir)
+
 def get_bz2_data(data_dir, data_name, url, data_origin_name):
     """Download and extract bz2 data.
 
@@ -1465,14 +1491,12 @@ def get_bz2_data(data_dir, data_name, url, data_origin_name):
     data_name = os.path.join(data_dir, data_name)
     data_origin_name = os.path.join(data_dir, data_origin_name)
     if not os.path.exists(data_name):
-        download(url, dirname=data_dir, overwrite=False)
+        download(url, fname=data_origin_name, dirname=data_dir, overwrite=False)
         bz_file = bz2.BZ2File(data_origin_name, 'rb')
         with open(data_name, 'wb') as fout:
-            try:
-                content = bz_file.read()
-                fout.write(content)
-            finally:
-                bz_file.close()
+            for line in bz_file:
+                fout.write(line)
+            bz_file.close()
         os.remove(data_origin_name)
 
 def set_env_var(key, val, default_val=""):
@@ -1538,3 +1562,34 @@ def discard_stderr():
     finally:
         os.dup2(old_stderr, stderr_fileno)
         bit_bucket.close()
+
+class DummyIter(mx.io.DataIter):
+    """A dummy iterator that always returns the same batch of data
+    (the first data batch of the real data iter). This is usually used for speed testing.
+
+    Parameters
+    ----------
+    real_iter: mx.io.DataIter
+        The real data iterator where the first batch of data comes from
+    """
+    def __init__(self, real_iter):
+        super(DummyIter, self).__init__()
+        self.real_iter = real_iter
+        self.provide_data = real_iter.provide_data
+        self.provide_label = real_iter.provide_label
+        self.batch_size = real_iter.batch_size
+        self.the_batch = next(real_iter)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Get a data batch from iterator. The first data batch of real iter is always returned.
+        StopIteration will never be raised.
+
+        Returns
+        -------
+        DataBatch
+            The data of next batch.
+        """
+        return self.the_batch
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
index f15a7e315d..10f5d39638 100644
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
index 81e4d1ec59..9c9af8422d 100644
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
index 5e6cb8c7f6..986245b5cb 100644
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,12 +18,12 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jnilib</type>
     </dependency>
   </dependencies>
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index b27630db8e..aa9513a480 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 361833685e..5fede4a9af 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -71,13 +71,13 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
   </dependencies>
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 9ad10c9de7..b6bdc7b6d8 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -118,7 +118,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
index 983135d911..e9d9ac3ead 100644
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ b/scala-package/init-native/linux-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index 2ca851baa4..663528012a 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index 4ae2426bbf..37ecbb8f22 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index eed3aee82a..08b6119553 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
 <!--  <relativePath>../pom.xml</relativePath>-->
   </parent>
 
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index 76f2438d38..e905f2eaa4 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -41,13 +41,13 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
       <type>${libtype}</type>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
index 47194069e4..4ea7eea9cb 100644
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ b/scala-package/native/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
index 5e038b2d5a..b8b3ec19d0 100644
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ b/scala-package/native/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 227ad24f8f..0b09c1b5e1 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index ffb8740239..5013fe9348 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index a91fbe4420..68eb598012 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -5,7 +5,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>ml.dmlc.mxnet</groupId>
   <artifactId>mxnet-parent_2.11</artifactId>
-  <version>0.12.1-SNAPSHOT</version>
+  <version>1.0.0-SNAPSHOT</version>
   <name>MXNet Scala Package - Parent</name>
   <url>https://github.com/dmlc/mxnet/tree/master/scala-package</url>
   <description>MXNet Scala Package</description>
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index 22114fb0a4..98d724969b 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>0.12.1-SNAPSHOT</version>
+    <version>1.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -21,7 +21,7 @@
     <dependency>
       <groupId>ml.dmlc.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>0.12.1-SNAPSHOT</version>
+      <version>1.0.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/setup-utils/install-mxnet-osx-python.sh b/setup-utils/install-mxnet-osx-python.sh
index 25a44796cb..370e343891 100755
--- a/setup-utils/install-mxnet-osx-python.sh
+++ b/setup-utils/install-mxnet-osx-python.sh
@@ -33,7 +33,7 @@ then
 	# TODO: Change this to latest tag
 	#       to avoid updating this value for every release
 	#
-	export MXNET_TAG="0.12.0"
+	export MXNET_TAG="0.12.1"
 fi
 
 export TARIKH=`/bin/date +%Y-%m-%d-%H:%M:%S`
diff --git a/snapcraft.yaml b/snapcraft.yaml
index de68a8077f..bbc8087a74 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,5 +1,5 @@
 name: mxnet
-version: '0.12.1'
+version: '1.0.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
 description: |
   MXNet is a deep learning framework designed for both efficiency and 
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 51f30e2231..d67d52c3dd 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -191,7 +191,6 @@ int MXInvokeCachedOp(CachedOpHandle handle,
                      NDArrayHandle *inputs,
                      int *num_outputs,
                      NDArrayHandle **outputs) {
-  static const auto cached_op = nnvm::Op::Get("_CachedOp");
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
 
   API_BEGIN();
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index dad71b0816..3668af0600 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -310,6 +310,11 @@ int MXSymbolListOutputs(SymbolHandle symbol,
   return NNSymbolListOutputNames(symbol, out_size, out_str_array);
 }
 
+int MXSymbolGetNumOutputs(SymbolHandle symbol,
+                           mx_uint *output_count) {
+  return NNSymbolGetNumOutputs(symbol, output_count);
+}
+
 int MXSymbolCompose(SymbolHandle sym,
                     const char *name,
                     mx_uint num_args,
diff --git a/src/engine/profiler.cc b/src/engine/profiler.cc
index 44ad138277..13f8cca37b 100644
--- a/src/engine/profiler.cc
+++ b/src/engine/profiler.cc
@@ -24,12 +24,8 @@
  */
 #include <dmlc/base.h>
 #include <dmlc/logging.h>
+#include <dmlc/omp.h>
 #include <mxnet/base.h>
-#include <set>
-#include <map>
-#include <mutex>
-#include <chrono>
-#include <iostream>
 #include <fstream>
 #include <thread>
 #include "./profiler.h"
@@ -44,7 +40,6 @@
 
 namespace mxnet {
 namespace engine {
-const int INITIAL_SIZE = 1024;
 
 Profiler::Profiler()
   : state_(kNotRunning), enable_output_(false), filename_("profile.json") {
@@ -59,14 +54,13 @@ Profiler::Profiler()
 #endif
 
   this->profile_stat = new DevStat[cpu_num_ + gpu_num_ + 1];
-  this->profile_stat->opr_exec_stats.reserve(INITIAL_SIZE);
   for (unsigned int i = 0; i < cpu_num_; ++i) {
-    profile_stat[i].dev_name = "cpu/" + std::to_string(i);
+    profile_stat[i].dev_name_ = "cpu/" + std::to_string(i);
   }
   for (unsigned int i = 0; i < gpu_num_; ++i) {
-    profile_stat[cpu_num_ + i].dev_name = "gpu/" + std::to_string(i);
+    profile_stat[cpu_num_ + i].dev_name_ = "gpu/" + std::to_string(i);
   }
-  profile_stat[cpu_num_ + gpu_num_].dev_name = "cpu pinned/";
+  profile_stat[cpu_num_ + gpu_num_].dev_name_ = "cpu pinned/";
 
   mode_ = (ProfilerMode)dmlc::GetEnv("MXNET_PROFILER_MODE", static_cast<int>(kOnlySymbolic));
   if (dmlc::GetEnv("MXNET_PROFILER_AUTOSTART", 0)) {
@@ -99,7 +93,7 @@ void Profiler::SetConfig(ProfilerMode mode, std::string output_filename) {
 }
 
 OprExecStat *Profiler::AddOprStat(int dev_type, uint32_t dev_id) {
-  OprExecStat* opr_stat = new OprExecStat;
+  std::unique_ptr<OprExecStat> opr_stat(new OprExecStat);
   opr_stat->dev_type = dev_type;
   opr_stat->dev_id   = dev_id;
   opr_stat->opr_name[sizeof(opr_stat->opr_name)-1] = '\0';
@@ -116,16 +110,13 @@ OprExecStat *Profiler::AddOprStat(int dev_type, uint32_t dev_id) {
       idx = cpu_num_ + gpu_num_;
       break;
     default:
-      LOG(FATAL) << "Unkown dev_type";
+      LOG(FATAL) << "Unknown dev_type: " << dev_type;
       return NULL;
   }
 
   DevStat& dev_stat = profile_stat[idx];
-  {
-    std::lock_guard<std::mutex> lock{dev_stat.m_};
-    dev_stat.opr_exec_stats.push_back(opr_stat);
-  }
-  return opr_stat;
+  dev_stat.opr_exec_stats_->enqueue(opr_stat.get());
+  return opr_stat.release();
 }
 
 void Profiler::EmitPid(std::ostream *os, const std::string& name, uint32_t pid) {
@@ -167,19 +158,17 @@ void Profiler::DumpProfile() {
 
   for (uint32_t i = 0; i < dev_num; ++i) {
     const DevStat &d = profile_stat[i];
-    this->EmitPid(&file, d.dev_name, i);
+    this->EmitPid(&file, d.dev_name_, i);
     file << ",\n";
   }
 
   bool first_flag = true;
   for (uint32_t i = 0; i < dev_num; ++i) {
     DevStat &d = profile_stat[i];
-    std::lock_guard<std::mutex> lock(d.m_);
-    uint32_t opr_num = d.opr_exec_stats.size();
-
-    for (uint32_t j = 0; j < opr_num; ++j) {
-      const OprExecStat* opr_stat = d.opr_exec_stats[j];
-
+    OprExecStat *_opr_stat;
+    while (d.opr_exec_stats_->try_dequeue(_opr_stat)) {
+      CHECK_NOTNULL(_opr_stat);
+      std::unique_ptr<OprExecStat> opr_stat(_opr_stat);  // manage lifecycle
       uint32_t pid = i;
       uint32_t tid = opr_stat->thread_id;
 
@@ -190,10 +179,10 @@ void Profiler::DumpProfile() {
       }
       file << std::endl;
       this->EmitEvent(&file, opr_stat->opr_name, "category", "B",
-            opr_stat->opr_start_rel_micros, pid, tid);
+                      opr_stat->opr_start_rel_micros, pid, tid);
       file << ",\n";
       this->EmitEvent(&file, opr_stat->opr_name, "category", "E",
-            opr_stat->opr_end_rel_micros, pid, tid);
+                      opr_stat->opr_end_rel_micros, pid, tid);
     }
   }
 
diff --git a/src/engine/profiler.h b/src/engine/profiler.h
index dbbc773351..ebd942036c 100644
--- a/src/engine/profiler.h
+++ b/src/engine/profiler.h
@@ -25,6 +25,7 @@
 #ifndef MXNET_ENGINE_PROFILER_H_
 #define MXNET_ENGINE_PROFILER_H_
 
+#include <dmlc/concurrentqueue.h>
 #include <vector>
 #include <string>
 #include <mutex>
@@ -65,11 +66,24 @@ struct OprExecStat {
  */
 struct DevStat {
   /*! \brief device name */
-  std::string dev_name;
+  std::string dev_name_;
   /*! \brief operation execution statistics on this device */
-  std::vector<OprExecStat*> opr_exec_stats;
-  /*! \brief internal mutex of the execution state */
-  std::mutex m_;
+  std::shared_ptr<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>> opr_exec_stats_ =
+    std::make_shared<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>>();
+
+  /*!
+   * \brief Destructor, clean up allocated objects
+   *        TODO(cjolivier01) Investigate queueing unique_ptr<>'s if it won't hurt performance
+   */
+  ~DevStat() {
+    std::shared_ptr<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>> es = opr_exec_stats_;
+    if (es) {
+      OprExecStat *opr_stat = nullptr;
+      while (es->try_dequeue(opr_stat)) {
+        delete opr_stat;
+      }
+    }
+  }
 };
 
 
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index 28bc92f7b2..e7e222f6cb 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -95,9 +95,10 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     int cpu_priority_nthreads = dmlc::GetEnv("MXNET_CPU_PRIORITY_NTHREADS", 4);
     cpu_priority_worker_.reset(new ThreadWorkerBlock<kPriorityQueue>());
     cpu_priority_worker_->pool.reset(new ThreadPool(
-        cpu_priority_nthreads, [this]() {
-          this->CPUWorker(Context(), cpu_priority_worker_.get());
-        }));
+        cpu_priority_nthreads,
+        [this](std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+          this->CPUWorker(Context(), cpu_priority_worker_.get(), ready_event);
+        }, true));
     // GPU tasks will be created lazily
   }
 
@@ -122,9 +123,10 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
           auto ptr =
           cpu_normal_workers_.Get(dev_id, [this, ctx, nthread]() {
               auto blk = new ThreadWorkerBlock<kWorkerQueue>();
-              blk->pool.reset(new ThreadPool(nthread, [this, ctx, blk] () {
-                    this->CPUWorker(ctx, blk);
-                  }));
+              blk->pool.reset(new ThreadPool(nthread,
+                  [this, ctx, blk](std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+                    this->CPUWorker(ctx, blk, ready_event);
+                  }, true));
               return blk;
             });
           if (ptr) {
@@ -259,12 +261,14 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
    */
   template<dmlc::ConcurrentQueueType type>
   inline void CPUWorker(Context ctx,
-                        ThreadWorkerBlock<type> *block) {
+                        ThreadWorkerBlock<type> *block,
+                        std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
     this->is_worker_ = true;
     auto* task_queue = &(block->task_queue);
     RunContext run_ctx{ctx, nullptr};
     // execute task
     OprBlock* opr_block;
+    ready_event->signal();
     while (task_queue->Pop(&opr_block)) {
       this->ExecuteOprBlock(run_ctx, opr_block);
     }
diff --git a/src/initialize.cc b/src/initialize.cc
index 56d6fe1fff..2d077f4908 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -25,37 +25,23 @@
 #include <signal.h>
 #include <dmlc/logging.h>
 #include <mxnet/engine.h>
-
 #include "engine/profiler.h"
 
 namespace mxnet {
-
-void segfault_logger(int sig) {
-  const int MAX_STACK_SIZE = 10;
-  void *stack[MAX_STACK_SIZE];
-
+#if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE
+static void SegfaultLogger(int sig) {
   fprintf(stderr, "\nSegmentation fault: %d\n\n", sig);
-
-#if DMLC_LOG_STACK_TRACE
-  int nframes = backtrace(stack, MAX_STACK_SIZE);
-  fprintf(stderr, "Stack trace returned %d entries:\n", nframes);
-  char **msgs = backtrace_symbols(stack, nframes);
-  if (msgs != nullptr) {
-    for (int i = 0; i < nframes; ++i) {
-      fprintf(stderr, "[bt] (%d) %s\n", i, msgs[i]);
-    }
-  }
-#endif  // DMLC_LOG_STACK_TRACE
-
+  fprintf(stderr, "%s", dmlc::StackTrace().c_str());
   exit(-1);
 }
+#endif
 
 class LibraryInitializer {
  public:
   LibraryInitializer() {
     dmlc::InitLogging("mxnet");
-#if MXNET_USE_SIGNAL_HANDLER
-    signal(SIGSEGV, segfault_logger);
+#if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE
+    signal(SIGSEGV, SegfaultLogger);
 #endif
 #if MXNET_USE_PROFILER
     // ensure profiler's constructor are called before atexit.
diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h
index 49aa001910..f1637c4e57 100644
--- a/src/kvstore/kvstore_dist_server.h
+++ b/src/kvstore/kvstore_dist_server.h
@@ -247,7 +247,12 @@ class KVStoreDistServer {
             NDArray rsp = stored;
             stored.CheckAndAlloc({mshadow::Shape1(recved.shape()[0])});
             mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
-            op::PopulateFullIdxRspImpl(s, &rsp);
+            using namespace mxnet::op;
+            nnvm::dim_t nnr = rsp.shape()[0];
+            MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, {
+              IType* idx = rsp.aux_data(rowsparse::kIdx).dptr<IType>();
+              mxnet_op::Kernel<PopulateFullIdxRspKernel, cpu>::Launch(s, nnr, idx);
+            });
             mshadow::Copy(rsp.data().FlatTo1D<cpu, float>(),
                           recved.data().FlatTo1D<cpu, float>(), s);
             on_complete();
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 4a1963ae21..f09f168977 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -309,30 +309,34 @@ void SetValueOp(const real_t &rhs, NDArray *out) {
   CHECK_NE(out->is_none(), true) << "Set value target must not be empty";
   // important: callback must always capture by value
   NDArray ret = *out;
-  switch (ret.ctx().dev_mask()) {
-    case cpu::kDevMask: {
-      Engine::Get()->PushSync([rhs, ret](RunContext ctx) {
-          CHECK(ret.storage_type() == kDefaultStorage);
-          TBlob tmp = ret.data();
-          ndarray::Eval<cpu>(rhs, &tmp, ctx);
-        }, ret.ctx(), {}, {ret.var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
-      break;
-    }
+  const NDArrayStorageType stype = ret.storage_type();
+  Engine::Get()->PushSync([rhs, ret, stype](RunContext ctx) {
+      TBlob tmp = ret.data();
+      switch (ret.ctx().dev_mask()) {
+        case cpu::kDevMask: {
+          if (stype == kDefaultStorage) {
+            ndarray::Eval<cpu>(rhs, &tmp, ctx);
+          } else {
+            ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+          }
+          break;
+        }
 #if MXNET_USE_CUDA
-    case gpu::kDevMask: {
-      Engine::Get()->PushSync([rhs, ret](RunContext ctx) {
-          TBlob tmp = ret.data();
-          ndarray::Eval<gpu>(rhs, &tmp, ctx);
+        case gpu::kDevMask: {
+          if (stype == kDefaultStorage) {
+            ndarray::Eval<gpu>(rhs, &tmp, ctx);
+          } else {
+            ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+          }
           // Wait GPU kernel to complete
           ctx.get_stream<gpu>()->Wait();
-        }, ret.ctx(), {}, {ret.var()},
-        FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
-      break;
-    }
+          break;
+        }
 #endif
-    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
-  }
+        default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      }
+    }, ret.ctx(), {}, {ret.var()},
+  FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
 }
 
 /*!
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index ef0adbe5f2..552555adf8 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -183,5 +183,18 @@ void ElementwiseSum<cpu>(mshadow::Stream<cpu>* s,
   }
 }
 
+
+template<>
+void Eval<cpu>(mshadow::Stream<cpu> *s,
+               const real_t val, const NDArray& dst) {
+  NDArray temp = dst;
+  const NDArrayStorageType stype = temp.storage_type();
+  if (stype == kRowSparseStorage) {
+    SetValueRspImpl(s, val, &temp);
+  } else {
+    LOG(FATAL) << "Not implemented for storage type " << stype;
+  }
+}
+
 }  // namespace ndarray
 }  // namespace mxnet
diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu
index 445f8459ae..06b5ad46a0 100644
--- a/src/ndarray/ndarray_function.cu
+++ b/src/ndarray/ndarray_function.cu
@@ -203,5 +203,17 @@ void ElementwiseSum<gpu>(mshadow::Stream<gpu>* s,
   }
 }
 
+template<>
+void Eval<gpu>(mshadow::Stream<gpu> *s,
+               const real_t val, const NDArray& dst) {
+  NDArray temp = dst;
+  const NDArrayStorageType stype = temp.storage_type();
+  if (stype == kRowSparseStorage) {
+    SetValueRspImpl(s, val, &temp);
+  } else {
+    LOG(FATAL) << "Not implemented for storage type " << stype;
+  }
+}
+
 }  // namespace ndarray
 }  // namespace mxnet
diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h
index 6e6df3954c..518bb77317 100644
--- a/src/ndarray/ndarray_function.h
+++ b/src/ndarray/ndarray_function.h
@@ -32,6 +32,7 @@
 #include <mxnet/ndarray.h>
 #include <vector>
 #include "../operator/mshadow_op.h"
+#include "../operator/tensor/init_op.h"
 
 namespace mxnet {
 /*! \brief namespace to support all possible Ndarray operator */
@@ -179,6 +180,30 @@ void ElementwiseSum(mshadow::Stream<xpu>* s,
                     const std::vector<NDArray>& nds,
                     NDArray* out);
 
+/*!
+ * \brief Set a row_sparse NDArray with val
+ * \param s - The device stream
+ * \param val - The value to be set
+ * \param dst - NDArray which is to be set to val
+ */
+template<typename xpu>
+void SetValueRspImpl(mshadow::Stream<xpu> *s,
+                     const real_t val, NDArray *dst) {
+  CHECK_EQ(dst->storage_type(), kRowSparseStorage);
+  using namespace mxnet::op;
+  nnvm::dim_t nnr = dst->shape()[0];
+  dst->CheckAndAlloc({mshadow::Shape1(nnr)});
+  MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(rowsparse::kIdx), IType, {
+    IType* idx = dst->aux_data(rowsparse::kIdx).dptr<IType>();
+    mxnet_op::Kernel<PopulateFullIdxRspKernel, xpu>::Launch(s, nnr, idx);
+  });
+  Fill<false>(s, dst->data(), kWriteTo, val);
+}
+
+template<typename xpu>
+void Eval(mshadow::Stream<xpu> *s,
+          const real_t val, const NDArray& dst);
+
 // broadcasting
 template <typename Device>
 void EvalBroadcast(TBlob const& src, TBlob* ret, int size, RunContext ctx);
diff --git a/src/operator/c_lapack_api.h b/src/operator/c_lapack_api.h
index 293c3f2f81..46c8b963f4 100644
--- a/src/operator/c_lapack_api.h
+++ b/src/operator/c_lapack_api.h
@@ -143,19 +143,7 @@ inline char loup(char uplo, bool invert) { return invert ? (uplo == 'U' ? 'L' :
  * \param lda leading dimension of a
  */
 template <typename xpu, typename DType>
-inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda);
-
-template <>
-inline void flip<cpu, float>(int m, int n,
-  float *b, int ldb, float *a, int lda) {
-  for (int i = 0; i < m; ++i)
-    for (int j = 0; j < n; ++j)
-      b[j * ldb + i] = a[i * lda + j];
-}
-
-template <>
-inline void flip<cpu, double>(int m, int n,
-  double *b, int ldb, double *a, int lda) {
+inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
   for (int i = 0; i < m; ++i)
     for (int j = 0; j < n; ++j)
       b[j * ldb + i] = a[i * lda + j];
diff --git a/src/operator/contrib/krprod.cc b/src/operator/contrib/krprod.cc
new file mode 100644
index 0000000000..b5f9117ef3
--- /dev/null
+++ b/src/operator/contrib/krprod.cc
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+/*!
+ *  \file krprod.cc
+ *  \brief Operator registration for Khatri-Rao product
+ *  \author Chris Swierczewski
+ */
+
+#include <mshadow/tensor.h>
+#include <mxnet/op_attr_types.h>
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+#include "../elemwise_op_common.h"
+#include "../../ndarray/ndarray_function.h"
+#include "krprod.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool KhatriRaoShape(
+      const nnvm::NodeAttrs& attrs,
+      std::vector<TShape> *in_attrs,
+      std::vector<TShape> *out_attrs) {
+  CHECK_EQ(out_attrs->size(), 1);
+  CHECK_GE(in_attrs->size(), 1);
+
+  // all input and output matrices must have the same number of rows/columns
+  // (when inputs_transposed is set to true/false)
+  int num_columns = static_cast<int>((*in_attrs)[0][1]);
+  int num_rows = 1;
+  for (const TShape& attr_shape : (*in_attrs)) {
+    CHECK_EQ(num_columns, static_cast<int>(attr_shape[1]));
+    num_rows *= attr_shape[0];
+  }
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, Shape2(num_rows, num_columns));
+  return true;
+}
+
+
+struct KhatriRaoParam : public dmlc::Parameter<KhatriRaoParam> {
+  int num_args;
+  bool row_wise = false;
+  DMLC_DECLARE_PARAMETER(KhatriRaoParam) {
+    DMLC_DECLARE_FIELD(num_args)
+      .set_lower_bound(1)
+      .describe("Number of input matrices.");
+  }
+};
+DMLC_REGISTER_PARAMETER(KhatriRaoParam);
+
+
+NNVM_REGISTER_OP(khatri_rao)
+.describe(R"code(Computes the Khatri-Rao product of the input matrices.
+
+Given a collection of :math:`n` input matrices,
+
+.. math::
+   A_1 \in \mathbb{R}^{M_1 \times M}, \ldots, A_n \in \mathbb{R}^{M_n \times N},
+
+the (column-wise) Khatri-Rao product is defined as the matrix,
+
+.. math::
+   X = A_1 \otimes \cdots \otimes A_n \in \mathbb{R}^{(M_1 \cdots M_n) \times N},
+
+where the :math:`k`th column is equal to the column-wise outer product
+:math:`{A_1}_k \otimes \cdots \otimes {A_n}_k` where :math:`{A_i}_k` is the kth
+column of the ith matrix.
+
+Example::
+
+  >>> A = mx.nd.array([[1, -1],
+  >>>                  [2, -3]])
+  >>> B = mx.nd.array([[1, 4],
+  >>>                  [2, 5],
+  >>>                  [3, 6]])
+  >>> C = mx.nd.khatri_rao(A, B)
+  >>> print(C.asnumpy())
+  [[  1.  -4.]
+   [  2.  -5.]
+   [  3.  -6.]
+   [  2. -12.]
+   [  4. -15.]
+   [  6. -18.]]
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<KhatriRaoParam>)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    uint32_t ret = dmlc::get<KhatriRaoParam>(attrs.parsed).num_args;
+    return ret;
+  })
+.set_num_outputs(1)
+.set_attr<nnvm::FInferShape>("FInferShape", KhatriRaoShape)
+.set_attr<nnvm::FInferType>("FInferType",
+  [](const nnvm::NodeAttrs& attrs,
+     std::vector<int> *in_attrs,
+     std::vector<int> *out_attrs) {
+    return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
+      attrs, in_attrs, out_attrs, -1);
+  })
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    uint32_t num_args = dmlc::get<KhatriRaoParam>(attrs.parsed).num_args;
+    std::vector<std::string> ret;
+    for (uint32_t i = 0; i < num_args; ++i)
+      ret.push_back(std::string("arg") + std::to_string(i));
+    return ret;
+  })
+.set_attr<FCompute>("FCompute<cpu>", KhatriRaoCompute<cpu>)
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("args", "NDArray-or-Symbol[]", "Positional input matrices");
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/krprod.h b/src/operator/contrib/krprod.h
index 90a6179e07..b947410ae4 100644
--- a/src/operator/contrib/krprod.h
+++ b/src/operator/contrib/krprod.h
@@ -21,14 +21,18 @@
  *  Copyright (c) 2017 by Contributors
  *  \file krprod.h
  *  \brief Core function for Khatri-Rao product
- *  \author Jencir Lee
+ *  \author Jencir Lee, Chris Swierczewski
  */
 #ifndef MXNET_OPERATOR_CONTRIB_KRPROD_H_
 #define MXNET_OPERATOR_CONTRIB_KRPROD_H_
+#include <algorithm>
+#include <utility>
 #include <vector>
 #include "mshadow/tensor.h"
+#include "../operator_common.h"
 #include "../c_lapack_api.h"
 
+
 namespace mxnet {
 namespace op {
 
@@ -247,6 +251,40 @@ inline void inv_khatri_rao
     LOG(FATAL) << "The linear solver in inv_prod() returns " << info;
 }
 
+
+template<typename xpu, typename DType>
+inline void KhatriRaoCompute_(const nnvm::NodeAttrs &attrs,
+                              const OpContext &ctx,
+                              const std::vector<TBlob> &in_data,
+                              const std::vector<OpReqType> &req,
+                              const std::vector<TBlob> &out_data) {
+  using namespace mxnet_op;
+  if (req[0] == kNullOp) return;
+
+  Stream<xpu> *stream = ctx.get_stream<xpu>();
+  Tensor<xpu, 2, DType> out = out_data[0].get<xpu, 2, DType>(stream);
+  std::vector<Tensor<xpu, 2, DType> > ts_arr(in_data.size());
+  std::transform(in_data.begin(), in_data.end(), ts_arr.begin(),
+                 [&stream](TBlob blob) -> Tensor<xpu, 2, DType> {
+                   return blob.get<xpu, 2, DType>(stream);
+                 });
+  khatri_rao(out, ts_arr);
+}
+
+
+template<typename xpu>
+inline void KhatriRaoCompute(const nnvm::NodeAttrs &attrs,
+                             const OpContext &ctx,
+                             const std::vector<TBlob> &inputs,
+                             const std::vector<OpReqType> &req,
+                             const std::vector<TBlob> &outputs) {
+  using namespace mxnet_op;
+  CHECK_EQ(outputs.size(), 1U);
+  MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      KhatriRaoCompute_<xpu, DType>(attrs, ctx, inputs, req, outputs);
+  });
+}
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/random/sample_op.h b/src/operator/random/sample_op.h
index 240825bfff..9fdff03886 100644
--- a/src/operator/random/sample_op.h
+++ b/src/operator/random/sample_op.h
@@ -242,44 +242,30 @@ using FSampleCompute = std::function<void (const nnvm::NodeAttrs& attrs,
 
 using mxnet::TBlob;
 
-// Convenience class that transfers a host based scalar into an
-// array on either the host or the device. Needed as
-// the core samplers expect parameters to be tensors located on the
-// appropriate device.
-template<typename xpu>
-Context AllocContext();
-template<>
-MSHADOW_FORCE_INLINE Context AllocContext<cpu>() { return Context::CPU(); }
-template<>
-MSHADOW_FORCE_INLINE Context AllocContext<gpu>() { return Context::GPU(); }
-
+// Allocates a single chunk of workspace memory and partitions it into three
+// workspace tensors that hold the seeds as well as the distribution parameters.
 template<typename xpu, typename DType>
-struct Scalar2Array {
-  Storage::Handle array;
-  Scalar2Array(DType scalar, const OpContext& ctx) {
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    array = Storage::Get()->Alloc(sizeof(DType), AllocContext<xpu>());
-    Tensor<xpu, 1, DType> src(Ref(), Shape1(1), s);
-    Copy(src, Tensor<cpu, 1, DType>(&scalar, Shape1(1)), s);
-  }
-  ~Scalar2Array() {
-    Storage::Get()->Free(array);
-  }
-  DType *Ref() { return static_cast<DType*>(array.dptr); }
-  Tensor<xpu, 1, DType> GetTensor() { return Tensor<xpu, 1, DType>(Ref(), Shape1(1)); }
-};
-
-// Convienience function to generate the required number of seeds for sampling
-template<typename xpu>
-MSHADOW_FORCE_INLINE Tensor<xpu, 1, unsigned int> GetSeeds(index_t N, const OpContext& ctx) {
+MSHADOW_FORCE_INLINE void GetSamplingTempData(index_t N, DType p1, DType p2, const OpContext& ctx,
+                                              Tensor<xpu, 1, unsigned int>* seeds,
+                                              Tensor<xpu, 1, DType>* parm1,
+                                              Tensor<xpu, 1, DType>* parm2) {
   Stream<xpu> *s = ctx.get_stream<xpu>();
   const index_t nSeeds(OptSampleSeedNum<xpu>(N));
-  Tensor<xpu, 1, unsigned int> seeds
-    = ctx.requested[1].get_space_typed<xpu, 1, unsigned int>(Shape1(nSeeds), ctx.get_stream<xpu>());
-  ctx.requested[0].get_random<xpu, float>(s)->GetRandInt(seeds);
-  return seeds;
+  // Combined memory requirement for the workspace data.
+  const index_t nInt(nSeeds + (2 * sizeof(DType) + sizeof(unsigned) - 1) / sizeof(unsigned));
+  Tensor<xpu, 1, unsigned> wspace
+    = ctx.requested[1].get_space_typed<xpu, 1, unsigned>(Shape1(nInt), s);
+  // Partition workspace into three chunks and initialize them.
+  *seeds = Tensor<xpu, 1, unsigned>(wspace.dptr_, Shape1(nSeeds), s);
+  ctx.requested[0].get_random<xpu, float>(s)->GetRandInt(*seeds);
+  DType *pspace = static_cast<DType*>(static_cast<void*>(wspace.dptr_+nSeeds));
+  *parm1 = Tensor<xpu, 1, DType>(pspace, Shape1(1), s);
+  Copy(*parm1, Tensor<cpu, 1, DType>(&p1, Shape1(1)), s);
+  *parm2 = Tensor<xpu, 1, DType>(pspace+1, Shape1(1), s);
+  Copy(*parm2, Tensor<cpu, 1, DType>(&p2, Shape1(1)), s);
 }
 
+
 template<typename xpu, typename Sampler>
 struct SampleMaster;
 
@@ -292,12 +278,14 @@ struct SampleMaster<xpu, UniformSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleUniformParam& param = nnvm::get<SampleUniformParam>(attrs.parsed);
     CHECK_GE(param.high, param.low) << "low must be less or equal to high in uniform distribution";
-    Scalar2Array<xpu, float> low(param.low, ctx), high(param.high, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> low, high;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.low, param.high, ctx,
+                                    &seeds, &low, &high);
     UniformSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(low.GetTensor(), high.GetTensor(), out, seeds, s);
+      sampler.Sample(low, high, out, seeds, s);
     });
   }
 };
@@ -311,12 +299,14 @@ struct SampleMaster<xpu, NormalSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleNormalParam& param = nnvm::get<SampleNormalParam>(attrs.parsed);
     CHECK_GT(param.scale, 0) << "scale parameter in gaussian has to be positive";
-    Scalar2Array<xpu, float> loc(param.loc, ctx), scale(param.scale, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> loc, scale;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.loc, param.scale, ctx,
+                                    &seeds, &loc, &scale);
     NormalSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(loc.GetTensor(), scale.GetTensor(), out, seeds, s);
+      sampler.Sample(loc, scale, out, seeds, s);
     });
   }
 };
@@ -331,12 +321,14 @@ struct SampleMaster<xpu, GammaSampler<xpu>> {
     const SampleGammaParam& param = nnvm::get<SampleGammaParam>(attrs.parsed);
     CHECK_GT(param.alpha, 0) << "alpha parameter in gamma distribution has to be positive";
     CHECK_GT(param.beta, 0) << "beta parameter in gamma distribution has to be positive";
-    Scalar2Array<xpu, float> alpha(param.alpha, ctx), beta(param.beta, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> alpha, beta;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.alpha, param.beta, ctx,
+                                    &seeds, &alpha, &beta);
     GammaSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(alpha.GetTensor(), beta.GetTensor(), out, seeds, s);
+      sampler.Sample(alpha, beta, out, seeds, s);
     });
   }
 };
@@ -350,12 +342,13 @@ struct SampleMaster<xpu, ExponentialSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleExponentialParam& param = nnvm::get<SampleExponentialParam>(attrs.parsed);
     CHECK_GT(param.lam, 0) << "lambda parameter in exponential distribution has to be positive";
-    Scalar2Array<xpu, float> lam(param.lam, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> lam, dummy;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.lam, 0, ctx, &seeds, &lam, &dummy);
     ExponentialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(lam.GetTensor(), out, seeds, s);
+      sampler.Sample(lam, out, seeds, s);
     });
   }
 };
@@ -369,12 +362,13 @@ struct SampleMaster<xpu, PoissonSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SamplePoissonParam& param = nnvm::get<SamplePoissonParam>(attrs.parsed);
     CHECK_GE(param.lam, 0) << "lambda parameter in poisson distribution has to be non-negative";
-    Scalar2Array<xpu, float> lam(param.lam, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> lam, dummy;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.lam, 0, ctx, &seeds, &lam, &dummy);
     PoissonSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(lam.GetTensor(), out, seeds, s);
+      sampler.Sample(lam, out, seeds, s);
     });
   }
 };
@@ -389,12 +383,13 @@ struct SampleMaster<xpu, NegativeBinomialSampler<xpu>> {
     const SampleNegBinomialParam& param = nnvm::get<SampleNegBinomialParam>(attrs.parsed);
     CHECK_GE(param.k, 0) << "k parameter in negative binomial distribution has to be non-negative";
     CHECK_GE(param.p, 0) << "p parameter in negative binomial distribution has to be non-negative";
-    Scalar2Array<xpu, float> k(param.k, ctx), p(param.p, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> k, p;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.k, param.p, ctx, &seeds, &k, &p);
     NegativeBinomialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(k.GetTensor(), p.GetTensor(), out, seeds, s);
+      sampler.Sample(k, p, out, seeds, s);
     });
   }
 };
@@ -411,12 +406,14 @@ struct SampleMaster<xpu, GeneralizedNegativeBinomialSampler<xpu>> {
       << "mu parameter in generalized negative binomial distribution has to be non-negative";
     CHECK_GE(param.alpha, 0)
       << "alpha parameter in generalized negative binomial distribution has to be non-negative";
-    Scalar2Array<xpu, float> mu(param.mu, ctx), alpha(param.alpha, ctx);
-    Tensor<xpu, 1, unsigned int> seeds(GetSeeds<xpu>(outputs->Size(), ctx));
+    Tensor<xpu, 1, unsigned int> seeds;
+    Tensor<xpu, 1, float> mu, alpha;
+    GetSamplingTempData<xpu, float>(outputs->Size(), param.mu, param.alpha, ctx,
+                                    &seeds, &mu, &alpha);
     GeneralizedNegativeBinomialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(mu.GetTensor(), alpha.GetTensor(), out, seeds, s);
+      sampler.Sample(mu, alpha, out, seeds, s);
     });
   }
 };
@@ -428,13 +425,17 @@ void SampleComputeEx_(const nnvm::NodeAttrs& attrs,
                       const std::vector<OpReqType>& req,
                       const std::vector<NDArray>& outputs,
                       SampleMaster<xpu, Sampler> sample_master) {
+  using namespace mxnet::op;
   NDArray output = outputs[0];
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   if (output.storage_type() == kRowSparseStorage) {
     // indices
     nnvm::dim_t nnr = output.shape()[0];
     output.CheckAndAlloc({mshadow::Shape1(nnr)});
-    PopulateFullIdxRspImpl(s, &output);
+    MSHADOW_IDX_TYPE_SWITCH(output.aux_type(rowsparse::kIdx), IType, {
+      IType* idx = output.aux_data(rowsparse::kIdx).dptr<IType>();
+      mxnet_op::Kernel<PopulateFullIdxRspKernel, xpu>::Launch(s, nnr, idx);
+    });
     // data
     TBlob out_blob = output.data();
     sample_master.op(attrs, ctx, req[0], &out_blob);
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 916c385467..079a33e875 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -442,7 +442,8 @@ The storage type of ``rint`` output depends upon the input storage type:
    - rint(default) = default
    - rint(row_sparse) = row_sparse
 
-)code" ADD_FILELINE);
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
 // ceil
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP(ceil, cpu, mshadow_op::ceil)
@@ -460,7 +461,8 @@ The storage type of ``ceil`` output depends upon the input storage type:
    - ceil(default) = default
    - ceil(row_sparse) = row_sparse
 
-)code" ADD_FILELINE);
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
 // floor
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP(floor, cpu, mshadow_op::floor)
@@ -478,7 +480,8 @@ The storage type of ``floor`` output depends upon the input storage type:
    - floor(default) = default
    - floor(row_sparse) = row_sparse
 
-)code" ADD_FILELINE);
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
 // trunc
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP(trunc, cpu, mshadow_op::trunc)
@@ -497,7 +500,8 @@ The storage type of ``trunc`` output depends upon the input storage type:
    - trunc(default) = default
    - trunc(row_sparse) = row_sparse
 
-)code" ADD_FILELINE);
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
 // fix
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP(fix, cpu, mshadow_op::fix)
@@ -514,7 +518,8 @@ The storage type of ``fix`` output depends upon the input storage type:
    - fix(default) = default
    - fix(row_sparse) = row_sparse
 
-)code" ADD_FILELINE);
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
 
 // square
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square)
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index 7d885ad473..735da31b8b 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -29,7 +29,7 @@ namespace mxnet {
 namespace op {
 
 template<>
-void SparseEmbeddingOpForwardRspImpl<cpu>(mshadow::Stream<cpu>* s,
+void SparseEmbeddingOpForwardRspImpl<cpu>(const OpContext& ctx,
                                           const TBlob& data,
                                           const NDArray& weight,
                                           const OpReqType req,
@@ -37,6 +37,7 @@ void SparseEmbeddingOpForwardRspImpl<cpu>(mshadow::Stream<cpu>* s,
   if (req == kNullOp) return;
   using namespace rowsparse;
   using namespace mxnet_op;
+  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
   // zeros weight
   if (req == kWriteTo && !weight.storage_initialized()) {
     size_t out_size = output.shape_.Size();
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index f029f02099..4021f2b3a2 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -61,7 +61,7 @@ struct AddTakeGradRspGPUKernel {
 };
 
 template<>
-void SparseEmbeddingOpForwardRspImpl<gpu>(mshadow::Stream<gpu>* s,
+void SparseEmbeddingOpForwardRspImpl<gpu>(const OpContext& ctx,
                                           const TBlob& data,
                                           const NDArray& weight,
                                           const OpReqType req,
@@ -69,6 +69,7 @@ void SparseEmbeddingOpForwardRspImpl<gpu>(mshadow::Stream<gpu>* s,
   if (req == kNullOp) return;
   using namespace rowsparse;
   using namespace mxnet_op;
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
   // zeros weight
   if (req == kWriteTo && !weight.storage_initialized()) {
     size_t out_size = output.shape_.Size();
@@ -85,8 +86,9 @@ void SparseEmbeddingOpForwardRspImpl<gpu>(mshadow::Stream<gpu>* s,
     DType max = static_cast<DType>(weight.shape()[0] - 1);
     DType* data_ptr = data.dptr<DType>();
     size_t data_size = data.shape_.Size();
-    int32_t* is_valid_ptr = NULL;
-    CUDA_CALL(cudaMalloc(&is_valid_ptr, sizeof(int32_t)));
+    Tensor<gpu, 1, char> workspace = ctx.requested[0]
+        .get_space_typed<gpu, 1, char>(Shape1(sizeof(int32_t)), s);
+    int32_t* is_valid_ptr = reinterpret_cast<int32_t*>(workspace.dptr_);
     Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
     Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
     CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(int32_t),
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index b0f06de9ae..4043e76cfd 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -364,7 +364,7 @@ inline void EmbeddingOpForwardRspImpl(mshadow::Stream<xpu>* s,
 
 // Embedding forward implementation with row_sparse weight
 template<typename xpu>
-void SparseEmbeddingOpForwardRspImpl(mshadow::Stream<xpu>* s,
+void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx,
                                      const TBlob& data,
                                      const NDArray& weight,
                                      const OpReqType req,
@@ -406,10 +406,9 @@ void SparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs,
   const auto data_stype = data.storage_type();
   const auto weight_stype = weight.storage_type();
   const auto out_stype = out.storage_type();
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   if (data_stype == kDefaultStorage && weight_stype == kRowSparseStorage &&
       out_stype == kDefaultStorage) {
-    SparseEmbeddingOpForwardRspImpl<xpu>(s, data.data(), weight, req[0], out.data());
+    SparseEmbeddingOpForwardRspImpl<xpu>(ctx, data.data(), weight, req[0], out.data());
   } else {
     LOG(FATAL) << "Not implemented: " << operator_string(attrs, ctx, inputs, req, outputs);
   }
diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu
index aeea2895b0..37660e1d36 100644
--- a/src/operator/tensor/init_op.cu
+++ b/src/operator/tensor/init_op.cu
@@ -43,6 +43,7 @@ void FillZerosCsrImpl(mshadow::Stream<mshadow::gpu> *s, const NDArray& dst) {
   });
 }
 
+
 NNVM_REGISTER_OP(_zeros)
 .set_attr<FCompute>("FCompute<gpu>", FillCompute<gpu, 0>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", FillComputeZerosEx<gpu>);
diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h
index 4d899704a1..3f5014d8ca 100644
--- a/src/operator/tensor/init_op.h
+++ b/src/operator/tensor/init_op.h
@@ -291,19 +291,6 @@ inline void FillDnsZerosRspImpl(mshadow::Stream<xpu> *s, NDArray *dst) {
   });
 }
 
-// Fill full indices NDArray with zeros by updating the aux shape.
-template<typename xpu>
-void PopulateFullIdxRspImpl(mshadow::Stream<xpu> *s, NDArray *dst) {
-  using namespace rowsparse;
-  CHECK_EQ(dst->storage_type(), kRowSparseStorage);
-  nnvm::dim_t nnr = dst->shape()[0];
-  dst->CheckAndAllocAuxData(kIdx, mshadow::Shape1(nnr));
-  MSHADOW_IDX_TYPE_SWITCH(dst->aux_type(kIdx), IType, {
-    IType* idx = dst->aux_data(kIdx).dptr<IType>();
-    mxnet_op::Kernel<PopulateFullIdxRspKernel, xpu>::Launch(s, nnr, idx);
-  });
-}
-
 /*!
  * \brief Fill a rsp NDArray with zeros by updating the aux shape.
  * \tparam xpu - cpu or gpu
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 367f8de053..51cffb1f26 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -397,9 +397,7 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   const auto& in_stype = in_attrs->at(0);
   auto& out_stype = out_attrs->at(0);
   bool dispatched = false;
-  const bool invalid_ctx = dev_mask != mshadow::cpu::kDevMask;
-  const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback :
-                                         DispatchMode::kFComputeEx;
+  const auto dispatch_ex = DispatchMode::kFComputeEx;
   // If step = 1, no need to fallback; otherwise fallback to dense
   bool trivial_step = false;
   if (param.step.ndim() == 0U) {
@@ -452,7 +450,6 @@ void SliceCsrIndPtrImpl(const int begin, const int end, RunContext ctx,
 
 /*
  * Slice a CSR NDArray for first dimension
- * Only implemented for CPU
  */
 template<typename xpu>
 void SliceDimOneCsrImpl(const TShape &begin, const TShape &end, const OpContext& ctx,
@@ -460,7 +457,6 @@ void SliceDimOneCsrImpl(const TShape &begin, const TShape &end, const OpContext&
   using namespace mshadow;
   using namespace mxnet_op;
   using namespace csr;
-  CHECK((std::is_same<xpu, cpu>::value)) << "SliceDimOneCsrImpl is only implemented for CPU";
   nnvm::dim_t begin_row = begin[0];
   nnvm::dim_t end_row = end[0];
   nnvm::dim_t indptr_len = end_row - begin_row + 1;
@@ -471,10 +467,13 @@ void SliceDimOneCsrImpl(const TShape &begin, const TShape &end, const OpContext&
       MSHADOW_TYPE_SWITCH(in.dtype(), DType, {
         RType* in_indptr = in.aux_data(kIndPtr).dptr<RType>();
         RType* out_indptr = out.aux_data(kIndPtr).dptr<RType>();
-        SliceCsrIndPtrImpl<cpu, RType>(begin_row, end_row, ctx.run_ctx, in_indptr, out_indptr);
+        SliceCsrIndPtrImpl<xpu, RType>(begin_row, end_row, ctx.run_ctx, in_indptr, out_indptr);
 
-        // retrieve nnz (CPU implementation)
-        int nnz = out_indptr[indptr_len - 1];
+        Stream<xpu> *s = ctx.get_stream<xpu>();
+
+        RType nnz = 0;
+        mshadow::Copy(Tensor<cpu, 1, RType>(&nnz, Shape1(1)),
+                      Tensor<xpu, 1, RType>(out_indptr + indptr_len - 1, Shape1(1), s));
         // return csr zeros if nnz = 0
         if (nnz == 0) {
           out.set_aux_shape(kIdx, Shape1(0));
@@ -487,10 +486,15 @@ void SliceDimOneCsrImpl(const TShape &begin, const TShape &end, const OpContext&
         IType* out_idx = out.aux_data(kIdx).dptr<IType>();
         DType* in_data = in.data().dptr<DType>();
         DType* out_data = out.data().dptr<DType>();
-        int offset = in_indptr[begin_row];
-        // this is also a CPU-only implementation
-        memcpy(out_idx, in_idx + offset, nnz * sizeof(IType));
-        memcpy(out_data, in_data + offset, nnz * sizeof(DType));
+
+        RType offset = 0;
+        mshadow::Copy(Tensor<cpu, 1, RType>(&offset, Shape1(1)),
+                      Tensor<xpu, 1, RType>(in_indptr + begin_row, Shape1(1), s));
+
+        mshadow::Copy(Tensor<xpu, 1, IType>(out_idx, Shape1(nnz), s),
+                      Tensor<xpu, 1, IType>(in_idx + offset, Shape1(nnz), s), s);
+        mshadow::Copy(Tensor<xpu, 1, DType>(out_data, Shape1(nnz), s),
+                      Tensor<xpu, 1, DType>(in_data + offset, Shape1(nnz), s), s);
       });
     });
   });
@@ -535,69 +539,15 @@ struct SliceDimTwoCsrAssign {
 
 /*
  * Slice a CSR NDArray for two dimensions
- * Only implemented for CPU
  */
 template<typename xpu>
 void SliceDimTwoCsrImpl(const TShape &begin, const TShape &end, const OpContext& ctx,
-                        const NDArray &in, const NDArray &out) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  using namespace csr;
-  CHECK((std::is_same<xpu, cpu>::value)) << "SliceDimTwoCsrImpl is only implemented for CPU";
-  nnvm::dim_t begin_row = begin[0], end_row = end[0];
-  nnvm::dim_t begin_col = begin[1], end_col = end[1];
-  nnvm::dim_t indptr_len = end_row - begin_row + 1;
-  out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len));
-  // assume idx indptr share the same type
-  MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, {
-    MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, {
-      MSHADOW_TYPE_SWITCH(in.dtype(), DType, {
-        RType *in_indptr = in.aux_data(kIndPtr).dptr<RType>();
-        IType *in_idx = in.aux_data(kIdx).dptr<IType>();
-        DType *in_data = in.data().dptr<DType>();
-        // retrieve nnz (CPU implementation)
-        RType *out_indptr = out.aux_data(kIndPtr).dptr<RType>();
-        int nnz = 0;
-        out_indptr[0] = 0;
-        // loop through indptr array and corresponding indices to count for nnz
-        for (nnvm::dim_t i = 0; i < indptr_len - 1; i++) {
-          out_indptr[i+1] = out_indptr[i];
-          for (RType j = in_indptr[i + begin_row];
-               j < in_indptr[i + begin_row + 1]; j++) {
-            // indices of CSRNDArray are in ascending order per row
-            if (in_idx[j] >= end_col) {
-              break;
-            } else if (in_idx[j] >= begin_col) {
-              out_indptr[i+1]++;
-              nnz++;
-            }
-          }
-        }
-        // returns zeros in csr format if nnz = 0
-        if (nnz == 0) {
-          out.set_aux_shape(kIdx, Shape1(0));
-          return;
-        }
-        out.CheckAndAllocAuxData(kIdx, Shape1(nnz));
-        out.CheckAndAllocData(Shape1(nnz));
-        IType *out_idx = out.aux_data(kIdx).dptr<IType>();
-        DType *out_data = out.data().dptr<DType>();
-
-        Stream<xpu> *s = ctx.get_stream<xpu>();
-        Kernel<SliceDimTwoCsrAssign, xpu>::Launch(s, indptr_len - 1, out_idx, out_data,
-                                                  out_indptr, in_idx, in_data,
-                                                  in_indptr + begin_row,
-                                                  begin_col, end_col);
-      });
-    });
-  });
-}
+                        const NDArray &in, const NDArray &out);
 
 
 template<typename xpu>
 void SliceCsrImpl(const SliceParam &param, const OpContext& ctx,
                   const NDArray &in, OpReqType req, const NDArray &out) {
-  CHECK((std::is_same<xpu, cpu>::value)) << "Slice for CSR input only implemented for CPU";
   if (req == kNullOp) return;
   CHECK_NE(req, kAddTo) << "kAddTo for Slice on CSR input is not supported";
   CHECK_NE(req, kWriteInplace) << "kWriteInplace for Slice on CSR input is not supported";
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 8f36e35d27..e8fdce4914 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -28,6 +28,64 @@
 
 namespace mxnet {
 namespace op {
+
+
+template<>
+void SliceDimTwoCsrImpl<cpu>(const TShape &begin, const TShape &end, const OpContext& ctx,
+                             const NDArray &in, const NDArray &out) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace csr;
+  nnvm::dim_t begin_row = begin[0], end_row = end[0];
+  nnvm::dim_t begin_col = begin[1], end_col = end[1];
+  nnvm::dim_t indptr_len = end_row - begin_row + 1;
+  out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len));
+  // assume idx indptr share the same type
+  MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, {
+    MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, {
+      MSHADOW_TYPE_SWITCH(in.dtype(), DType, {
+        RType *in_indptr = in.aux_data(kIndPtr).dptr<RType>();
+        IType *in_idx = in.aux_data(kIdx).dptr<IType>();
+        DType *in_data = in.data().dptr<DType>();
+        // retrieve nnz (CPU implementation)
+        RType *out_indptr = out.aux_data(kIndPtr).dptr<RType>();
+        int nnz = 0;
+        out_indptr[0] = 0;
+        // loop through indptr array and corresponding indices to count for nnz
+        for (nnvm::dim_t i = 0; i < indptr_len - 1; i++) {
+          out_indptr[i+1] = out_indptr[i];
+          for (RType j = in_indptr[i + begin_row];
+               j < in_indptr[i + begin_row + 1]; j++) {
+            // indices of CSRNDArray are in ascending order per row
+            if (in_idx[j] >= end_col) {
+              break;
+            } else if (in_idx[j] >= begin_col) {
+              out_indptr[i+1]++;
+              nnz++;
+            }
+          }
+        }
+        // returns zeros in csr format if nnz = 0
+        if (nnz == 0) {
+          out.set_aux_shape(kIdx, Shape1(0));
+          return;
+        }
+        out.CheckAndAllocAuxData(kIdx, Shape1(nnz));
+        out.CheckAndAllocData(Shape1(nnz));
+        IType *out_idx = out.aux_data(kIdx).dptr<IType>();
+        DType *out_data = out.data().dptr<DType>();
+
+        Stream<cpu> *s = ctx.get_stream<cpu>();
+        Kernel<SliceDimTwoCsrAssign, cpu>::Launch(s, indptr_len - 1, out_idx, out_data,
+                                                  out_indptr, in_idx, in_data,
+                                                  in_indptr + begin_row,
+                                                  begin_col, end_col);
+      });
+    });
+  });
+}
+
+
 DMLC_REGISTER_PARAMETER(ReshapeParam);
 DMLC_REGISTER_PARAMETER(TransposeParam);
 DMLC_REGISTER_PARAMETER(ExpandDimParam);
@@ -298,6 +356,10 @@ Example::
 .set_attr_parser(ParamParser<SliceParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", SliceOpShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
 .set_attr<FInferStorageType>("FInferStorageType", SliceForwardInferStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_slice"})
 .set_attr<FCompute>("FCompute<cpu>", SliceOpForward<cpu>)
diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu
index 30eaf23b10..b6597be7e3 100644
--- a/src/operator/tensor/matrix_op.cu
+++ b/src/operator/tensor/matrix_op.cu
@@ -22,11 +22,121 @@
  * \file matrix_op.cu
  * \brief GPU Implementation of matrix operations
  */
+#include <cub/cub.cuh>
 #include "./matrix_op-inl.h"
 #include "./elemwise_unary_op.h"
 
+
 namespace mxnet {
 namespace op {
+
+/*!
+ * \brief Compute the number of elements of every row.
+ */
+struct SliceMarkCsrIndPtr {
+  /*! 
+   * \brief
+   * \param i           the i-th row of the output csr ndarray
+   * \param prefix_sum  indptr array of the output csr ndarray
+   * \param in_idx      indices array of the input csr ndarray
+   * \param in_indptr   indptr array of the input csr ndarray
+   * \param begin_col   starting indice
+   * \param end_col     ending indice
+   */
+  template<typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  RType* prefix_sum,
+                                  const IType* in_idx,
+                                  const RType* in_indptr,
+                                  const int begin_col, const int end_col) {
+    if (i == 0) {
+      prefix_sum[0] = 0;
+    }
+    RType size = 0;
+    for (RType j = in_indptr[i]; j < in_indptr[i+1]; j++) {
+      // indices of CSRNDArray are in ascending order per row
+      if (in_idx[j] >= end_col) {
+        break;
+      } else if (in_idx[j] >= begin_col) {
+        size++;
+      }
+    }
+    prefix_sum[i+1] = size;
+  }
+};
+
+
+template<>
+void SliceDimTwoCsrImpl<gpu>(const TShape &begin, const TShape &end, const OpContext& ctx,
+                             const NDArray &in, const NDArray &out) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace csr;
+
+  Stream<gpu> *s = ctx.get_stream<gpu>();
+
+  nnvm::dim_t begin_row = begin[0], end_row = end[0];
+  nnvm::dim_t begin_col = begin[1], end_col = end[1];
+  nnvm::dim_t indptr_len = end_row - begin_row + 1;
+  out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len));
+  // assume idx indptr share the same type
+  MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, {
+    MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, {
+      MSHADOW_TYPE_SWITCH(in.dtype(), DType, {
+        RType *in_indptr = in.aux_data(kIndPtr).dptr<RType>();
+        IType *in_idx = in.aux_data(kIdx).dptr<IType>();
+        DType *in_data = in.data().dptr<DType>();
+
+        RType *out_indptr = out.aux_data(kIndPtr).dptr<RType>();
+
+        Kernel<SliceMarkCsrIndPtr, gpu>::Launch(s, indptr_len - 1,
+                                                out_indptr,
+                                                in_idx,
+                                                in_indptr + begin_row,
+                                                begin_col, end_col);
+        void* d_temp_storage = NULL;
+        size_t temp_storage_bytes = 0;
+        cub::DeviceScan::InclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      out_indptr,
+                                      out_indptr,
+                                      indptr_len,
+                                      Stream<gpu>::GetStream(s));
+        Tensor<gpu, 1, char> workspace = ctx.requested[0]
+            .get_space_typed<gpu, 1, char>(Shape1(temp_storage_bytes), s);
+        d_temp_storage = workspace.dptr_;
+
+        cub::DeviceScan::InclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      out_indptr,
+                                      out_indptr,
+                                      indptr_len,
+                                      Stream<gpu>::GetStream(s));
+        // retrieve nnr
+        RType nnr = 0;
+        CUDA_CALL(cudaMemcpy(&nnr, &out_indptr[indptr_len-1], sizeof(RType),
+            cudaMemcpyDeviceToHost));
+
+        // returns zeros in csr format if nnr = 0
+        if (nnr == 0) {
+          out.set_aux_shape(kIdx, Shape1(0));
+          return;
+        }
+        out.CheckAndAllocAuxData(kIdx, Shape1(nnr));
+        out.CheckAndAllocData(Shape1(nnr));
+        IType *out_idx = out.aux_data(kIdx).dptr<IType>();
+        DType *out_data = out.data().dptr<DType>();
+
+        Kernel<SliceDimTwoCsrAssign, gpu>::Launch(s, indptr_len - 1, out_idx, out_data,
+                                                  out_indptr, in_idx, in_data,
+                                                  in_indptr + begin_row,
+                                                  begin_col, end_col);
+      });
+    });
+  });
+}
+
+
 NNVM_REGISTER_OP(Reshape)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::IdentityCompute<gpu>);
 
@@ -40,7 +150,8 @@ NNVM_REGISTER_OP(expand_dims)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::IdentityCompute<gpu>);
 
 NNVM_REGISTER_OP(slice)
-.set_attr<FCompute>("FCompute<gpu>", SliceOpForward<gpu>);
+.set_attr<FCompute>("FCompute<gpu>", SliceOpForward<gpu>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SliceEx<gpu>);
 
 NNVM_REGISTER_OP(_backward_slice)
 .set_attr<FCompute>("FCompute<gpu>", SliceOpBackward<gpu>);
diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h
index a052ad96cf..fcc0215c12 100644
--- a/src/operator/tensor/square_sum-inl.h
+++ b/src/operator/tensor/square_sum-inl.h
@@ -53,18 +53,15 @@ inline bool SquareSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   const auto& in_stype = in_attrs->at(0);
   auto& out_stype = out_attrs->at(0);
   bool dispatched = false;
-  // current impl is only available on cpu
-  if (dev_mask == mshadow::cpu::kDevMask) {
-    if (!dispatched && in_stype == kRowSparseStorage && param.axis[0] == 1 && param.keepdims) {
-      // sum per row and keep dims
-      dispatched = storage_type_assign(&out_stype, kRowSparseStorage,
+  if (!dispatched && in_stype == kRowSparseStorage && param.axis[0] == 1 && param.keepdims) {
+    // sum per row and keep dims
+    dispatched = storage_type_assign(&out_stype, kRowSparseStorage,
+                                     dispatch_mode, DispatchMode::kFComputeEx);
+  }
+  if (!dispatched && in_stype == kRowSparseStorage &&
+      (param.axis[0] == 0 || (param.axis[0] == 1 && !param.keepdims))) {
+      dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                        dispatch_mode, DispatchMode::kFComputeEx);
-    }
-    if (!dispatched && in_stype == kRowSparseStorage &&
-        (param.axis[0] == 0 || (param.axis[0] == 1 && !param.keepdims))) {
-        dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                         dispatch_mode, DispatchMode::kFComputeEx);
-    }
   }
   if (!dispatched) {
     // nothing to fallback on
@@ -86,13 +83,10 @@ inline bool SquareSumBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
   const auto& in_stype = in_attrs->at(1);
   auto& grad_stype = out_attrs->at(0);
   bool dispatched = false;
-  // only implemented on cpu
-  if (dev_mask == mshadow::cpu::kDevMask) {
-    if (!dispatched && (ograd_stype == kDefaultStorage || ograd_stype == kRowSparseStorage) &&
-        in_stype == kRowSparseStorage) {
-      dispatched = storage_type_assign(&grad_stype, kRowSparseStorage,
-                                       dispatch_mode, DispatchMode::kFComputeEx);
-    }
+  if (!dispatched && (ograd_stype == kDefaultStorage || ograd_stype == kRowSparseStorage) &&
+      in_stype == kRowSparseStorage) {
+    dispatched = storage_type_assign(&grad_stype, kRowSparseStorage,
+                                     dispatch_mode, DispatchMode::kFComputeEx);
   }
   if (!dispatched) {
     // nothing to fallback on
@@ -359,6 +353,25 @@ void SquareSumRspImpl(const nnvm::NodeAttrs& attrs,
   }
 }
 
+/*!
+ * \brief check the indices of ograd and input are the same.
+ */
+struct CheckSameIdxKernel {
+  template<typename IType>
+  MSHADOW_XINLINE static void Map(int i, IType* ograd_idx,
+                                  IType* in_idx, int32_t* is_diff) {
+    if (ograd_idx[i] != in_idx[i]){
+      *is_diff = 1;
+    }
+  }
+};
+
+
+template<typename xpu>
+void CheckSameIdx(const OpContext& ctx,
+                  const TBlob& ograd_row_idx,
+                  const TBlob& in_row_idx);
+
 /*!\brief
  * This function only supports the following three situations:
  * 1. ograd is a dns and input is an rsp
@@ -367,7 +380,7 @@ void SquareSumRspImpl(const nnvm::NodeAttrs& attrs,
  */
 template<typename xpu>
 void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
-                          mshadow::Stream<xpu>* s,
+                          const OpContext& ctx,
                           const NDArray& ograd,
                           const NDArray& input,
                           const OpReqType req,
@@ -381,6 +394,7 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(input.storage_type(), kRowSparseStorage);
   CHECK_EQ(igrad->storage_type(), kRowSparseStorage);
   CHECK_EQ(req, kWriteTo);
+  mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
   if (!input.storage_initialized()
       || (ograd.storage_type() == kRowSparseStorage && !ograd.storage_initialized())) {
     FillZerosRspImpl(s, *igrad);
@@ -429,28 +443,16 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
     const TBlob& igrad_data = igrad->data();
     const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx);
     MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, {
-      if (std::is_same<xpu, cpu>::value) {
-        // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp
-        // ograd_row_idx and in_row_idx are expected to have the same elements
-        if (in_row_idx.Size() != input.shape()[0]) {  // if input data is not a full rsp
-          CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports"
-                                                               " equal ograd_row_idx and"
-                                                               " input_row_idx when ograd and"
-                                                               " input are both row-sparse and"
-                                                               " input data is not a full"
-                                                               " row-sparse matrix";
-          const IType* first1 = ograd_row_idx.dptr<IType>();
-          const IType* last1 = first1 + ograd_row_idx.Size();
-          const IType* first2 = in_row_idx.dptr<IType>();
-          CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports"
-                                                      " equal ograd_row_idx and input_row_idx"
-                                                      " when ograd and input are both"
-                                                      " row-sparse and input data is not a full"
-                                                      " row-sparse matrix";
-        }
-      } else {
-        LOG(FATAL) << "SquareSumRspGradImpl has not implemented GPU version when"
-                      " ograd and input are both row-sparse";
+      // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp
+      // ograd_row_idx and in_row_idx are expected to have the same elements
+      if (in_row_idx.Size() != input.shape()[0]) {  // if input data is not a full rsp
+        CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports"
+                                                             " equal ograd_row_idx and"
+                                                             " input_row_idx when ograd and"
+                                                             " input are both row-sparse and"
+                                                             " input data is not a full"
+                                                             " row-sparse matrix";
+        CheckSameIdx<xpu>(ctx, ograd_row_idx, in_row_idx);
       }
       MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, {
         MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
@@ -504,7 +506,6 @@ void SquareSumOpBackwardEx(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
-  mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
   const NDArrayStorageType ograd_stype = inputs[0].storage_type();
   const NDArrayStorageType input_stype = inputs[1].storage_type();
   if (input_stype == kRowSparseStorage &&
@@ -512,7 +513,7 @@ void SquareSumOpBackwardEx(const nnvm::NodeAttrs& attrs,
     CHECK_EQ(inputs[1].shape().ndim(), 2U) << "_square_sum op only supports"
                                               " 2D ndarray as input";
     NDArray output = outputs[0];
-    SquareSumRspGradImpl(attrs, s, inputs[0], inputs[1], req[0], &output);
+    SquareSumRspGradImpl<xpu>(attrs, ctx, inputs[0], inputs[1], req[0], &output);
   } else {
     LOG(FATAL) << "Not implemented: " << operator_string(attrs, ctx, inputs, req, outputs);
   }
diff --git a/src/operator/tensor/square_sum.cc b/src/operator/tensor/square_sum.cc
index e4b49d7f7f..af365bae05 100644
--- a/src/operator/tensor/square_sum.cc
+++ b/src/operator/tensor/square_sum.cc
@@ -25,6 +25,28 @@
 
 namespace mxnet {
 namespace op {
+
+template<>
+void CheckSameIdx<cpu>(const OpContext& ctx,
+                       const TBlob& ograd_row_idx,
+                       const TBlob& in_row_idx) {
+  MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, {
+    mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+    const IType* ograd_idx = ograd_row_idx.dptr<IType>();
+    const IType* in_idx = in_row_idx.dptr<IType>();
+    const nnvm::dim_t idx_size = ograd_row_idx.Size();
+    int32_t is_different = 0;
+    mxnet_op::Kernel<CheckSameIdxKernel, cpu>::Launch(s, idx_size,
+      ograd_idx, in_idx, &is_different);
+    CHECK_EQ(is_different, 0) << "SquareSumRspGradImpl only supports"
+                                 " equal ograd_row_idx and input_row_idx"
+                                 " when ograd and input are both"
+                                 " row-sparse and input data is not a full"
+                                 " row-sparse matrix";
+  })
+}
+
+
 MXNET_OPERATOR_REGISTER_REDUCE(_square_sum)
 .describe(R"code(Computes the square sum of array elements over a given axis
 for row-sparse matrix. This is a temporary solution for fusing ops square and
@@ -45,6 +67,10 @@ Example::
 
 MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_square_sum)
 .set_num_inputs(2)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
 .set_attr<FInferStorageType>("FInferStorageType", SquareSumBackwardInferStorageType)
 .set_attr<FComputeEx>("FComputeEx<cpu>", SquareSumOpBackwardEx<cpu>);
 
diff --git a/src/operator/tensor/square_sum.cu b/src/operator/tensor/square_sum.cu
new file mode 100644
index 0000000000..0b40786dbd
--- /dev/null
+++ b/src/operator/tensor/square_sum.cu
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file square_sum.cu
+ * \brief GPU Implementation of square_sum op.
+ */
+#include "./square_sum-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+void CheckSameIdx<gpu>(const OpContext& ctx,
+                       const TBlob& ograd_row_idx,
+                       const TBlob& in_row_idx) {
+  MSHADOW_IDX_TYPE_SWITCH(ograd_row_idx.type_flag_, IType, {
+    mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+    const IType* ograd_idx = ograd_row_idx.dptr<IType>();
+    const IType* in_idx = in_row_idx.dptr<IType>();
+    const nnvm::dim_t idx_size = ograd_row_idx.Size();
+    int32_t is_diff = 0;
+    mshadow::Tensor<gpu, 1, char> workspace = ctx.requested[0]
+        .get_space_typed<gpu, 1, char>(mshadow::Shape1(sizeof(int32_t)), s);
+    int32_t* is_diff_ptr = reinterpret_cast<int32_t*>(workspace.dptr_);
+    mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, 1, is_diff_ptr);
+    mxnet_op::Kernel<CheckSameIdxKernel, gpu>::Launch(s, idx_size,
+      ograd_idx, in_idx, is_diff_ptr);
+    CUDA_CALL(cudaMemcpy(&is_diff, is_diff_ptr, sizeof(int32_t), cudaMemcpyDeviceToHost));
+    CHECK_EQ(is_diff, 0) << "SquareSumRspGradImpl only supports"
+                            " equal ograd_row_idx and input_row_idx"
+                            " when ograd and input are both"
+                            " row-sparse and input data is not a full"
+                            " row-sparse matrix";
+  })
+}
+
+
+NNVM_REGISTER_OP(_square_sum)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SquareSumOpForwardEx<gpu>);
+
+NNVM_REGISTER_OP(_backward_square_sum)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SquareSumOpBackwardEx<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/storage/cpu_shared_storage_manager.h b/src/storage/cpu_shared_storage_manager.h
index 9f0f2a354d..98f706b802 100644
--- a/src/storage/cpu_shared_storage_manager.h
+++ b/src/storage/cpu_shared_storage_manager.h
@@ -31,6 +31,9 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#else
+#include <Windows.h>
+#include <process.h>
 #endif  // _WIN32
 
 #include <unordered_map>
@@ -64,6 +67,9 @@ class CPUSharedStorageManager final : public StorageManager {
     for (const auto& kv : pool_) {
       FreeImpl(kv.second);
     }
+#ifdef _WIN32
+    CheckAndRealFree();
+#endif
   }
 
   void Alloc(Storage::Handle* handle) override;
@@ -91,11 +97,18 @@ class CPUSharedStorageManager final : public StorageManager {
  private:
   static constexpr size_t alignment_ = 16;
 
-  std::mutex mutex_;
+  std::recursive_mutex mutex_;
   std::mt19937 rand_gen_;
   std::unordered_map<void*, Storage::Handle> pool_;
+#ifdef _WIN32
+  std::unordered_map<void*, Storage::Handle> is_free_;
+  std::unordered_map<void*, HANDLE> map_handle_map_;
+#endif
 
   void FreeImpl(const Storage::Handle& handle);
+#ifdef _WIN32
+  void CheckAndRealFree();
+#endif
 
   std::string SharedHandleToString(int shared_pid, int shared_id) {
     std::stringstream name;
@@ -106,14 +119,44 @@ class CPUSharedStorageManager final : public StorageManager {
 };  // class CPUSharedStorageManager
 
 void CPUSharedStorageManager::Alloc(Storage::Handle* handle) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
   std::uniform_int_distribution<> dis(0, std::numeric_limits<int>::max());
   int fid = -1;
   bool is_new = false;
   size_t size = handle->size + alignment_;
-  void* ptr = nullptr;
-#ifdef _WIN32
-  LOG(FATAL) << "Shared memory is not supported on Windows yet.";
+  void *ptr = nullptr;
+  #ifdef _WIN32
+  CheckAndRealFree();
+  HANDLE map_handle = nullptr;
+  uint32_t error = 0;
+  if (handle->shared_id == -1 && handle->shared_pid == -1) {
+    is_new = true;
+    handle->shared_pid = _getpid();
+    for (int i = 0; i < 10; ++i) {
+      handle->shared_id = dis(rand_gen_);
+      auto filename = SharedHandleToString(handle->shared_pid, handle->shared_id);
+      map_handle = CreateFileMapping(INVALID_HANDLE_VALUE,
+                                     NULL, PAGE_READWRITE, 0, size, filename.c_str());
+      if ((error = GetLastError()) == ERROR_SUCCESS) {
+        break;;
+      }
+    }
+  } else {
+    auto filename = SharedHandleToString(handle->shared_pid, handle->shared_id);
+    map_handle = OpenFileMapping(FILE_MAP_READ | FILE_MAP_WRITE,
+                                 FALSE, filename.c_str());
+    error = GetLastError();
+  }
+
+  if (error != ERROR_SUCCESS && map_handle == nullptr) {
+    LOG(FATAL) << "Failed to open shared memory. CreateFileMapping failed with error "
+               << error;
+  }
+
+  ptr = MapViewOfFile(map_handle, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, 0);
+  CHECK_NE(ptr, (void *)0)
+      << "Failed to map shared memory. MapViewOfFile failed with error " << GetLastError();
+  map_handle_map_[ptr] = map_handle;
 #else
   if (handle->shared_id == -1 && handle->shared_pid == -1) {
     is_new = true;
@@ -153,7 +196,7 @@ void CPUSharedStorageManager::FreeImpl(const Storage::Handle& handle) {
   int count = DecrementRefCount(handle);
   CHECK_GE(count, 0);
 #ifdef _WIN32
-  LOG(FATAL) << "Shared memory is not supported on Windows yet.";
+  is_free_[handle.dptr] = handle;
 #else
   CHECK_EQ(munmap(static_cast<char*>(handle.dptr) - alignment_,
                   handle.size + alignment_), 0)
@@ -169,6 +212,26 @@ void CPUSharedStorageManager::FreeImpl(const Storage::Handle& handle) {
 #endif  // _WIN32
 }
 
+#ifdef _WIN32
+inline void CPUSharedStorageManager::CheckAndRealFree() {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  for (auto it = std::begin(is_free_); it != std::end(is_free_);) {
+    void* ptr = static_cast<char*>(it->second.dptr) - alignment_;
+    std::atomic<int>* counter = reinterpret_cast<std::atomic<int>*>(
+      static_cast<char*>(it->second.dptr) - alignment_);
+    if ((*counter) == 0) {
+      CHECK_NE(UnmapViewOfFile(ptr), 0)
+        << "Failed to UnmapViewOfFile shared memory ";
+      CHECK_NE(CloseHandle(map_handle_map_[ptr]), 0)
+        << "Failed to CloseHandle shared memory ";
+      map_handle_map_.erase(ptr);
+      it = is_free_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+#endif  // _WIN32
 }  // namespace storage
 }  // namespace mxnet
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f2758c4428..8a85c094ad 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -20,8 +20,8 @@ if(NOT MSVC)
   set(UNITTEST_STATIC_LINK ON)
 endif()
 
-if(GTEST_FOUND)
-
+# FIXME MSVC unit test linking issue
+if(GTEST_FOUND AND NOT MSVC)
   enable_testing()
 
   file(GLOB_RECURSE UNIT_TEST_SOURCE "cpp/*.cc" "cpp/*.h")
diff --git a/tests/ci_build/Dockerfile.build_cuda b/tests/ci_build/Dockerfile.build_cuda
new file mode 100644
index 0000000000..5fccec7681
--- /dev/null
+++ b/tests/ci_build/Dockerfile.build_cuda
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container
+# which is required due to OpenBLAS being incompatible with ubuntu14.04
+# the reason we use a gpu base container is that we are going to test MKLDNN
+# operator implementation against GPU implementation
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+COPY install/ubuntu_install_r.sh /install/
+RUN /install/ubuntu_install_r.sh
+COPY install/ubuntu_install_perl.sh /install/
+RUN /install/ubuntu_install_perl.sh
+
+# Allows to run tasks on a CPU without nvidia-docker and GPU 
+COPY install/ubuntu_install_nvidia.sh /install/
+RUN /install/ubuntu_install_nvidia.sh
+
+# Add MKLML libraries
+RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/tests/ci_build/Dockerfile.caffe_gpu b/tests/ci_build/Dockerfile.caffe_gpu
index 4f6522dab8..34c4625bdc 100644
--- a/tests/ci_build/Dockerfile.caffe_gpu
+++ b/tests/ci_build/Dockerfile.caffe_gpu
@@ -1,4 +1,6 @@
-FROM nvidia/cuda:7.5-cudnn5-devel
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container
+# which is required due to OpenBLAS being incompatible with ubuntu14.04
 
 COPY install/ubuntu_install_core.sh /install/
 RUN /install/ubuntu_install_core.sh
@@ -18,6 +20,15 @@ RUN cd /; git clone http://github.com/BVLC/caffe.git; cd caffe; \
 
 RUN echo "CPU_ONLY := 1" >> /caffe/Makefile.config
 
+# Fixes https://github.com/BVLC/caffe/issues/5658 See https://github.com/intel/caffe/wiki/Ubuntu-16.04-or-15.10-Installation-Guide
+RUN echo "INCLUDE_DIRS += /usr/lib /usr/lib/x86_64-linux-gnu /usr/include/hdf5/serial/ " >> /caffe/Makefile.config
+RUN echo "LIBRARY_DIRS += /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu/hdf5/serial " >> /caffe/Makefile.config
+
+# Fixes https://github.com/BVLC/caffe/issues/4333 See https://github.com/intel/caffe/wiki/Ubuntu-16.04-or-15.10-Installation-Guide
+# Note: This is only valid on Ubuntu16.04 - the version numbers are bound to the distribution
+RUN ln -s /usr/lib/x86_64-linux-gnu/libhdf5_serial.so.10.0.2 /usr/lib/x86_64-linux-gnu/libhdf5.so
+RUN ln -s /usr/lib/x86_64-linux-gnu/libhdf5_serial_hl.so.10.0.2 /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
+
 RUN cd caffe; make all pycaffe -j$(nproc)
 
 RUN cd caffe/python; for req in $(cat requirements.txt); do pip2 install $req; done
diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
index c7bb0af0f7..226054a597 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
 COPY install/ubuntu_install_core.sh /install/
 RUN /install/ubuntu_install_core.sh
diff --git a/tests/ci_build/Dockerfile.cpu_mklml b/tests/ci_build/Dockerfile.cpu_mklml
new file mode 100644
index 0000000000..faa78646c5
--- /dev/null
+++ b/tests/ci_build/Dockerfile.cpu_mklml
@@ -0,0 +1,18 @@
+FROM ubuntu:16.04
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+COPY install/ubuntu_install_r.sh /install/
+RUN /install/ubuntu_install_r.sh
+COPY install/ubuntu_install_perl.sh /install/
+RUN /install/ubuntu_install_perl.sh
+
+# Add MKLML library, compatible with Ubuntu16.04
+RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index a2893a9fb4..2483e62b99 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -1,4 +1,6 @@
-FROM nvidia/cuda:7.5-cudnn5-devel
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container
+# which is required due to OpenBLAS being incompatible with ubuntu14.04
 
 COPY install/ubuntu_install_core.sh /install/
 RUN /install/ubuntu_install_core.sh
diff --git a/tests/ci_build/Dockerfile.mklml_gpu b/tests/ci_build/Dockerfile.gpu_mklml
similarity index 65%
rename from tests/ci_build/Dockerfile.mklml_gpu
rename to tests/ci_build/Dockerfile.gpu_mklml
index 185681cb52..2c3564c45e 100644
--- a/tests/ci_build/Dockerfile.mklml_gpu
+++ b/tests/ci_build/Dockerfile.gpu_mklml
@@ -1,4 +1,6 @@
-FROM nvidia/cuda:7.5-cudnn5-devel
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container
+# which is required due to OpenBLAS being incompatible with ubuntu14.04
 # the reason we used a gpu base container because we are going to test MKLDNN
 # operator implementation against GPU implementation
 
@@ -9,7 +11,8 @@ RUN /install/ubuntu_install_python.sh
 COPY install/ubuntu_install_scala.sh /install/
 RUN /install/ubuntu_install_scala.sh
 
-RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.10/mklml_lnx_2018.0.20170908.tgz
+# Add MKLML library, compatible with Ubuntu16.04
+RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
 RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
 
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/tests/ci_build/Dockerfile.lint b/tests/ci_build/Dockerfile.lint
index b19b7676ec..a72b3f8886 100644
--- a/tests/ci_build/Dockerfile.lint
+++ b/tests/ci_build/Dockerfile.lint
@@ -1,5 +1,6 @@
 # For lint test
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
-RUN apt-get update && apt-get install -y python-pip
+# Sudo is not present on ubuntu16.04
+RUN apt-get update && apt-get install -y python-pip sudo
 RUN pip install cpplint pylint
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
index 79fcd86a5d..118f618fa7 100755
--- a/tests/ci_build/ci_build.sh
+++ b/tests/ci_build/ci_build.sh
@@ -55,6 +55,12 @@ if [[ "$1" == "-it" ]]; then
     shift 1
 fi
 
+if [[ "$1" == "--dockerbinary" ]]; then
+    DOCKER_BINARY="$2"
+    echo "Using custom Docker Engine: ${DOCKER_BINARY}"
+    shift 2
+fi
+
 if [[ ! -f "${DOCKERFILE_PATH}" ]]; then
     echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\""
     exit 1
@@ -73,11 +79,15 @@ if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then
       exit 1
 fi
 
-# Use nvidia-docker if the container is GPU.
-if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then
-    DOCKER_BINARY="nvidia-docker"
-else
-    DOCKER_BINARY="docker"
+# Only set docker binary automatically if it has not been specified
+if [[ -z "${DOCKER_BINARY}" ]]; then
+    # Use nvidia-docker if the container is GPU.
+    if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then
+        DOCKER_BINARY="nvidia-docker"
+    else
+        DOCKER_BINARY="docker"
+    fi
+    echo "Automatically assuming ${DOCKER_BINARY} as docker binary"
 fi
 
 # Helper function to traverse directories up until given file is found.
@@ -139,6 +149,10 @@ echo "Running '${COMMAND[@]}' inside ${DOCKER_IMG_NAME}..."
 # By default we cleanup - remove the container once it finish running (--rm)
 # and share the PID namespace (--pid=host) so the process inside does not have
 # pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it).
+
+# Turning off MXNET_STORAGE_FALLBACK_LOG_WARNING temporarily per this issue:
+# https://github.com/apache/incubator-mxnet/issues/8980
+
 ${DOCKER_BINARY} run --rm --pid=host \
     -v ${WORKSPACE}:/workspace \
     -w /workspace \
@@ -147,6 +161,8 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_UID=$(id -u)" \
     -e "CI_BUILD_GROUP=$(id -g -n)" \
     -e "CI_BUILD_GID=$(id -g)" \
+    -e "CUDA_ARCH=-gencode arch=compute_52,code=[sm_52,compute_52] --fatbin-options -compress-all" \
+    -e "MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMG_NAME} \
     ${PRE_COMMAND} \
diff --git a/tests/ci_build/install/ubuntu_install_core.sh b/tests/ci_build/install/ubuntu_install_core.sh
index 49475740d2..eefd7590cd 100755
--- a/tests/ci_build/install/ubuntu_install_core.sh
+++ b/tests/ci_build/install/ubuntu_install_core.sh
@@ -21,6 +21,9 @@
 
 apt-get update && apt-get install -y \
     build-essential git libopenblas-dev liblapack-dev libopencv-dev \
-    libcurl4-openssl-dev libgtest-dev cmake wget unzip
+    libcurl4-openssl-dev libgtest-dev cmake wget unzip sudo  
+
+# Link Openblas to Cblas as this link does not exist on ubuntu16.04
+ln -s /usr/lib/libopenblas.so /usr/lib/libcblas.so
 
 cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib
diff --git a/example/torch/torch_function.py b/tests/ci_build/install/ubuntu_install_nvidia.sh
old mode 100644
new mode 100755
similarity index 62%
rename from example/torch/torch_function.py
rename to tests/ci_build/install/ubuntu_install_nvidia.sh
index af285de227..71fde8e800
--- a/example/torch/torch_function.py
+++ b/tests/ci_build/install/ubuntu_install_nvidia.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,18 +17,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from __future__ import print_function
-import mxnet as mx
-x = mx.th.randn(2, 2, ctx=mx.cpu(0))
-print(x.asnumpy())
-y = mx.th.abs(x)
-print(y.asnumpy())
+# install nvidia libraries to compile and run CUDA without
+# the necessity of nvidia-docker and a GPU
+
+# Needed to run add-apt-repository
+apt update && apt install -y software-properties-common
 
-x = mx.th.randn(2, 2, ctx=mx.cpu(0))
-print(x.asnumpy())
-mx.th.abs(x, x) # in-place
-print(x.asnumpy())
+add-apt-repository -y ppa:graphics-drivers
 
-x = mx.th.ones(2, 2, ctx=mx.cpu(0))
-y = mx.th.ones(2, 2, ctx=mx.cpu(0))*2
-print(mx.th.cdiv(x,y).asnumpy())
+# Retrieve ppa:graphics-drivers and install nvidia-drivers.
+# Note: DEBIAN_FRONTEND required to skip the interactive setup steps
+apt update && \
+    DEBIAN_FRONTEND=noninteractive apt install -y nvidia-384
diff --git a/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu b/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu
index dfd675b890..de4629fab2 100644
--- a/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu
+++ b/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu
@@ -1,4 +1,4 @@
 # -*- mode: dockerfile -*-
 # dockerfile to test pip installation on CPU
 
-FROM ubuntu:14.04
+FROM ubuntu:16.04
diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc
index be60ecfc53..3007285286 100644
--- a/tests/cpp/engine/threaded_engine_test.cc
+++ b/tests/cpp/engine/threaded_engine_test.cc
@@ -23,7 +23,6 @@
  * \brief threaded engine tests
 */
 #include <time.h>
-#include <unistd.h>
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
 #include <mxnet/engine.h>
@@ -47,7 +46,7 @@ struct Workload {
   int time;
 };
 
-static u_int32_t seed_ = 0xdeadbeef;
+static uint32_t seed_ = 0xdeadbeef;
 
 /**
  * generate a list of workloads
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 51cbcd71a3..6a220bdad6 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -138,7 +138,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
     AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) {
       test::op::OperatorDataInitializer<DType>::FillRandom(arr.data());
     });
-    return std::move(array);
+    return array;
   }
 
   /*!
@@ -154,7 +154,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
     AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) {
       test::op::OperatorDataInitializer<DType>::FillZero(arr.data());
     });
-    return std::move(array);
+    return array;
   }
 
   nnvm::NodePtr MakeNode() const {
diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h
index bddade0830..066168e262 100644
--- a/tests/cpp/include/test_op.h
+++ b/tests/cpp/include/test_op.h
@@ -101,6 +101,8 @@ class OperatorDataInitializer {
    * \param blob Blob which to fill with random values
    */
   void FillRandom(const TBlob& blob) const {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wabsolute-value"
     std::uniform_real_distribution<> dis_real(-5.0, 5.0);
     std::uniform_int_distribution<> dis_int(-128, 127);
     test::patternFill<DType>(&blob, [this, &dis_real, &dis_int]() -> DType {
@@ -108,7 +110,7 @@ class OperatorDataInitializer {
         DType val;
         do {
           val = static_cast<DType>(dis_real(this->generator()));
-        } while (fabs(val) < 1e-5);  // If too close to zero, try again
+        } while (std::abs(val) < 1e-5);  // If too close to zero, try again
         return val;
       } else {
         DType val;
@@ -118,6 +120,7 @@ class OperatorDataInitializer {
         return val;
       }
     });
+#pragma clang diagnostic pop
   }
 
   void FillZero(const TBlob& blob) const {
@@ -284,7 +287,7 @@ inline std::vector<TShape> ShapesOf(const std::vector<NDArray>& arrays) {
   for (const NDArray& ar : arrays) {
     res.emplace_back(ar.shape());
   }
-  return std::move(res);
+  return res;
 }
 
 }  // namespace op
diff --git a/tests/cpp/include/test_perf.h b/tests/cpp/include/test_perf.h
index 672b28a426..8d2a2646ca 100644
--- a/tests/cpp/include/test_perf.h
+++ b/tests/cpp/include/test_perf.h
@@ -27,7 +27,11 @@
 #ifndef TEST_PERF_H_
 #define TEST_PERF_H_
 
+#ifndef _WIN32
 #include <sys/time.h>
+#else
+#include <Windows.h>
+#endif
 #include <dmlc/logging.h>
 #include <iomanip>
 #include <iostream>
@@ -278,6 +282,15 @@ class TimingInstrument {
       CHECK_EQ(o.nestingCount_, 0U);
     }
 
+    inline Info& operator = (const Info& o) {
+      name_ = o.name_;
+      baseTime_.store(baseTime_.load());
+      nestingCount_.store(nestingCount_.load());
+      cycleCount_.store(cycleCount_.load());
+      duration_.store(duration_.load());
+      return *this;
+    }
+
     /*!
      * \brief Return time for each operation in milliseconds
      * \return Time for each operation in milliseconds
diff --git a/tests/cpp/include/test_tune.h b/tests/cpp/include/test_tune.h
index 725aa90a10..f5e15cc181 100644
--- a/tests/cpp/include/test_tune.h
+++ b/tests/cpp/include/test_tune.h
@@ -26,7 +26,12 @@
 #ifndef TEST_TUNE_H_
 #define TEST_TUNE_H_
 
+#ifndef _WIN32
 #include <sys/time.h>
+#else
+#include <Windows.h>
+#endif
+
 #include <dmlc/logging.h>
 #include <iomanip>
 #include <iostream>
@@ -102,7 +107,7 @@ class TuningTester {
       CHECK(res.find(this_run_shapes) == res.end());
       res[this_run_shapes] = tmap;
     }
-    return std::move(res);
+    return res;
   }
 
   using tuned_timing_t = std::map<
@@ -237,7 +242,7 @@ class TuningTester {
         results[shapes] = result;
       }
     }
-    return std::move(results);
+    return results;
   }
 
   /*!
diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h
index 8347a8a9d7..a41e62392c 100644
--- a/tests/cpp/include/test_util.h
+++ b/tests/cpp/include/test_util.h
@@ -32,6 +32,7 @@
 #include <string>
 #include <vector>
 #include <sstream>
+#include <random>
 
 #if MXNET_USE_VTUNE
 #include <ittnotify.h>
@@ -512,12 +513,16 @@ inline void print(const RunContext& ctx,
 }
 
 inline std::string demangle(const char *name) {
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
   int status = -4;  // some arbitrary value to eliminate the compiler warning
   std::unique_ptr<char, void(*)(void*)> res {
     abi::__cxa_demangle(name, nullptr, nullptr, &status),
     &std::free
   };
   return status ? name : res.get();
+#else
+  return name;
+#endif
 }
 
 template<typename T>
@@ -605,7 +610,7 @@ inline ScalarType rangedRand(const ScalarType min, const ScalarType max) {
     defect   = num_rand % num_bins;
   ScalarType x;
   do {
-    x = random();
+    x = std::rand();
   } while (num_rand - defect <= (uint64_t)x);
 
   return static_cast<ScalarType>(x / bin_size + min);
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index 65bd9aaf40..e482848705 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -27,7 +27,7 @@
 #include <mxnet/tensor_blob.h>
 #include "../include/test_op_runner.h"
 #include "../include/test_legacy_op.h"
-#include "../../src/operator/activation-inl.h"
+#include "../../src/operator/nn/activation-inl.h"
 
 using namespace mxnet;
 
diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc
index 8f53ee5588..d9b426a815 100644
--- a/tests/cpp/operator/batchnorm_test.cc
+++ b/tests/cpp/operator/batchnorm_test.cc
@@ -26,7 +26,7 @@
 
 #include <dmlc/logging.h>
 #include <mxnet/tensor_blob.h>
-#include "../../src/operator/batch_norm-inl.h"
+#include "../../src/operator/nn/batch_norm-inl.h"
 #include "../../src/operator/batch_norm_v1-inl.h"
 #include "./test_legacy_op.h"
 #include "executor/exec_pass.h"
@@ -845,6 +845,7 @@ TEST(BATCH_NORM, TestStochasticTiming_2D) {
 }
 
 /*! \brief Performance tests */
+#ifndef _WIN32
 TEST(BATCH_NORM, TestTiming_2D) {
 #ifdef NDEBUG
   size_t THISCOUNT = 10;
@@ -861,7 +862,7 @@ MSHADOW_REAL_TYPE_SWITCH_EX(
     false, false,
     blank_kwargs,
     2, THISCOUNT);
-#if MXNET_USE_MKL2017 == 1
+#if defined(MXNET_USE_MKL2017) && (MXNET_USE_MKL2017 == 1)
   timingTest<op::BatchNormProp, BNOperatorExecutor<DType, AccReal>>(
     "MKL BatchNormProp<cpu> 2D",
     false, false,
@@ -897,6 +898,7 @@ MSHADOW_REAL_TYPE_SWITCH_EX(
 #endif
 });
 }
+#endif  // _WIN32
 
 /**
  * Backward tests (generally include forward tests as well)
@@ -1150,6 +1152,8 @@ void compare(const bool isGPU,
     typename OperatorExecutor::AccRealType>::compare(object, info_checkLoad);
 }
 
+
+#ifndef _WIN32
 TEST(BATCH_NORM, TestBackward1D_Simple) {
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32, DTypeX, AccReal,
@@ -1195,7 +1199,9 @@ TEST(BATCH_NORM, TestBackward1D_Simple) {
       compare(false, info, ___BN_TestBackward1D_Simple_data_shape_1_1_2___);
     });
 }
+#endif  // _WIN32
 
+#ifndef _WIN32
 TEST(BATCH_NORM, TestBackward3D) {
   MSHADOW_REAL_TYPE_SWITCH_EX(
     mshadow::kFloat32, DType, AccReal,
@@ -1211,6 +1217,7 @@ TEST(BATCH_NORM, TestBackward3D) {
 #endif
     });
 }
+#endif  // _WIN32
 
 // nonfixgamma_kwargs
 TEST(BATCH_NORM, Test2DBackwardMixed_cpu_cpu_nfg) {
@@ -1343,6 +1350,7 @@ static void compare(const TBlob& blob, const std::vector<DType>& vals) {
   }
 }
 
+#ifndef _WIN32
 template<typename DType, typename AccReal>
 static void compare(const std::vector<std::vector<float>>& d1,
                     const std::vector<std::vector<float>>& d2) {
@@ -1435,6 +1443,7 @@ static TShape MakeShape(const std::vector<index_t>& shape,
   return newShape;
 }
 
+
 /*! \brief Create and arrange equivalent data with different channel axes, then compare
  * normalized results */
 static void runChannelAxisTest(
@@ -1626,6 +1635,7 @@ TEST(BATCH_NORM, TestChannelAxis) {
     kwargs.pop_back();
   }
 }
+#endif
 
 #if MXNET_USE_CUDA
 
diff --git a/tests/cpp/operator/coreop_perf.cc b/tests/cpp/operator/coreop_perf.cc
index 2655740677..31ecebdfee 100644
--- a/tests/cpp/operator/coreop_perf.cc
+++ b/tests/cpp/operator/coreop_perf.cc
@@ -25,7 +25,7 @@
 
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
-#include "../../src/operator/activation-inl.h"
+#include "../../src/operator/nn/activation-inl.h"
 #include "../include/test_op_runner.h"
 #include "../include/test_core_op.h"
 
diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc
index d9a3795f46..c8d8021f6f 100644
--- a/tests/cpp/operator/fully_conn_perf.cc
+++ b/tests/cpp/operator/fully_conn_perf.cc
@@ -26,7 +26,7 @@
 
 #include <dmlc/logging.h>
 #include <mxnet/tensor_blob.h>
-#include "../../src/operator/fully_connected-inl.h"
+#include "../../src/operator/nn/fully_connected-inl.h"
 #include "../include/test_op_runner.h"
 #include "../include/test_legacy_op.h"
 
diff --git a/tests/cpp/operator/tune/operator_tune_test.cc b/tests/cpp/operator/tune/operator_tune_test.cc
index 5ecb03cc5b..60096d33df 100644
--- a/tests/cpp/operator/tune/operator_tune_test.cc
+++ b/tests/cpp/operator/tune/operator_tune_test.cc
@@ -18,7 +18,8 @@
  */
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
-#include "../../src/operator/activation-inl.h"
+#include <numeric>
+#include "../../src/operator/nn/activation-inl.h"
 #include "../../src/operator/operator_tune-inl.h"
 #include "../include/test_op_runner.h"
 #include "../include/test_core_op.h"
@@ -58,7 +59,7 @@ static std::vector<std::vector<TShape>> tuning_shapes() {
       {{50, 3, 18, 32}}
     };
   }
-  return std::move(shapes);
+  return shapes;
 }
 
 /*!
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 030b24026e..4e88c6488f 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -2,8 +2,11 @@ TEST_SRC = $(shell find tests/cpp/ -name "*.cc")
 TEST_OBJ = $(patsubst %.cc, build/%.o, $(TEST_SRC))
 TEST = build/tests/cpp/mxnet_unit_tests
 
-GTEST_LIB=$(GTEST_PATH)/lib/
-GTEST_INC=$(GTEST_PATH)/include/
+GTEST_DIR=3rdparty/googletest/googletest/
+GTEST_INC=3rdparty/googletest/googletest/include/
+GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS)
+GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \
+                $(GTEST_DIR)/include/gtest/internal/*.h
 
 TEST_CFLAGS = -Itests/cpp/include -Isrc $(CFLAGS)
 TEST_LDFLAGS = $(LDFLAGS) -Llib -lmxnet
@@ -15,6 +18,12 @@ endif
 
 .PHONY: runtest testclean
 
+gtest-all.o : $(GTEST_SRCS_)
+	$(CXX) $(CPPFLAGS) -I$(GTEST_INC) -I$(GTEST_DIR) $(CXXFLAGS) -c $(GTEST_DIR)/src/gtest-all.cc
+
+gtest.a : gtest-all.o
+	$(AR) $(ARFLAGS) $@ $^
+
 build/tests/cpp/%.o : tests/cpp/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
@@ -35,8 +44,8 @@ build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
 
-$(TEST): $(TEST_OBJ) lib/libmxnet.so
-	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o $@ $^ $(TEST_LDFLAGS) -L$(GTEST_LIB) -lgtest
+$(TEST): $(TEST_OBJ) lib/libmxnet.so gtest.a
+	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o $@ $^ $(TEST_LDFLAGS)
 
 runtest: $(TEST)
 	LD_LIBRARY_PATH=$(shell pwd)/lib:$(LD_LIBRARY_PATH) $(TEST)
diff --git a/tests/python/gpu/test_nccl.py b/tests/python/gpu/test_nccl.py
index 0e0c18fe5f..fd27c0b58b 100644
--- a/tests/python/gpu/test_nccl.py
+++ b/tests/python/gpu/test_nccl.py
@@ -38,5 +38,7 @@ def test_nccl_pushpull():
             for x in range(n_gpus):
                 assert(np.sum(np.abs((res[x]-n_gpus).asnumpy()))==0)
 
+    print ("Passed")
+
 if __name__ == '__main__':
     test_nccl_pushpull()
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index cecda21a07..7706bce56e 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -38,6 +38,7 @@
 from test_sparse_ndarray import test_create_sparse_nd_empty, test_create_sparse_nd_from_sparse
 from test_sparse_ndarray import test_create_sparse_nd_from_dense, test_create_sparse_nd_infer_shape
 from test_sparse_ndarray import test_sparse_nd_check_format, test_sparse_nd_copy
+from test_sparse_ndarray import test_sparse_nd_setitem
 from test_sparse_operator import *
 from test_ndarray import *
 
@@ -989,6 +990,7 @@ def test_embedding_helper(data_types, weight_types, low_pad, high_pad):
     weight_types = [np.float16, np.float32, np.float64]
     test_embedding_helper(data_types, weight_types, 0, 5)
 
+@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/8288")
 def test_svmoutput_with_type():
     sym = mx.sym.SVMOutput(name='svmoutput', use_linear=True)
     ctx_list = [{'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float64}},
diff --git a/tests/python/unittest/test_contrib_krprod.py b/tests/python/unittest/test_contrib_krprod.py
new file mode 100644
index 0000000000..07c0fb843b
--- /dev/null
+++ b/tests/python/unittest/test_contrib_krprod.py
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+
+from __future__ import print_function
+import numpy as np
+import mxnet as mx
+
+from numpy.testing import assert_allclose
+
+def assert_mx_allclose(A, B, **kwds):
+    return assert_allclose(A.asnumpy(), B.asnumpy(), **kwds)
+
+
+def test_krprod_one_input():
+    A = mx.nd.arange(1,9).reshape((2,4))
+    out = mx.nd.khatri_rao(A)
+    assert_mx_allclose(out, A, rtol=1e-12)
+
+
+def test_krprod_two_inputs():
+    A = mx.nd.arange(1,7).reshape((3,2))
+    B = mx.nd.arange(1,3).reshape((1,2))
+    out = mx.nd.khatri_rao(A, B)
+    expected = mx.nd.array([[1,4],[3,8],[5,12]])
+    assert_mx_allclose(out, expected, rtol=1e-12)
+
+    A = mx.nd.arange(1,7).reshape((3,2))
+    B = mx.nd.arange(1,9).reshape((4,2))
+    out = mx.nd.khatri_rao(A, B)
+    expected = mx.nd.array([[1,4],[3,8],[5,12],[7,16],[3,8],[9,16],[15,24],
+                            [21,32],[5,12],[15,24],[25,36],[35,48]])
+    assert_mx_allclose(out, expected, rtol=1e-12)
+
+
+def test_krprod_three_inputs():
+    A = mx.nd.arange(1,7).reshape((3,2))
+    B = mx.nd.arange(1,3).reshape((1,2))
+    C = mx.nd.arange(1,5).reshape((2,2))
+    out = mx.nd.khatri_rao(A, B, C)
+    expected = mx.nd.array([[1,8],[3,16],[3,16],[9,32],[5,24],[15,48]])
+    assert_mx_allclose(out, expected, rtol=1e-12)
+
+    out_AB = mx.nd.khatri_rao(A, B)
+    out = mx.nd.khatri_rao(out_AB, C)
+    assert_mx_allclose(out, expected, rtol=1e-12)
+
+    out_BC = mx.nd.khatri_rao(B, C)
+    out = mx.nd.khatri_rao(A, out_BC)
+    assert_mx_allclose(out, expected, rtol=1e-12)
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index c619056c11..f2d001a702 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -70,6 +70,23 @@ def forward(self, x):
     net3.load_params('net1.params', mx.cpu())
 
 
+def test_parameter_str():
+    class Net(gluon.Block):
+        def __init__(self, **kwargs):
+            super(Net, self).__init__(**kwargs)
+            with self.name_scope():
+                self.dense0 = nn.Dense(10, in_units=5, use_bias=False)
+
+    net = Net(prefix='net1_')
+    lines = str(net.collect_params()).splitlines()
+
+    assert lines[0] == 'net1_ ('
+    assert 'net1_dense0_weight' in lines[1]
+    assert '(10, 5)' in lines[1]
+    assert 'numpy.float32' in lines[1]
+    assert lines[2] == ')'
+
+
 def test_basic():
     model = nn.Sequential()
     model.add(nn.Dense(128, activation='tanh', in_units=10, flatten=False))
diff --git a/tests/python/unittest/test_init.py b/tests/python/unittest/test_init.py
index e642e65ec3..efd6ef3674 100644
--- a/tests/python/unittest/test_init.py
+++ b/tests/python/unittest/test_init.py
@@ -44,8 +44,24 @@ def test_aux_init():
     assert (mod.get_params()[1]['bn_moving_var'].asnumpy() == 1).all()
     assert (mod.get_params()[1]['bn_moving_mean'].asnumpy() == 0).all()
 
+def test_rsp_const_init():
+    def check_rsp_const_init(init, val):
+        shape = (10, 10)
+        x = mx.symbol.Variable("data", stype='csr')
+        weight = mx.symbol.Variable("weight", shape=(shape[1], 2),
+                                    init=init, stype='row_sparse')
+        dot = mx.symbol.sparse.dot(x, weight)
+        mod = mx.mod.Module(dot, label_names=None)
+        mod.bind(data_shapes=[('data', shape)])
+        mod.init_params()
+        assert (list(mod.get_params()[0].values())[0].asnumpy() == val).all()
+
+    check_rsp_const_init(mx.initializer.Constant(value=2.), 2.)
+    check_rsp_const_init(mx.initializer.Zero(), 0.)
+    check_rsp_const_init(mx.initializer.One(), 1.)
 
 if __name__ == '__main__':
     test_variable_init()
     test_default_init()
     test_aux_init()
+    test_rsp_const_init()
diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py
index 8ee4bfae00..e044df0705 100644
--- a/tests/python/unittest/test_loss.py
+++ b/tests/python/unittest/test_loss.py
@@ -19,6 +19,7 @@
 import numpy as np
 from mxnet import gluon
 from mxnet.test_utils import assert_almost_equal, default_context
+import unittest
 
 
 def test_loss_ndarray():
@@ -160,6 +161,7 @@ def test_l1_loss():
     assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1
 
 
+@unittest.skip("flaky test. https://github.com/apache/incubator-mxnet/issues/8892")
 def test_ctc_loss():
     loss = gluon.loss.CTCLoss()
     l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[1,0,-1,-1],[2,1,1,-1]]))
@@ -185,7 +187,7 @@ def test_ctc_loss():
     l = loss(mx.nd.ones((2,25,4)), mx.nd.array([[2,1,3,3],[3,2,2,3]]), mx.nd.array([20,20]), mx.nd.array([2,3]))
     mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741]))
 
-
+@unittest.skip("flaky test. https://github.com/apache/incubator-mxnet/issues/8892")
 def test_ctc_loss_train():
     np.random.seed(1234)
     N = 20
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 5512b07c77..3a0e8a43b7 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -15,15 +15,18 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import mxnet as mx
 import numpy as np
+import os
 import pickle as pkl
 import unittest
 from nose.tools import raises
-from mxnet.test_utils import *
+from mxnet.test_utils import almost_equal
+from mxnet.test_utils import assert_almost_equal
+from mxnet.test_utils import default_context
+from mxnet.test_utils import np_reduce
+from mxnet.test_utils import same
 from numpy.testing import assert_allclose
-import unittest
 import mxnet.autograd
 
 def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=[np.float32]):
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 1a26434015..6178cbe838 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -18,6 +18,7 @@
 import numpy as np
 import mxnet as mx
 import mxnet.lr_scheduler as lr_scheduler
+from mxnet import gluon
 import unittest
 from nose.tools import raises
 import math
@@ -198,6 +199,7 @@ def update(self, index, weight, grad, state):
     def update_multi_precision(self, index, weight, grad, state):
         self.update(index, weight, grad, state)
 
+@unittest.skip("Test fails intermittently. Temporarily disabled until fixed. Tracked at https://github.com/apache/incubator-mxnet/issues/9000")
 def test_sgd():
     mx.random.seed(0)
     opt1 = PySGD
@@ -643,6 +645,33 @@ def test_ftrl():
         compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape,
                           np.float32, w_stype='row_sparse', g_stype='row_sparse')
 
+def test_nadam():
+
+    def get_net(num_hidden, flatten=True):
+        data = mx.symbol.Variable('data')
+        fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten)
+        act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu")
+        fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten)
+        act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu")
+        fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten)
+        return fc3
+    np.random.seed(1234)
+    N = 20
+    data = mx.random.uniform(-1, 1, shape=(N, 10))
+    label = mx.random.uniform(-1, 1, shape=(N, 1))
+    data_iter = mx.io.NDArrayIter(data, label, batch_size=5, label_name='label', shuffle=True)
+    output = get_net(1)
+    l = mx.symbol.Variable('label')
+    Loss = gluon.loss.L1Loss()
+    loss = Loss(output, l)
+    loss = mx.sym.make_loss(loss)
+    mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',))
+    mod.fit(data_iter, num_epoch=30, optimizer_params={'learning_rate': 0.005, 'wd': 0.0005},
+            initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(),
+            optimizer='nadam')
+    assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_profiler.py b/tests/python/unittest/test_profiler.py
index 724ed3a387..78baf4a183 100644
--- a/tests/python/unittest/test_profiler.py
+++ b/tests/python/unittest/test_profiler.py
@@ -19,14 +19,12 @@
 import mxnet as mx
 from mxnet import profiler
 import time
-import numpy as np
 
 def test_profiler():
     profile_filename = "test_profile.json"
-    iter_num = 100
-    begin_profiling_iter = 50
-    end_profiling_iter = 50
-
+    iter_num = 5
+    begin_profiling_iter = 2
+    end_profiling_iter = 4
 
     profiler.profiler_set_config(mode='symbolic', filename=profile_filename)
     print('profile file save to {0}'.format(profile_filename))
@@ -43,9 +41,9 @@ def test_profiler():
     a.copyto(executor.arg_dict['A'])
     b.copyto(executor.arg_dict['B'])
 
-    flag = False
     print("execution begin")
     for i in range(iter_num):
+        print("Iteration {}/{}".format(i + 1, iter_num))
         if i == begin_profiling_iter:
             t0 = time.clock()
             profiler.profiler_set_state('run')
@@ -59,6 +57,7 @@ def test_profiler():
     duration = t1 - t0
     print('duration: {0}s'.format(duration))
     print('          {0}ms/operator'.format(duration*1000/iter_num))
+    profiler.dump_profile()
 
 if __name__ == '__main__':
     test_profiler()
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index e59e476601..e404997e49 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -93,7 +93,7 @@ def check_sparse_nd_setitem(stype, shape, dst):
         x = mx.nd.zeros(shape=shape, stype=stype)
         x[:] = dst
         dst_nd = mx.nd.array(dst) if isinstance(dst, (np.ndarray, np.generic)) else dst
-        assert same(x.asnumpy(), dst_nd.asnumpy())
+        assert np.all(x.asnumpy() == dst_nd.asnumpy() if isinstance(dst_nd, NDArray) else dst)
 
     shape = rand_shape_2d()
     for stype in ['row_sparse', 'csr']:
@@ -102,7 +102,8 @@ def check_sparse_nd_setitem(stype, shape, dst):
         check_sparse_nd_setitem(stype, shape, rand_ndarray(shape, stype))
         # numpy assignment
         check_sparse_nd_setitem(stype, shape, np.ones(shape))
-
+    # scalar assigned to row_sparse NDArray
+    check_sparse_nd_setitem('row_sparse', shape, 2)
 
 def test_sparse_nd_slice():
     shape = (rnd.randint(2, 10), rnd.randint(2, 10))    
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index a08b6187bc..a56677c5b0 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1429,63 +1429,62 @@ def test_fallback(func_name, axis=0, keepdims=True, exclude=True):
 
 
 def test_sparse_square_sum():
-    if default_context().device_type == 'cpu':
-        dim0 = 30
-        dim1 = 30
-        axes = [0, 1]
-        keepdims = [False, True]
-        densities = [0, 0.01, 0.2, 0.5, 1.0]
-        for density in densities:
-            shape = rand_shape_2d(dim0, dim1)
-            rsp = rand_ndarray(shape, 'row_sparse', density)
-            dns = rsp.tostype('default')
-            for axis in axes:
-                for keepdim in keepdims:
-                    ret = mx.nd._internal._square_sum(rsp, axis=axis, keepdims=keepdim)
-                    if axis == 1 and keepdim:
-                        assert ret.stype == 'row_sparse'
-                    else:
-                        assert ret.stype == 'default'
-                    ret_expected = mx.nd.sum(dns*dns, axis=axis, keepdims=keepdim)
-                    # check forward result
-                    assert_almost_equal(ret.asnumpy(), ret_expected.asnumpy())
-
-                    rsp_data = mx.sym.Variable('data', stype='row_sparse')
-                    test = mx.symbol._internal._square_sum(rsp_data, axis=axis, keepdims=keepdim)
-
-                    # check symbolic backward since ograd can be an rsp
-                    # and cannot be checked through check_numeric_gradient
-                    # because it will add a loss layer as the output layer
-                    # which makes ograd of the square_sum dense
-                    if axis == 1 and keepdim:
-                        dns_data = mx.sym.Variable('data')
-                        baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim)
-                        igrad_expected = mx.nd.empty(dns.shape)
-                        baseline_exec = baseline.bind(default_context(), args=[dns],
-                                                      args_grad=[igrad_expected])
-                        baseline_exec.forward(is_train=True)
-                        baseline_exec.backward([ret_expected])
-                        # check backward when ograd is row sparse
-                        check_symbolic_backward(test, [rsp], [ret_expected.tostype('row_sparse')],
-                                                [igrad_expected.asnumpy()], grad_stypes={'data': 'row_sparse'})
-
-                        # check backward when ograd is dense
-                        # the stype of output of the square_sum is deteremined in symbol binding stage.
-                        # The ograd stype of the last layer is the same as the output stype of the last layer.
-                        # Need to add one more layer after square_sum to trigger the kernel for ograd
-                        # with default stype in square_sum op.
-                        baseline1 = baseline + 1
-                        baseline_exec1 = baseline1.bind(default_context(), args=[dns],
-                                                        args_grad=[igrad_expected])
-                        baseline_exec1.forward(is_train=True)
-                        baseline_exec1.backward([ret_expected])
-                        test1 = test + 1
-                        check_symbolic_backward(test1, [rsp], [ret_expected], [igrad_expected.asnumpy()],
-                                                grad_stypes={'data': 'row_sparse'})
-
-                    # check numeric gradient
-                    check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'},
-                                           atol=1e-2, rtol=0.1)
+    dim0 = 30
+    dim1 = 30
+    axes = [0, 1]
+    keepdims = [False, True]
+    densities = [0, 0.01, 0.2, 0.5, 1.0]
+    for density in densities:
+        shape = rand_shape_2d(dim0, dim1)
+        rsp = rand_ndarray(shape, 'row_sparse', density)
+        dns = rsp.tostype('default')
+        for axis in axes:
+            for keepdim in keepdims:
+                ret = mx.nd._internal._square_sum(rsp, axis=axis, keepdims=keepdim)
+                if axis == 1 and keepdim:
+                    assert ret.stype == 'row_sparse'
+                else:
+                    assert ret.stype == 'default'
+                ret_expected = mx.nd.sum(dns*dns, axis=axis, keepdims=keepdim)
+                # check forward result
+                assert_almost_equal(ret.asnumpy(), ret_expected.asnumpy())
+
+                rsp_data = mx.sym.Variable('data', stype='row_sparse')
+                test = mx.symbol._internal._square_sum(rsp_data, axis=axis, keepdims=keepdim)
+
+                # check symbolic backward since ograd can be an rsp
+                # and cannot be checked through check_numeric_gradient
+                # because it will add a loss layer as the output layer
+                # which makes ograd of the square_sum dense
+                if axis == 1 and keepdim:
+                    dns_data = mx.sym.Variable('data')
+                    baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim)
+                    igrad_expected = mx.nd.empty(dns.shape)
+                    baseline_exec = baseline.bind(default_context(), args=[dns],
+                                                  args_grad=[igrad_expected])
+                    baseline_exec.forward(is_train=True)
+                    baseline_exec.backward([ret_expected])
+                    # check backward when ograd is row sparse
+                    check_symbolic_backward(test, [rsp], [ret_expected.tostype('row_sparse')],
+                                            [igrad_expected.asnumpy()], grad_stypes={'data': 'row_sparse'})
+
+                    # check backward when ograd is dense
+                    # the stype of output of the square_sum is deteremined in symbol binding stage.
+                    # The ograd stype of the last layer is the same as the output stype of the last layer.
+                    # Need to add one more layer after square_sum to trigger the kernel for ograd
+                    # with default stype in square_sum op.
+                    baseline1 = baseline + 1
+                    baseline_exec1 = baseline1.bind(default_context(), args=[dns],
+                                                    args_grad=[igrad_expected])
+                    baseline_exec1.forward(is_train=True)
+                    baseline_exec1.backward([ret_expected])
+                    test1 = test + 1
+                    check_symbolic_backward(test1, [rsp], [ret_expected], [igrad_expected.asnumpy()],
+                                            grad_stypes={'data': 'row_sparse'})
+
+                # check numeric gradient
+                check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'},
+                                       atol=1e-2, rtol=0.1)
 
 
 def test_sparse_storage_fallback():
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index 30e76a272e..e7fb0788c8 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -46,6 +46,7 @@ def test_symbol_compose():
     composed = net2(fc3_data=net1, name='composed')
     multi_out = mx.symbol.Group([composed, net1])
     assert len(multi_out.list_outputs()) == 2
+    assert len(multi_out) == 2
 
 
 def test_symbol_copy():
@@ -72,7 +73,9 @@ def test_symbol_children():
     net1 = mx.symbol.FullyConnected(data=oldfc, name='fc2', num_hidden=100)
 
     assert net1.get_children().list_outputs() == ['fc1_output', 'fc2_weight', 'fc2_bias']
+    assert len(net1.get_children()) == 3
     assert net1.get_children().get_children().list_outputs() == ['data', 'fc1_weight', 'fc1_bias']
+    assert len(net1.get_children().get_children()) == 3
     assert net1.get_children()['fc2_weight'].list_arguments() == ['fc2_weight']
     assert net1.get_children()['fc2_weight'].get_children() is None
 
@@ -169,7 +172,8 @@ def test_symbol_fluent():
                     'degrees', 'radians', 'sinh', 'cosh', 'tanh', 'arcsinh', 'arccosh', 'arctanh',
                     'exp', 'expm1', 'log', 'log10', 'log2', 'log1p', 'sqrt', 'rsqrt',
                     'square', 'reciprocal' 'reshape_like', 'cbrt', 'rcbrt', 'relu', 'sigmoid',
-                    'softmax', 'log_softmax'])
+                    'softmax', 'log_softmax', 'rint', 'ceil', 'floor', 'trunc', 'fix'])
+
     def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False):
         with mx.name.NameManager():
             data = mx.symbol.Variable('data')
diff --git a/tools/caffe_converter/compare_layers.py b/tools/caffe_converter/compare_layers.py
index 12568ed206..3f6883e1c3 100644
--- a/tools/caffe_converter/compare_layers.py
+++ b/tools/caffe_converter/compare_layers.py
@@ -17,11 +17,13 @@
 
 """Test converted models layer by layer
 """
-import os
 import argparse
 import logging
-import mxnet as mx
+import os
+import warnings
+
 import cv2
+import mxnet as mx
 import numpy as np
 
 logging.basicConfig(level=logging.INFO)
@@ -275,8 +277,8 @@ def _process_layer_parameters(layer):
             pass
 
         else:
-            logging.warn('No handling for layer %s of type %s, should we ignore it?', layer.name,
-                         layer.type)
+            warnings.warn('No handling for layer %s of type %s, should we ignore it?', layer.name,
+                          layer.type)
 
         return
 
diff --git a/tools/caffe_converter/convert_caffe_modelzoo.py b/tools/caffe_converter/convert_caffe_modelzoo.py
index ab9042fcc5..b49165bf2c 100644
--- a/tools/caffe_converter/convert_caffe_modelzoo.py
+++ b/tools/caffe_converter/convert_caffe_modelzoo.py
@@ -41,8 +41,8 @@
 model_meta_info = {
     # pylint: disable=line-too-long
     'bvlc_alexnet' : {
-        'prototxt' : 'https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_googlenet/deploy.prototxt',
-        'caffemodel' : 'http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel',
+        'prototxt' : 'https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_alexnet/deploy.prototxt',
+        'caffemodel' : 'http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel',
         'mean' : 'https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/caffe/imagenet_mean.binaryproto',
         'top-1-acc' : 0.571,
         'top-5-acc' : 0.802
diff --git a/tools/caffe_translator/build.gradle b/tools/caffe_translator/build.gradle
index 4206767da7..da5e9003a1 100644
--- a/tools/caffe_translator/build.gradle
+++ b/tools/caffe_translator/build.gradle
@@ -10,7 +10,7 @@ apply plugin: 'maven'
 apply plugin: 'signing'
 
 group 'org.caffetranslator'
-version '0.9.1'
+version '0.9.2'
 
 def isReleaseBuild
 def repositoryUrl
diff --git a/tools/caffe_translator/gradlew b/tools/caffe_translator/gradlew
index cccdd3d517..07cc915466 100755
--- a/tools/caffe_translator/gradlew
+++ b/tools/caffe_translator/gradlew
@@ -1,5 +1,22 @@
 #!/usr/bin/env sh
 
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ##############################################################################
 ##
 ##  Gradle start up script for UN*X
diff --git a/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Converter.java b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Converter.java
index 90ed9d2190..96d6fec9eb 100644
--- a/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Converter.java
+++ b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Converter.java
@@ -154,22 +154,33 @@ public String generateMXNetCode() {
             Layer layer = layers.get(layerIndex);
             SymbolGenerator generator = generators.getGenerator(layer.getType());
 
-            // If the translator cannot translate this layer to an MXNet layer,
-            // use CaffeOp or CaffeLoss instead.
+            // Handle layers for which there is no Generator
             if (generator == null) {
-                if (layer.getType().toLowerCase().endsWith("loss")) {
+                if (layer.getType().equalsIgnoreCase("Accuracy")) {
+                    // We handle accuracy layers at a later stage. Do nothing for now.
+                } else if (layer.getType().toLowerCase().endsWith("loss")) {
+                    // This is a loss layer we don't have a generator for. Wrap it in CaffeLoss.
                     generator = generators.getGenerator("CaffePluginLossLayer");
                 } else {
+                    // This is a layer we don't have a generator for. Wrap it in CaffeOp.
                     generator = generators.getGenerator("PluginIntLayerGenerator");
                 }
             }
 
-            GeneratorOutput out = generator.generate(layer, mlModel);
-            String segment = out.code;
-            code.append(segment);
-            code.append(NL);
-
-            layerIndex += out.numLayersTranslated;
+            if (generator != null) { // If we have a generator
+                // Generate code
+                GeneratorOutput out = generator.generate(layer, mlModel);
+                String segment = out.code;
+                code.append(segment);
+                code.append(NL);
+
+                // Update layerIndex depending on how many layers we ended up translating
+                layerIndex += out.numLayersTranslated;
+            } else { // If we don't have a generator
+                // We've decided to skip this layer. Generate no code. Just increment layerIndex
+                // by 1 and move on to the next layer.
+                layerIndex++;
+            }
         }
 
         String loss = getLoss(mlModel, code);
@@ -304,50 +315,8 @@ private String generateValidationMetrics(MLModel mlModel) {
     }
 
     private String generateOptimizer() {
-        String caffeOptimizer = solver.getProperty("type", "sgd").toLowerCase();
-        ST st;
-
-        String lr = solver.getProperty("base_lr");
-        String momentum = solver.getProperty("momentum", "0.9");
-        String wd = solver.getProperty("weight_decay", "0.0005");
-
-        switch (caffeOptimizer) {
-            case "adadelta":
-                st = gh.getTemplate("opt_default");
-                st.add("opt_name", "AdaDelta");
-                st.add("epsilon", solver.getProperty("delta"));
-                break;
-            case "adagrad":
-                st = gh.getTemplate("opt_default");
-                st.add("opt_name", "AdaGrad");
-                break;
-            case "adam":
-                st = gh.getTemplate("opt_default");
-                st.add("opt_name", "Adam");
-                break;
-            case "nesterov":
-                st = gh.getTemplate("opt_sgd");
-                st.add("opt_name", "NAG");
-                st.add("momentum", momentum);
-                break;
-            case "rmsprop":
-                st = gh.getTemplate("opt_default");
-                st.add("opt_name", "RMSProp");
-                break;
-            default:
-                if (!caffeOptimizer.equals("sgd")) {
-                    System.err.println("Unknown optimizer. Will use SGD instead.");
-                }
-
-                st = gh.getTemplate("opt_sgd");
-                st.add("opt_name", "SGD");
-                st.add("momentum", momentum);
-                break;
-        }
-        st.add("lr", lr);
-        st.add("wd", wd);
-
-        return st.render();
+        Optimizer optimizer = new Optimizer(solver);
+        return optimizer.generateInitCode();
     }
 
     private String generateInitializer() {
diff --git a/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Optimizer.java b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Optimizer.java
new file mode 100644
index 0000000000..da24942495
--- /dev/null
+++ b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Optimizer.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file Optimizer.java
+ * \brief Generates optimizer from solver prototxt
+ */
+
+package io.mxnet.caffetranslator;
+
+import org.stringtemplate.v4.ST;
+
+public class Optimizer {
+    private final GenerationHelper gh;
+    private final Solver solver;
+
+    public Optimizer(Solver solver) {
+        this.gh = new GenerationHelper();
+        this.solver = solver;
+    }
+
+    public String generateInitCode() {
+        ST st = gh.getTemplate("opt_" + solver.getType().toLowerCase());
+        if (st == null) {
+            System.err.println(String.format("Unknown optimizer type (%s). Using SGD instead.", solver.getType()));
+            st = gh.getTemplate("opt_sgd");
+        }
+
+        st.add("solver", solver);
+        return st.render();
+    }
+}
diff --git a/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Solver.java b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Solver.java
index ec4c8128ef..969377112c 100644
--- a/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Solver.java
+++ b/tools/caffe_translator/src/main/java/io/mxnet/caffetranslator/Solver.java
@@ -24,6 +24,7 @@
 
 package io.mxnet.caffetranslator;
 
+import lombok.Getter;
 import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CommonTokenStream;
@@ -31,6 +32,7 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.lang.reflect.Field;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.List;
@@ -38,9 +40,18 @@
 
 public class Solver {
 
+    private final String solverPath;
     private boolean parseDone;
     private Map<String, List<String>> properties;
-    private final String solverPath;
+    /**
+     * Fields corresponding to keys that can be present in the solver prototxt. 'setFields' sets these
+     * using reflection after parsing the solver prototxt. A solver object is passed to string templates
+     * and the templates read these fields.
+     */
+    @Getter
+    private String base_lr, momentum, weight_decay, lr_policy, gamma, stepsize, stepvalue, max_iter,
+            solver_mode, snapshot, snapshot_prefix, test_iter, test_interval, display, type, delta,
+            momentum2, rms_decay, solver_type;
 
     public Solver(String solverPath) {
         this.solverPath = solverPath;
@@ -67,10 +78,49 @@ public boolean parsePrototxt() {
 
         properties = solverListener.getProperties();
 
+        setFields(properties);
+
         parseDone = true;
         return true;
     }
 
+    private void setFields(Map<String, List<String>> properties) {
+        Class<?> cls = getClass();
+
+        for (Map.Entry<String, List<String>> entry : properties.entrySet()) {
+            String key = entry.getKey();
+            try {
+                Field field = cls.getDeclaredField(key);
+                field.set(this, entry.getValue().get(0));
+            } catch (NoSuchFieldException e) {
+                // Just ignore
+            } catch (IllegalAccessException e) {
+                /**
+                 * This shouldn't happen. If it does happen because we overlooked something, print
+                 * it in the console so we can investigate it.
+                 */
+                e.printStackTrace();
+            }
+        }
+
+        setDefaults();
+    }
+
+    private void setDefaults() {
+        if (type == null) {
+            type = "SGD";
+        }
+        if (delta == null) {
+            delta = "1e-8";
+        }
+        if (momentum2 == null) {
+            momentum2 = "0.999";
+        }
+        if (rms_decay == null) {
+            rms_decay = "0.99";
+        }
+    }
+
     public String getProperty(String key) {
         List<String> list = getProperties(key);
         if (list == null) {
diff --git a/tools/caffe_translator/src/main/resources/templates/accuracy.st b/tools/caffe_translator/src/main/resources/templates/accuracy.st
index f741deff7b..cbe15f6317 100644
--- a/tools/caffe_translator/src/main/resources/templates/accuracy.st
+++ b/tools/caffe_translator/src/main/resources/templates/accuracy.st
@@ -1,2 +1,20 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.metric.Accuracy(output_names=['<output_name>'], label_names=['<label_name>'], name='<name>')
 test_metrics.add(<var>)
diff --git a/tools/caffe_translator/src/main/resources/templates/activation.st b/tools/caffe_translator/src/main/resources/templates/activation.st
index 5a9c37b723..042c2e3175 100644
--- a/tools/caffe_translator/src/main/resources/templates/activation.st
+++ b/tools/caffe_translator/src/main/resources/templates/activation.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.symbol.Activation(data=<data>, act_type='<type>', name='<name>')
diff --git a/tools/caffe_translator/src/main/resources/templates/add.st b/tools/caffe_translator/src/main/resources/templates/add.st
index ca9428f24a..738ac3e562 100644
--- a/tools/caffe_translator/src/main/resources/templates/add.st
+++ b/tools/caffe_translator/src/main/resources/templates/add.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = <data1> + <data2>
diff --git a/tools/caffe_translator/src/main/resources/templates/batchnorm.st b/tools/caffe_translator/src/main/resources/templates/batchnorm.st
index c043c70072..7f2326d914 100644
--- a/tools/caffe_translator/src/main/resources/templates/batchnorm.st
+++ b/tools/caffe_translator/src/main/resources/templates/batchnorm.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <if(fix_beta)>
 <var>_beta = mx.sym.BlockGrad(mx.sym.Variable("<name>_beta", init=mx.init.Constant(0)))
 <endif>
diff --git a/tools/caffe_translator/src/main/resources/templates/concat.st b/tools/caffe_translator/src/main/resources/templates/concat.st
index 75ffa3c956..3f332751b8 100644
--- a/tools/caffe_translator/src/main/resources/templates/concat.st
+++ b/tools/caffe_translator/src/main/resources/templates/concat.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.concat(<data;separator=", "><if(dim)>, dim=<dim><endif>, name='<name>');
diff --git a/tools/caffe_translator/src/main/resources/templates/convolution.st b/tools/caffe_translator/src/main/resources/templates/convolution.st
index c4bdd5189b..c167217ad9 100644
--- a/tools/caffe_translator/src/main/resources/templates/convolution.st
+++ b/tools/caffe_translator/src/main/resources/templates/convolution.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.Convolution(data=<data>,
     <if(weight)>weight=<weight>,<endif>
     <if(bias)>bias=<bias>,<endif>
diff --git a/tools/caffe_translator/src/main/resources/templates/deconvolution.st b/tools/caffe_translator/src/main/resources/templates/deconvolution.st
index 5b63f56191..67483b91ff 100644
--- a/tools/caffe_translator/src/main/resources/templates/deconvolution.st
+++ b/tools/caffe_translator/src/main/resources/templates/deconvolution.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.Deconvolution(data=<data>,
     <if(use_weight)>weight=weight,<endif>
     <if(use_bias)>bias=bias,<endif>
diff --git a/tools/caffe_translator/src/main/resources/templates/dropout.st b/tools/caffe_translator/src/main/resources/templates/dropout.st
index 9791c098fa..ed28dc781a 100644
--- a/tools/caffe_translator/src/main/resources/templates/dropout.st
+++ b/tools/caffe_translator/src/main/resources/templates/dropout.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.Dropout(data=<data>, p=<prob>, name='<name>')
diff --git a/tools/caffe_translator/src/main/resources/templates/fc.st b/tools/caffe_translator/src/main/resources/templates/fc.st
index 22365b31d1..353b4245ce 100644
--- a/tools/caffe_translator/src/main/resources/templates/fc.st
+++ b/tools/caffe_translator/src/main/resources/templates/fc.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.symbol.FullyConnected(data=<data>, <if(weight)>weight=<weight>, <endif><if(bias)>bias=<bias>, <endif>num_hidden=<num>, <if(no_bias)>no_bias=True, <endif>name='<name>')
diff --git a/tools/caffe_translator/src/main/resources/templates/flatten.st b/tools/caffe_translator/src/main/resources/templates/flatten.st
index 8434335980..2ee6ffae7b 100644
--- a/tools/caffe_translator/src/main/resources/templates/flatten.st
+++ b/tools/caffe_translator/src/main/resources/templates/flatten.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.flatten(data=<data>, name='<name>')
diff --git a/tools/caffe_translator/src/main/resources/templates/group.st b/tools/caffe_translator/src/main/resources/templates/group.st
index 33e312f013..9cadf65669 100644
--- a/tools/caffe_translator/src/main/resources/templates/group.st
+++ b/tools/caffe_translator/src/main/resources/templates/group.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.Group([<symbols;separator=", ">]);
diff --git a/tools/caffe_translator/src/main/resources/templates/imports.st b/tools/caffe_translator/src/main/resources/templates/imports.st
index b37bd33823..da03a64ed7 100644
--- a/tools/caffe_translator/src/main/resources/templates/imports.st
+++ b/tools/caffe_translator/src/main/resources/templates/imports.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 from __future__ import division
 import copy
 import logging
diff --git a/tools/caffe_translator/src/main/resources/templates/init_params.st b/tools/caffe_translator/src/main/resources/templates/init_params.st
index 3a277b644c..7c8d7b0677 100644
--- a/tools/caffe_translator/src/main/resources/templates/init_params.st
+++ b/tools/caffe_translator/src/main/resources/templates/init_params.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <if(params_file)>
 arg_params, aux_params = load_params('<params_file>')
 module.init_params(initializer=mx.init.Xavier(), arg_params=arg_params, aux_params=aux_params,
diff --git a/tools/caffe_translator/src/main/resources/templates/iterator.st b/tools/caffe_translator/src/main/resources/templates/iterator.st
index 5bc2a9db32..d608979144 100644
--- a/tools/caffe_translator/src/main/resources/templates/iterator.st
+++ b/tools/caffe_translator/src/main/resources/templates/iterator.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <iter_name> = mx.io.CaffeDataIter(
     prototxt =
 <prototxt>,
diff --git a/tools/caffe_translator/src/main/resources/templates/logging.st b/tools/caffe_translator/src/main/resources/templates/logging.st
index 73785e55f2..cc94872726 100644
--- a/tools/caffe_translator/src/main/resources/templates/logging.st
+++ b/tools/caffe_translator/src/main/resources/templates/logging.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 def get_logger(name):
     formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                   datefmt='%Y-%m-%d %H:%M:%S')
diff --git a/tools/caffe_translator/src/main/resources/templates/lrn.st b/tools/caffe_translator/src/main/resources/templates/lrn.st
index ec003c1677..b67989884d 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrn.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrn.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.LRN(data=<data>, alpha=<alpha>, beta=<beta>, knorm=<knorm>, nsize=<nsize>, name=<name>)
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_exp.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_exp.st
index 43afca2e06..03daae3564 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_exp.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_exp.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 lr = optimizer_params['learning_rate']
 lr *= gamma
 optimizer_params['learning_rate'] = lr
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_inv.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_inv.st
index 5da8aa6996..e62c2d3a22 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_inv.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_inv.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 lr = optimizer_params['learning_rate']
 lr = base_lr * math.pow((1 + gamma * batch_num), -power)
 optimizer_params['learning_rate'] = lr
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_multistep.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_multistep.st
index fe09301fbb..07619087f4 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_multistep.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_multistep.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 lr_update_steps = [<steps;separator=", ">]
 if(batch_num in lr_update_steps):
     lr = optimizer_params['learning_rate']
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_poly.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_poly.st
index e43fd78b16..d62c64bf97 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_poly.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_poly.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 lr = optimizer_params['learning_rate']
 lr = math.pow(base_lr * (1 - batch_num/max_iter), power)
 optimizer_params['learning_rate'] = lr
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_sigmoid.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_sigmoid.st
index 33ba05529f..f44ab5a9b4 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_sigmoid.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_sigmoid.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 lr = optimizer_params['learning_rate']
 lr = base_lr * ( 1/(1 + math.exp(-gamma * (batch_num - stepsize))))
 optimizer_params['learning_rate'] = lr
diff --git a/tools/caffe_translator/src/main/resources/templates/lrpolicy_step.st b/tools/caffe_translator/src/main/resources/templates/lrpolicy_step.st
index 04468ae5b7..1f3d975d77 100644
--- a/tools/caffe_translator/src/main/resources/templates/lrpolicy_step.st
+++ b/tools/caffe_translator/src/main/resources/templates/lrpolicy_step.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 if(batch_num % stepsize == 0):
     lr = optimizer_params['learning_rate']
     lr *= gamma
diff --git a/tools/caffe_translator/src/main/resources/templates/maxium.st b/tools/caffe_translator/src/main/resources/templates/maxium.st
index d9431ddfba..9b18246c6b 100644
--- a/tools/caffe_translator/src/main/resources/templates/maxium.st
+++ b/tools/caffe_translator/src/main/resources/templates/maxium.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.maximum(<data1>, <data2>)
diff --git a/tools/caffe_translator/src/main/resources/templates/metrics_classes.st b/tools/caffe_translator/src/main/resources/templates/metrics_classes.st
index e8323fbebc..e586616c5f 100644
--- a/tools/caffe_translator/src/main/resources/templates/metrics_classes.st
+++ b/tools/caffe_translator/src/main/resources/templates/metrics_classes.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 class TrainMetrics():
 
     metric_map = {}
@@ -16,17 +34,16 @@ class TrainMetrics():
                 self.update_metrics(module, label, reset=True)
                 self.print_metrics(batch_num)
         else:
-            # If I'll have to print metrics 'average_loss' iterations from now,
-            # append a metric so I can start updating that.
+            # Metrics must be printed 'average_loss' iterations from now.
+            # Append a metric which will get updated starting now.
             if((batch_num + self.average_loss) % self.display == 0):
                 self.append_one()
 
-            # If I'm less than 'average_loss' iteration away from a display step,
-            # update the metrics.
+            # Less than 'average_loss' iterations away from a display step. Update metrics.
             if((batch_num + self.average_loss) % self.display \< self.average_loss):
                 self.update_metrics(module, label)
 
-            # If I'm at a display step, print the metrics.
+            # At display step. Print metrics.
             if(batch_num % self.display == 0):
                 self.print_metrics(batch_num, remove_heads=True)
 
diff --git a/tools/caffe_translator/src/main/resources/templates/mul.st b/tools/caffe_translator/src/main/resources/templates/mul.st
index 411a407387..59c4837c83 100644
--- a/tools/caffe_translator/src/main/resources/templates/mul.st
+++ b/tools/caffe_translator/src/main/resources/templates/mul.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = <data1> * (<data2>)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_adadelta.st b/tools/caffe_translator/src/main/resources/templates/opt_adadelta.st
new file mode 100644
index 0000000000..cfd465b5f4
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_adadelta.st
@@ -0,0 +1,32 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.momentum)>
+rho = <solver.momentum>
+<endif>
+<if(solver.delta)>
+epsilon = <solver.delta>
+<endif>
+
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.momentum)>, 'rho':rho<endif><\\>
+<if(solver.delta)>, 'epsilon':epsilon<endif>}<\\>
+
+module.init_optimizer(optimizer='AdaDelta', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_adagrad.st b/tools/caffe_translator/src/main/resources/templates/opt_adagrad.st
new file mode 100644
index 0000000000..527cedf6f8
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_adagrad.st
@@ -0,0 +1,28 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.delta)>
+epsilon = <solver.delta>
+<endif>
+
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.delta)>, 'epsilon':epsilon<endif>}<\\>
+
+module.init_optimizer(optimizer='AdaGrad', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_adam.st b/tools/caffe_translator/src/main/resources/templates/opt_adam.st
new file mode 100644
index 0000000000..b0a8ca3687
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_adam.st
@@ -0,0 +1,36 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.momentum)>
+beta1 = <solver.momentum>
+<endif>
+<if(solver.momentum2)>
+beta2 = <solver.momentum2>
+<endif>
+<if(solver.delta)>
+epsilon = <solver.delta>
+<endif>
+
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.momentum)>, 'beta1':beta1<endif><\\>
+<if(solver.momentum2)>, 'beta2':beta2<endif><\\>
+<if(solver.delta)>, 'epsilon':epsilon<endif>}<\\>
+
+module.init_optimizer(optimizer='Adam', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_default.st b/tools/caffe_translator/src/main/resources/templates/opt_default.st
deleted file mode 100644
index e5a72ac85f..0000000000
--- a/tools/caffe_translator/src/main/resources/templates/opt_default.st
+++ /dev/null
@@ -1,15 +0,0 @@
-<if(lr)>
-base_lr = <lr>
-<endif>
-<if(momentum)>
-momentum = <momentum>
-<endif>
-<if(wd)>
-wd = <wd>
-<endif>
-<if(epsilon)>
-epsilon = <epsilon>
-<endif>
-
-optimizer_params={'learning_rate':base_lr <if(momentum)>, 'momentum':momentum<endif><if(wd)>, 'wd':wd<endif><if(epsilon)>, 'epsilon':epsilon<endif>}
-module.init_optimizer(optimizer='<opt_name>', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_nesterov.st b/tools/caffe_translator/src/main/resources/templates/opt_nesterov.st
new file mode 100644
index 0000000000..6262d48675
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_nesterov.st
@@ -0,0 +1,28 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.momentum)>
+momentum = <solver.momentum>
+<endif>
+
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.momentum)>, 'momentum':momentum<endif>}<\\>
+
+module.init_optimizer(optimizer='NAG', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st
new file mode 100644
index 0000000000..6baec42951
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st
@@ -0,0 +1,32 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.rms_decay)>
+gamma1 = <solver.rms_decay>
+<endif>
+<if(solver.delta)>
+epsilon = <solver.delta>
+<endif>
+
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.rms_decay)>, 'gamma1':gamma1<endif><\\>
+<if(solver.delta)>, 'epsilon':epsilon<endif>}<\\>
+
+module.init_optimizer(optimizer='RMSProp', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_sgd.st b/tools/caffe_translator/src/main/resources/templates/opt_sgd.st
index 8a24e05f8e..aa547a6141 100644
--- a/tools/caffe_translator/src/main/resources/templates/opt_sgd.st
+++ b/tools/caffe_translator/src/main/resources/templates/opt_sgd.st
@@ -1,12 +1,28 @@
-<if(lr)>
-base_lr = <lr>
-<endif>
-<if(momentum)>
-momentum = <momentum>
-<endif>
-<if(wd)>
-wd = <wd>
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<opt_vars(solver)>
+<if(solver.momentum)>
+momentum = <solver.momentum>
 <endif>
 
-optimizer_params={'learning_rate':base_lr <if(momentum)>, 'momentum':momentum<endif><if(wd)>, 'wd':wd<endif>}
-module.init_optimizer(optimizer='<opt_name>', optimizer_params=optimizer_params)
+optimizer_params={'learning_rate':base_lr<\\>
+<if(solver.wd)>, 'wd':wd<endif><\\>
+<if(solver.momentum)>, 'momentum':momentum<endif>}<\\>
+
+module.init_optimizer(optimizer='SGD', optimizer_params=optimizer_params)
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_vars.st b/tools/caffe_translator/src/main/resources/templates/opt_vars.st
new file mode 100644
index 0000000000..19b2f4cc6c
--- /dev/null
+++ b/tools/caffe_translator/src/main/resources/templates/opt_vars.st
@@ -0,0 +1,24 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
+<if(solver.base_lr)>
+base_lr = <solver.base_lr>
+<endif>
+<if(solver.wd)>
+wd = <solver.wd>
+<endif>
\ No newline at end of file
diff --git a/tools/caffe_translator/src/main/resources/templates/param_initializer.st b/tools/caffe_translator/src/main/resources/templates/param_initializer.st
index b496fc34cc..abad5daeb1 100644
--- a/tools/caffe_translator/src/main/resources/templates/param_initializer.st
+++ b/tools/caffe_translator/src/main/resources/templates/param_initializer.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 class ParamInitializer():
     lst_patterns = []
     lst_initializers = []
diff --git a/tools/caffe_translator/src/main/resources/templates/params_loader.st b/tools/caffe_translator/src/main/resources/templates/params_loader.st
index 22efec4e68..c124c986d6 100644
--- a/tools/caffe_translator/src/main/resources/templates/params_loader.st
+++ b/tools/caffe_translator/src/main/resources/templates/params_loader.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 def load_params(params_file):
     save_dict = mx.nd.load(params_file)
     arg_params = {}
diff --git a/tools/caffe_translator/src/main/resources/templates/permute.st b/tools/caffe_translator/src/main/resources/templates/permute.st
index 2b06a76aa1..9f94bdbf6c 100644
--- a/tools/caffe_translator/src/main/resources/templates/permute.st
+++ b/tools/caffe_translator/src/main/resources/templates/permute.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.transpose(data=<data>, axes=(<axes;separator=", ">), name='<name>')
diff --git a/tools/caffe_translator/src/main/resources/templates/pooling.st b/tools/caffe_translator/src/main/resources/templates/pooling.st
index 5389754f00..7aceffdf0e 100644
--- a/tools/caffe_translator/src/main/resources/templates/pooling.st
+++ b/tools/caffe_translator/src/main/resources/templates/pooling.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.symbol.Pooling(data=<data>,
     pool_type='<type>',
 <if(global_pool)>
diff --git a/tools/caffe_translator/src/main/resources/templates/power.st b/tools/caffe_translator/src/main/resources/templates/power.st
index a512a67109..7fe3ee8eff 100644
--- a/tools/caffe_translator/src/main/resources/templates/power.st
+++ b/tools/caffe_translator/src/main/resources/templates/power.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = (<shift> + (<scale> * <data>)) ** <power>
diff --git a/tools/caffe_translator/src/main/resources/templates/runner.st b/tools/caffe_translator/src/main/resources/templates/runner.st
index 6df9671d94..8346ffe22b 100644
--- a/tools/caffe_translator/src/main/resources/templates/runner.st
+++ b/tools/caffe_translator/src/main/resources/templates/runner.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 ctx = <ctx>
 
 module = mx.mod.Module(symbol=<loss>, context=ctx, data_names=[<data_names;separator=", ">], label_names=[<label_names;separator=", ">])
diff --git a/tools/caffe_translator/src/main/resources/templates/softmaxoutput.st b/tools/caffe_translator/src/main/resources/templates/softmaxoutput.st
index bc63891d1f..57a8e71939 100644
--- a/tools/caffe_translator/src/main/resources/templates/softmaxoutput.st
+++ b/tools/caffe_translator/src/main/resources/templates/softmaxoutput.st
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.SoftmaxOutput(data=<data>, label=<label>, name='<name>')
 <var>_metric = mx.metric.CrossEntropy(output_names=['<name>_output'], label_names=['<label_name>'], name='<name>/metric')
 train_metrics.add(<var>_metric)
diff --git a/tools/caffe_translator/src/main/resources/templates/symbols.stg b/tools/caffe_translator/src/main/resources/templates/symbols.stg
index fda912596f..2a76eb089c 100644
--- a/tools/caffe_translator/src/main/resources/templates/symbols.stg
+++ b/tools/caffe_translator/src/main/resources/templates/symbols.stg
@@ -1,3 +1,21 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 CaffePluginIntLayer(var, tops, num_data, num_weight, num_out, data, prototxt, name) ::= "<var> = mx.symbol.CaffeOp(<if(data)><data>, <endif><if(num_data)>num_data=<num_data>, <endif><if(num_out)>num_out=<num_out>, <endif><if(num_weight)>num_weight=<num_weight>, <endif>prototxt='<prototxt>', name='<name>')
 <if(tops)><tops:{top|<top_assign(top, var, i0)>};separator=\"\n\"> <endif>"
 
diff --git a/tools/caffe_translator/src/main/resources/templates/top_k_accuracy.st b/tools/caffe_translator/src/main/resources/templates/top_k_accuracy.st
index de93ee98bf..29a713fc9a 100644
--- a/tools/caffe_translator/src/main/resources/templates/top_k_accuracy.st
+++ b/tools/caffe_translator/src/main/resources/templates/top_k_accuracy.st
@@ -1,2 +1,20 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.metric.TopKAccuracy(top_k=<k>, output_names=['<output_name>'], label_names=['<label_name>'], name='<name>')
 test_metrics.add(<var>)
diff --git a/tools/caffe_translator/src/main/resources/templates/var.st b/tools/caffe_translator/src/main/resources/templates/var.st
index e850b68929..fa08cd7350 100644
--- a/tools/caffe_translator/src/main/resources/templates/var.st
+++ b/tools/caffe_translator/src/main/resources/templates/var.st
@@ -1 +1,19 @@
+<!
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+!>
 <var> = mx.sym.Variable('<name>'<if(lr_mult)>, lr_mult=<lr_mult><endif><if(wd_mult)>, wd_mult=<wd_mult><endif><if(init)>, init=<init><endif><if(shape)>, shape=(<shape;separator=", ">)<endif>)
diff --git a/tools/coreml/converter/_layers.py b/tools/coreml/converter/_layers.py
index 4c5ebc6fb0..d0113a9155 100644
--- a/tools/coreml/converter/_layers.py
+++ b/tools/coreml/converter/_layers.py
@@ -262,10 +262,15 @@ def convert_dense(net, node, module, builder):
         A neural network builder object.
     """
     input_name, output_name = _get_input_output_name(net, node)
-    has_bias = True
     name = node['name']
 
     inputs = node['inputs']
+    param = node['attrs']
+    if 'no_bias' in param.keys():
+        has_bias = not literal_eval(param['no_bias'])
+    else:
+        has_bias = True
+
     args, _ = module.get_params()
     W = args[_get_node_name(net, inputs[1][0])].asnumpy()
     if has_bias:
diff --git a/tools/coreml/test/test_mxnet_converter.py b/tools/coreml/test/test_mxnet_converter.py
index bf1ace6b8f..6020041ac5 100644
--- a/tools/coreml/test/test_mxnet_converter.py
+++ b/tools/coreml/test/test_mxnet_converter.py
@@ -149,6 +149,13 @@ def test_tiny_inner_product_random_input(self):
         net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5)
         self._test_mxnet_model(net, input_shape=input_shape, mode='random')
 
+    def test_tiny_inner_product_no_bias(self):
+        np.random.seed(1988)
+        input_shape = (1, 10)
+        net = mx.sym.Variable('data')
+        net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=5, no_bias=True)
+        self._test_mxnet_model(net, input_shape=input_shape, mode='random')
+
     def test_tiny_softmax_random_input(self):
         np.random.seed(1988)
         input_shape = (1, 10)
diff --git a/tools/im2rec.py b/tools/im2rec.py
index ec6de19694..f94c5c0b0a 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -28,7 +28,6 @@
 import cv2
 import time
 import traceback
-from builtins import range
 
 try:
     import multiprocessing
@@ -216,7 +215,7 @@ def parse_args():
                         help='If this is set im2rec will create image list(s) by traversing root folder\
         and output to <prefix>.lst.\
         Otherwise im2rec will read <prefix>.lst and create a database at <prefix>.rec')
-    cgroup.add_argument('--exts', nargs='+', default=['.jpeg', '.jpg'],
+    cgroup.add_argument('--exts', nargs='+', default=['.jpeg', '.jpg', '.png'],
                         help='list of acceptable image extensions.')
     cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.')
     cgroup.add_argument('--train-ratio', type=float, default=1.0,


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services