Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/09/10 08:36:07 UTC

[GitHub] lebeg closed pull request #12495: Fixed tvm to v0.4 release

URL: https://github.com/apache/incubator-mxnet/pull/12495

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 00000000000..993656e1276
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# The checks defined here will be run and will be displayed as warnings by default.
+Checks: >
+    -*, cppcoreguidelines-c-copy-assignment-signature,
+    cppcoreguidelines-interfaces-global-init, cppcoreguidelines-no-malloc,
+    cppcoreguidelines-pro-bounds-constant-array-index, cppcoreguidelines-pro-type-const-cast,
+    cppcoreguidelines-pro-type-cstyle-cast, cppcoreguidelines-pro-type-member-init,
+    cppcoreguidelines-pro-type-static-cast-downcast, cppcoreguidelines-pro-type-union-access,
+    cppcoreguidelines-pro-type-vararg, cppcoreguidelines-slicing,
+    cppcoreguidelines-special-member-functions, clang-analyzer-security.FloatLoopCounter,
+    clang-analyzer-security.insecureAPI.*, clang-analyzer-core.CallAndMessage,
+    clang-analyzer-core.DivideZero, clang-analyzer-core.DynamicTypePropagation,
+    clang-analyzer-core.NonNullParamChecker, clang-analyzer-core.NullDereference,
+    clang-analyzer-core.StackAddressEscape, clang-analyzer-core.UndefinedBinaryOperatorResult,
+    clang-analyzer-core.VLASize, clang-analyzer-core.builtin.BuiltinFunctions,
+    clang-analyzer-core.builtin.NoReturnFunctions, clang-analyzer-core.uninitialized.ArraySubscript,
+    clang-analyzer-core.uninitialized.Assign, clang-analyzer-core.uninitialized.Branch,
+    clang-analyzer-core.uninitialized.CapturedBlockVariable,
+    clang-analyzer-core.uninitialized.UndefReturn, clang-analyzer-cplusplus.NewDelete,
+    clang-analyzer-cplusplus.NewDeleteLeaks, clang-analyzer-cplusplus.SelfAssignment,
+    clang-analyzer-deadcode.DeadStores, modernize-avoid-bind, modernize-deprecated-headers,
+    modernize-loop-convert, modernize-make-shared, modernize-pass-by-value,
+    modernize-raw-string-literal, modernize-redundant-void-arg, modernize-replace-auto-ptr,
+    modernize-replace-random-shuffle, modernize-return-braced-init-list, modernize-shrink-to-fit,
+    modernize-unary-static-assert, modernize-use-bool-literals, modernize-use-default-member-init,
+    modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete,
+    modernize-use-noexcept, modernize-use-nullptr, modernize-use-override,
+    modernize-use-transparent-functors, modernize-use-using, performance-*
+
+# cppcoreguidelines checks not enabled:
+# cppcoreguidelines-pro-bounds-pointer-arithmetic
+# cppcoreguidelines-pro-bounds-array-to-pointer-decay
+# cppcoreguidelines-pro-type-reinterpret-cast
+
+# modernize checks not enabled:
+# modernize-use-auto
+# modernize-make-unique (C++14 and newer only)
+
+# To escalate a check from a warning to an error, it must be listed both in Checks above and in this section.
+WarningsAsErrors: >
+    cppcoreguidelines-no-malloc
+
+# TODO: define a better regex that matches most project headers but excludes third-party
+# code.
+HeaderFilterRegex: '^src/.*'
diff --git a/.travis.yml b/.travis.yml
index ca5d03b5008..f61bd86673d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,108 +1,31 @@
-sudo: false
+sudo: true
 
 language: cpp
 
+cache: ccache
+
 os:
-  # - linux
   - osx
 
-osx_image: xcode8
-
-env:
-  # code analysis
-  # - TASK=lint
-  # build mxnet.so with CUDA
-  # - TASK=build
-  # run tests/cpp
-  - TASK=cpp_test
-  # run tests/python
-  - TASK=python_test
-  - TASK=r_test
-  # - TASK=julia JULIA_VER=0.4
-  # - TASK=scala_test
-
-  # TODO, R test, distributed test, clang, more g++ versions
+osx_image: xcode9.4
 
 matrix:
   include:
-    - # os: linux
-      # dist: trusty
-      # env: TASK=perl_test
     - os: osx
-      ## sudo is required because
-      ## prexexisting packages conflict
-      ## with new ones.
-      ## would be nice to have macports
-      ## on travis osx, it has all needed perl packages
-      sudo: required
-      env: TASK=perl_test
-#       env: TASK=julia JULIA_VER=0.4
-#     - os: linux
-#       env: TASK=build
-#     - os: linux
-#       env: TASK=cpp_test
-#     - os: linux
-#       env: TASK=python_test
-#     - os: linux
-#       env: TASK=r_test
-#     - os: linux
-#       env: TASK=scala_test
-
-# dependent apt packages
-addons:
-  apt:
-    sources:
-      - ubuntu-toolchain-r-test
-    packages:
-      - doxygen
-      - wget
-      - git
-      - libcurl4-openssl-dev
-      - unzip
-      - libatlas-dev
-      - libopencv-dev
-      - gcc-4.8
-      - g++-4.8
-      - python-numpy
-      - python-nose
-      - python3-numpy
-      - python3-dev
-      - python3-nose
-      - python-h5py
-      - python3-h5py
-      - graphviz
-      - libmouse-perl
-      - pdl
-      - cpanminus
-      - swig
-      - libgraphviz-perl
 
 before_install:
-  - export NVCC_PREFIX=${HOME}
-  - source dmlc-core/scripts/travis/travis_setup_env.sh
   - export PYTHONPATH=${PYTHONPATH}:${PWD}/python
-  - export MAVEN_SKIP_RC=true
-  - export MAVEN_OPTS="-Xmx512m -XX:MaxPermSize=256m -XX:-UseGCOverheadLimit -XX:+CMSClassUnloadingEnabled -XX:+UseConcMarkSweepGC"
 
 install:
-  - source tests/travis/setup.sh
-
+  - brew install ccache
+  - export PATH="/usr/local/opt/ccache/libexec:$PATH"
+  - source ci/travis/install.sh
+
+# We build with 2 concurrent jobs to match the number of cores present on macOS virtual machines.
+# nproc does not reliably report the number of cores on Travis, so using nproc is not
+# recommended.
+# https://docs.travis-ci.com/user/reference/overview/
 script:
-  - tests/travis/run_test.sh
-
-cache:
-  directories:
-    - ${HOME}/.cache/usr
-
-before_cache:
-  - dmlc-core/scripts/travis/travis_before_cache.sh
-
-after_failure:
-  - tests/travis/travis_after_failure.sh
-
-notifications:
-# Emails are sent to the committer's git-configured email address by default,
-  email:
-    on_success: change
-    on_failure: always
-  #slack: dmlc:NmroCzntCiWOuxUZpii40USd
+  - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+  - mv make/osx.mk config.mk
+  - make -j 2
diff --git a/3rdparty/tvm b/3rdparty/tvm
index 290226e1c9a..60769b77f9a 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 290226e1c9adbb3e598f9ed9184018df1c12be33
+Subproject commit 60769b77f9abe29aafabda4d5d1cd625e7c61f9f
diff --git a/Jenkinsfile b/Jenkinsfile
index 50b86ec7190..346cb19ce46 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -173,12 +173,22 @@ core_logic: {
         }
       }
     },
-    'CPU: Clang 5': {
+    'CPU: Clang 6': {
       node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-clang50') {
+        ws('workspace/build-cpu-clang60') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50', false)
+            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60', false)
+          }
+        }
+      }
+    },
+    'CPU: Clang Tidy': {
+      node(NODE_LINUX_CPU) {
+        ws('workspace/build-cpu-clang60_tidy') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            utils.init_git()
+            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang_tidy', false)
           }
         }
       }
@@ -194,13 +204,13 @@ core_logic: {
         }
       }
     },
-    'CPU: Clang 5 MKLDNN': {
+    'CPU: Clang 6 MKLDNN': {
       node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-mkldnn-clang50') {
+        ws('workspace/build-cpu-mkldnn-clang60') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50_mkldnn', false)
-            utils.pack_lib('mkldnn_cpu_clang5', mx_mkldnn_lib)
+            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60_mkldnn', false)
+            utils.pack_lib('mkldnn_cpu_clang6', mx_mkldnn_lib)
           }
         }
       }
@@ -363,16 +373,16 @@ core_logic: {
         }
       }
     },
-    // 'ARMv7':{
-    //   node(NODE_LINUX_CPU) {
-    //     ws('workspace/build-ARMv7') {
-    //       timeout(time: max_time, unit: 'MINUTES') {
-    //         utils.init_git()
-    //         utils.docker_run('armv7', 'build_armv7', false)
-    //       }
-    //     }
-    //   }
-    // },
+    'ARMv7':{
+      node(NODE_LINUX_CPU) {
+        ws('workspace/build-ARMv7') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            utils.init_git()
+            utils.docker_run('armv7', 'build_armv7', false)
+          }
+        }
+      }
+    },
     'ARMv6':{
       node(NODE_LINUX_CPU) {
         ws('workspace/build-ARMv6') {
@@ -890,6 +900,10 @@ core_logic: {
         }
       }
     },
+    /*  Disabled due to master build failure:
+     *  http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1221/pipeline/
+     *  https://github.com/apache/incubator-mxnet/issues/11801
+
     'dist-kvstore tests CPU': {
       node(NODE_LINUX_CPU) {
         ws('workspace/it-dist-kvstore') {
@@ -901,7 +915,7 @@ core_logic: {
           }
         }
       }
-    },
+    }, */
     'Scala: GPU': {
       node(NODE_LINUX_GPU) {
         ws('workspace/ut-scala-gpu') {
diff --git a/R-package/R/initializer.R b/R-package/R/initializer.R
index 40712432d8b..bb81a285bea 100644
--- a/R-package/R/initializer.R
+++ b/R-package/R/initializer.R
@@ -3,7 +3,7 @@
 #' @param name the name of the variable.
 #' @param shape the shape of the array to be generated.
 #'
-mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
+mx.init.internal.default <- function(name, shape, ctx, allow.unknown = FALSE) {
   if (endsWith(name, "bias")) return (mx.nd.zeros(shape))
   if (endsWith(name, "gamma")) return (mx.nd.ones(shape))
   if (endsWith(name, "beta")) return (mx.nd.zeros(shape))
@@ -19,7 +19,7 @@ mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
 #'
 #' @export
 mx.init.uniform <- function(scale) {
-  function(name, shape, ctx, allow.unknown=FALSE) {
+  function(name, shape, ctx, allow.unknown = FALSE) {
     if (!endsWith(name, "weight")) {
       return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
@@ -33,7 +33,7 @@ mx.init.uniform <- function(scale) {
 #'
 #' @export
 mx.init.normal <- function(sd) {
-  function(name, shape, ctx, allow.unknown=FALSE) {
+  function(name, shape, ctx, allow.unknown = FALSE) {
     if (!endsWith(name, "weight")) {
       return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
@@ -59,15 +59,15 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
       return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
     
-    fan_out = shape[length(shape)]
-    fan_in  = prod(shape[-length(shape)])
+    fan_out <- shape[length(shape)]
+    fan_in <- prod(shape[-length(shape)])
     factor_val <- switch(factor_type,
                          "avg" = (fan_in + fan_out) / 2,
                          "in" = fan_in,
                          "out" = fan_out,
                          stop("Not supported factor type. See usage of function mx.init.Xavier"))
 
-    scale = sqrt(magnitude / factor_val)
+    scale <- sqrt(magnitude / factor_val)
     
     if (rnd_type == "uniform"){
       return(mx.nd.random.uniform(low = -scale, high = scale, shape = shape))
@@ -83,14 +83,16 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
 #' Create initialization of arguments like arg.array
 #'
 #' @param initializer The initializer.
-#' @param shape.array named-list The shape of the weights
+#' @param shape.array A named list that represents the shape of the weights
 #' @param ctx mx.context The context of the weights
 #' @param skip.unknown Whether to skip unknown weight types
 #' @export
-mx.init.create <- function(initializer, shape.array, ctx=NULL, skip.unknown=TRUE) {
+mx.init.create <- function(initializer, shape.array, ctx = NULL, skip.unknown = TRUE) {
   if (length(shape.array) == 0) return(list())
-  names = names(shape.array)
-  ret <- lapply(seq_along(names), function(i) initializer(names[[i]], shape.array[[i]], ctx, allow.unknown=skip.unknown))
+  names <- names(shape.array)
+  ret <- lapply(
+    seq_along(names),
+    function(i) initializer(names[[i]], shape.array[[i]], ctx, allow.unknown = skip.unknown))
   names(ret) <- names
   if (skip.unknown) {
     ret <- mx.util.filter.null(ret)
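
For orientation, a minimal usage sketch of the initializer API touched above (a sketch only, assuming the mxnet R package is loaded; the layer names and shapes here are made up for illustration):

    # sketch: mx.init.uniform returns a closure(name, shape, ctx, allow.unknown)
    init <- mx.init.uniform(scale = 0.07)
    w <- init("fc1_weight", shape = c(128, 64), ctx = mx.ctx.default())
    # mx.init.create maps the closure over a named list of shapes;
    # names ending in "bias" fall through to the internal default (zeros)
    params <- mx.init.create(init, list(fc1_weight = c(128, 64), fc1_bias = c(64)),
      ctx = mx.ctx.default(), skip.unknown = TRUE)
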
diff --git a/R-package/tests/testthat/get_data.R b/R-package/tests/testthat/get_data.R
index 2676b20fa80..0e27894498b 100644
--- a/R-package/tests/testthat/get_data.R
+++ b/R-package/tests/testthat/get_data.R
@@ -3,13 +3,11 @@ GetMNIST_ubyte <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/train-images-idx3-ubyte') |
-      !file.exists('data/train-labels-idx1-ubyte') |
-      !file.exists('data/t10k-images-idx3-ubyte') |
-      !file.exists('data/t10k-labels-idx1-ubyte')) {
-    download.file('http://data.mxnet.io/mxnet/data/mnist.zip', destfile = 'data/mnist.zip')
-    unzip('data/mnist.zip', exdir = 'data/')
-    file.remove('data/mnist.zip')
+  if (!file.exists("data/train-images-idx3-ubyte") | !file.exists("data/train-labels-idx1-ubyte") | 
+    !file.exists("data/t10k-images-idx3-ubyte") | !file.exists("data/t10k-labels-idx1-ubyte")) {
+    download.file("http://data.mxnet.io/mxnet/data/mnist.zip", destfile = "data/mnist.zip")
+    unzip("data/mnist.zip", exdir = "data/")
+    file.remove("data/mnist.zip")
   }
 }
 
@@ -17,12 +15,11 @@ GetMNIST_csv <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/train.csv') |
-      !file.exists('data/test.csv')) {
-    download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip',
-                  destfile = 'data/mnist_csv.zip')
-    unzip('data/mnist_csv.zip', exdir = 'data/')
-    file.remove('data/mnist_csv.zip')
+  if (!file.exists("data/train.csv") | !file.exists("data/test.csv")) {
+    download.file("https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip", 
+      destfile = "data/mnist_csv.zip")
+    unzip("data/mnist_csv.zip", exdir = "data/")
+    file.remove("data/mnist_csv.zip")
   }
 }
 
@@ -30,14 +27,11 @@ GetCifar10 <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/cifar/train.rec') |
-      !file.exists('data/cifar/test.rec') |
-      !file.exists('data/cifar/train.lst') |
-      !file.exists('data/cifar/test.lst')) {
-    download.file('http://data.mxnet.io/mxnet/data/cifar10.zip',
-                  destfile = 'data/cifar10.zip')
-    unzip('data/cifar10.zip', exdir = 'data/')
-    file.remove('data/cifar10.zip')
+  if (!file.exists("data/cifar/train.rec") | !file.exists("data/cifar/test.rec") | 
+    !file.exists("data/cifar/train.lst") | !file.exists("data/cifar/test.lst")) {
+    download.file("http://data.mxnet.io/mxnet/data/cifar10.zip", destfile = "data/cifar10.zip")
+    unzip("data/cifar10.zip", exdir = "data/")
+    file.remove("data/cifar10.zip")
   }
 }
 
@@ -45,13 +39,13 @@ GetInception <- function() {
   if (!dir.exists("model")) {
     dir.create("model/")
   }
-  if (!file.exists('model/Inception-BN-0126.params')) {
-    download.file('http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-0126.params',
-                  destfile = 'model/Inception-BN-0126.params')
+  if (!file.exists("model/Inception-BN-0126.params")) {
+    download.file("http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-0126.params", 
+      destfile = "model/Inception-BN-0126.params")
   }
-  if (!file.exists('model/Inception-BN-symbol.json')) {
-    download.file('http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-symbol.json',
-                  destfile = 'model/Inception-BN-symbol.json')
+  if (!file.exists("model/Inception-BN-symbol.json")) {
+    download.file("http://data.dmlc.ml/models/imagenet/inception-bn/Inception-BN-symbol.json", 
+      destfile = "model/Inception-BN-symbol.json")
   }
 }
 
@@ -59,12 +53,11 @@ GetCatDog <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/cats_dogs/cats_dogs_train.rec') |
-      !file.exists('data/cats_dogs/cats_dogs_val.rec')) {
-    download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/cats_dogs.zip',
-                  destfile = 'data/cats_dogs.zip')
-    unzip('data/cats_dogs.zip', exdir = 'data/')
-    file.remove('data/cats_dogs.zip')
+  if (!file.exists("data/cats_dogs/cats_dogs_train.rec") | !file.exists("data/cats_dogs/cats_dogs_val.rec")) {
+    download.file("https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/cats_dogs.zip", 
+      destfile = "data/cats_dogs.zip")
+    unzip("data/cats_dogs.zip", exdir = "data/")
+    file.remove("data/cats_dogs.zip")
   }
 }
 
@@ -72,11 +65,11 @@ GetMovieLens <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/ml-100k/u.data')) {
-    download.file('http://files.grouplens.org/datasets/movielens/ml-100k.zip',
-                  destfile = 'data/ml-100k.zip')
-    unzip('data/ml-100k.zip', exdir = 'data/')
-    file.remove('data/ml-100k.zip')
+  if (!file.exists("data/ml-100k/u.data")) {
+    download.file("http://files.grouplens.org/datasets/movielens/ml-100k.zip", 
+      destfile = "data/ml-100k.zip")
+    unzip("data/ml-100k.zip", exdir = "data/")
+    file.remove("data/ml-100k.zip")
   }
 }
 
@@ -84,12 +77,11 @@ GetISBI_data <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/ISBI/train-volume.tif') |
-      !file.exists('data/ISBI/train-labels.tif')) {
-    download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/ISBI.zip',
-                  destfile = 'data/ISBI.zip')
-    unzip('data/ISBI.zip', exdir = 'data/')
-    file.remove('data/ISBI.zip')
+  if (!file.exists("data/ISBI/train-volume.tif") | !file.exists("data/ISBI/train-labels.tif")) {
+    download.file("https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/ISBI.zip", 
+      destfile = "data/ISBI.zip")
+    unzip("data/ISBI.zip", exdir = "data/")
+    file.remove("data/ISBI.zip")
   }
 }
 
@@ -97,11 +89,10 @@ GetCaptcha_data <- function() {
   if (!dir.exists("data")) {
     dir.create("data/")
   }
-  if (!file.exists('data/captcha_example/captcha_train.rec') |
-      !file.exists('data/captcha_example/captcha_test.rec')) {
-    download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip',
-                  destfile = 'data/captcha_example.zip')
-    unzip('data/captcha_example.zip', exdir = 'data/')
-    file.remove('data/captcha_example.zip')
+  if (!file.exists("data/captcha_example/captcha_train.rec") | !file.exists("data/captcha_example/captcha_test.rec")) {
+    download.file("https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip", 
+      destfile = "data/captcha_example.zip")
+    unzip("data/captcha_example.zip", exdir = "data/")
+    file.remove("data/captcha_example.zip")
   }
 }
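
Every helper in this file repeats the same guard, so the reformatting above is mechanical. A condensed sketch of the shared download-if-missing idiom (hypothetical helper, not part of this diff):

    # sketch: fetch and unpack an archive once, keyed on the extracted files
    get_if_missing <- function(url, files, zipfile) {
      if (!dir.exists("data")) dir.create("data/")
      if (!all(file.exists(files))) {
        download.file(url, destfile = zipfile)
        unzip(zipfile, exdir = "data/")
        file.remove(zipfile)
      }
    }
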
diff --git a/R-package/tests/testthat/test_img_seg.R b/R-package/tests/testthat/test_img_seg.R
index b3400cd3bbc..9b63f5078fa 100644
--- a/R-package/tests/testthat/test_img_seg.R
+++ b/R-package/tests/testthat/test_img_seg.R
@@ -2,7 +2,8 @@ require(mxnet)
 
 source("get_data.R")
 
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 1) {
+if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
+  1) {
   mx.ctx.default(new = mx.gpu())
   message("Using GPU for testing.")
 }
@@ -12,76 +13,89 @@ print_inferred_shape <- function(net) {
   print(slist$out.shapes)
 }
 
-convolution_module <- function(net, kernel_size, pad_size, filter_count,
-                               stride = c(1, 1), work_space = 2048, batch_norm = TRUE,
-                               down_pool = FALSE, up_pool = FALSE, act_type = "relu",
-                               convolution = TRUE) {
+convolution_module <- function(net, kernel_size, pad_size, filter_count, stride = c(1, 
+  1), work_space = 2048, batch_norm = TRUE, down_pool = FALSE, up_pool = FALSE, 
+  act_type = "relu", convolution = TRUE) {
   if (up_pool) {
-    net = mx.symbol.Deconvolution(net, kernel = c(2, 2), pad = c(0, 0),
-                                  stride = c(2, 2), num_filter = filter_count,
-                                  workspace = work_space)
-    net = mx.symbol.BatchNorm(net)
+    net <- mx.symbol.Deconvolution(net, kernel = c(2, 2), pad = c(0, 0), stride = c(2, 
+      2), num_filter = filter_count, workspace = work_space)
+    net <- mx.symbol.BatchNorm(net)
     if (act_type != "") {
-      net = mx.symbol.Activation(net, act_type = act_type)
+      net <- mx.symbol.Activation(net, act_type = act_type)
     }
   }
   if (convolution) {
-    conv = mx.symbol.Convolution(data = net, kernel = kernel_size, stride = stride,
-                                 pad = pad_size, num_filter = filter_count,
-                                 workspace = work_space)
-    net = conv
+    conv <- mx.symbol.Convolution(data = net, kernel = kernel_size, stride = stride, 
+      pad = pad_size, num_filter = filter_count, workspace = work_space)
+    net <- conv
   }
   if (batch_norm) {
-    net = mx.symbol.BatchNorm(net)
+    net <- mx.symbol.BatchNorm(net)
   }
   
   if (act_type != "") {
-    net = mx.symbol.Activation(net, act_type = act_type)
+    net <- mx.symbol.Activation(net, act_type = act_type)
   }
   
   if (down_pool) {
-    pool = mx.symbol.Pooling(net, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
-    net = pool
+    pool <- mx.symbol.Pooling(net, pool_type = "max", kernel = c(2, 2), stride = c(2, 
+      2))
+    net <- pool
   }
   print_inferred_shape(net)
   return(net)
 }
 
 get_unet <- function() {
-  data = mx.symbol.Variable('data')
-  kernel_size = c(3, 3)
-  pad_size = c(1, 1)
-  filter_count = 32
-  pool1 = convolution_module(data, kernel_size, pad_size, filter_count = filter_count, down_pool = TRUE)
-  net = pool1
-  pool2 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2, down_pool = TRUE)
-  net = pool2
-  pool3 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, down_pool = TRUE)
-  net = pool3
-  pool4 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, down_pool = TRUE)
-  net = pool4
-  net = mx.symbol.Dropout(net)
-  pool5 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 8, down_pool = TRUE)
-  net = pool5
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE)
-  net = convolution_module(net, kernel_size, pad_size = c(2, 2), filter_count = filter_count * 4, up_pool = TRUE)
-  net = mx.symbol.Crop(net, pool3, num.args = 2)
-  net = mx.symbol.concat(c(pool3, net), num.args = 2)
-  net = mx.symbol.Dropout(net)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE)
+  data <- mx.symbol.Variable("data")
+  kernel_size <- c(3, 3)
+  pad_size <- c(1, 1)
+  filter_count <- 32
+  pool1 <- convolution_module(data, kernel_size, pad_size, filter_count = filter_count, 
+    down_pool = TRUE)
+  net <- pool1
+  pool2 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    2, down_pool = TRUE)
+  net <- pool2
+  pool3 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4, down_pool = TRUE)
+  net <- pool3
+  pool4 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4, down_pool = TRUE)
+  net <- pool4
+  net <- mx.symbol.Dropout(net)
+  pool5 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    8, down_pool = TRUE)
+  net <- pool5
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4, up_pool = TRUE)
+  net <- convolution_module(net, kernel_size, pad_size = c(2, 2), filter_count = filter_count * 
+    4, up_pool = TRUE)
+  net <- mx.symbol.Crop(net, pool3, num.args = 2)
+  net <- mx.symbol.concat(c(pool3, net), num.args = 2)
+  net <- mx.symbol.Dropout(net)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4, up_pool = TRUE)
   
-  net = mx.symbol.Concat(c(pool2, net), num.args = 2)
-  net = mx.symbol.Dropout(net)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE)
-  convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4)
-  net = mx.symbol.Concat(c(pool1, net), num.args = 2)
-  net = mx.symbol.Dropout(net)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2, up_pool = TRUE)
-  net = convolution_module(net, kernel_size, pad_size, filter_count = 1, batch_norm = FALSE, act_type = "")
-  net = mx.symbol.SoftmaxOutput(data = net, name = 'sm')
+  net <- mx.symbol.Concat(c(pool2, net), num.args = 2)
+  net <- mx.symbol.Dropout(net)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4, up_pool = TRUE)
+  convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    4)
+  net <- mx.symbol.Concat(c(pool1, net), num.args = 2)
+  net <- mx.symbol.Dropout(net)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    2)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
+    2, up_pool = TRUE)
+  net <- convolution_module(net, kernel_size, pad_size, filter_count = 1, batch_norm = FALSE, 
+    act_type = "")
+  net <- mx.symbol.SoftmaxOutput(data = net, name = "sm")
   return(net)
 }
 
@@ -89,47 +103,46 @@ context("Image segmentation")
 
 test_that("UNET", {
   list.of.packages <- c("imager")
-  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
-  if(length(new.packages)) install.packages(new.packages, repos = "https://cloud.r-project.org/")
+  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, 
+    "Package"])]
+  if (length(new.packages)) 
+    install.packages(new.packages, repos = "https://cloud.r-project.org/")
   GetISBI_data()
   library(imager)
   IMG_SIZE <- 168
   files <- list.files(path = "data/ISBI/train-volume/")
-  a = 'data/ISBI/train-volume/'
-  filess = paste(a, files, sep = '')
-  list_of_images = lapply(filess, function(x) {
+  a <- "data/ISBI/train-volume/"
+  filess <- paste(a, files, sep = "")
+  list_of_images <- lapply(filess, function(x) {
     x <- load.image(x)
     y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE)
   })
   
-  train.x = do.call('cbind', lapply(list_of_images, as.vector))
+  train.x <- do.call("cbind", lapply(list_of_images, as.vector))
   train.array <- train.x
   dim(train.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30)
   
   files <- list.files(path = "data/ISBI/train-labels")
-  b = 'data/ISBI/train-labels/'
-  filess = paste(b, files, sep = '')
-  list_of_images = lapply(filess, function(x) {
+  b <- "data/ISBI/train-labels/"
+  filess <- paste(b, files, sep = "")
+  list_of_images <- lapply(filess, function(x) {
     x <- load.image(x)
     y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE)
   })
   
-  train.y = do.call('cbind', lapply(list_of_images, as.vector))
+  train.y <- do.call("cbind", lapply(list_of_images, as.vector))
   
-  train.y[which(train.y < 0.5)] = 0
-  train.y[which(train.y > 0.5)] = 1
-  train.y.array = train.y
-  dim(train.y.array) = c(IMG_SIZE, IMG_SIZE, 1, 30)
+  train.y[which(train.y < 0.5)] <- 0
+  train.y[which(train.y > 0.5)] <- 1
+  train.y.array <- train.y
+  dim(train.y.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30)
   
   devices <- mx.ctx.default()
   mx.set.seed(0)
   
   net <- get_unet()
   
-  model <- mx.model.FeedForward.create(net, X = train.array, y = train.y.array,
-                                       ctx = devices, num.round = 2,
-                                       initializer = mx.init.normal(sqrt(2 / 576)),
-                                       learning.rate = 0.05,
-                                       momentum = 0.99,
-                                       array.batch.size = 2)
+  model <- mx.model.FeedForward.create(net, X = train.array, y = train.y.array, 
+    ctx = devices, num.round = 2, initializer = mx.init.normal(sqrt(2/576)), 
+    learning.rate = 0.05, momentum = 0.99, array.batch.size = 2)
 })
diff --git a/R-package/tests/testthat/test_initializer.R b/R-package/tests/testthat/test_initializer.R
new file mode 100644
index 00000000000..c005244d02b
--- /dev/null
+++ b/R-package/tests/testthat/test_initializer.R
@@ -0,0 +1,114 @@
+require(mxnet)
+
+context("initializer")
+
+testthat("mx.init.uniform", {
+  uniform_init <- mx.init.uniform(scale = 1)
+  expect_equal(typeof(uniform_init), "closure")
+  
+  X_bias <- uniform_init("X_bias", c(1, 10000), ctx = mx.ctx.default())
+  expect_equal(X_bias, mx.nd.zeros(c(1, 10000)))
+  
+  X_weight <- uniform_init("X_weight", c(5, 10, 10000), ctx = mx.ctx.default())
+  expect_equal(X_weight >= -1, mx.nd.ones(c(5, 10, 10000)))
+  expect_equal(X_weight <= 1, mx.nd.ones(c(5, 10, 10000)))
+  mean_weight <- mean(as.array(X_weight))
+  expect_equal(mean_weight, 0, tolerance = 0.01)
+})
+
+testthat("mx.init.normal", {
+  normal_init <- mx.init.normal(sd = 0.1)
+  expect_equal(typeof(normal_init), "closure")
+  
+  X_bias <- normal_init("X_bias", c(1, 10000), ctx = mx.ctx.default())
+  expect_equal(X_bias, mx.nd.zeros(c(1, 10000)))
+  
+  X_weight <- normal_init("X_weight", c(5, 10, 10000), ctx = mx.ctx.default())
+  weight_mean <- mean(as.array(X_weight))
+  weight_sd <- sd(as.array(X_weight))
+  expect_equal(weight_mean, 0, tolerance = 0.01)
+  expect_equal(weight_sd, 0.1, tolerance = 0.01)
+})
+
+testthat("mx.init.Xavier", {
+  xavier_init <- mx.init.Xavier()
+  expect_equal(typeof(xavier_init), "closure")
+  
+  # default parameters
+  shape <- c(2, 3, 324, 324)
+  fan_out <- shape[length(shape)]
+  fan_in <- prod(shape[-length(shape)])
+  
+  X_bias <- xavier_init("X_bias", shape = shape, ctx = mx.ctx.default())
+  expect_equal(X_bias, mx.nd.zeros(shape))
+  
+  X_weight <- xavier_init("X_weight", shape = shape, ctx = mx.ctx.default())
+  scale <- sqrt(3/((fan_in + fan_out)/2))
+  expect_equal(X_weight >= -scale, mx.nd.ones(shape))
+  expect_equal(X_weight <= scale, mx.nd.ones(shape))
+  weight_mean <- mean(as.array(X_weight))
+  expect_equal(weight_mean, 0, tolerance = 0.01)
+  
+  for (dist_type in c("gaussian", "uniform")) {
+    for (factor_type in c("in", "out", "avg")) {
+      xavier_init <- mx.init.Xavier(rnd_type = dist_type, factor_type = factor_type, 
+        magnitude = 200)
+      expect_equal(typeof(xavier_init), "closure")
+      
+      X_weight <- xavier_init("X_weight", shape = shape, ctx = mx.ctx.default())
+      factor_val <- switch(factor_type, avg = (fan_in + fan_out)/2, `in` = fan_in, 
+        out = fan_out)
+      scale <- sqrt(200/factor_val)
+      
+      if (dist_type == "gaussian") {
+        weight_mean <- mean(as.array(X_weight))
+        weight_sd <- sd(as.array(X_weight))
+        expect_equal(weight_mean, 0, tolerance = 0.01)
+        expect_equal(weight_sd, scale, tolerance = 0.01)
+      } else {
+        expect_equal(X_weight >= -scale, mx.nd.ones(shape))
+        expect_equal(X_weight <= scale, mx.nd.ones(shape))
+        weight_mean <- mean(as.array(X_weight))
+        expect_equal(weight_mean, 0, tolerance = 0.01)
+      }
+    }
+  }
+})
+
+testthat("mx.init.internal.default", {
+  sample_bias <- mxnet:::mx.init.internal.default("X_bias", c(5, 10, 100), ctx = mx.ctx.default())
+  expect_equal(sample_bias, mx.nd.zeros(c(5, 10, 100)))
+  
+  sample_gamma <- mxnet:::mx.init.internal.default("X_gamma", c(5, 10, 100), ctx = mx.ctx.default())
+  expect_equal(sample_gamma, mx.nd.ones(c(5, 10, 100)))
+  
+  sample_beta <- mxnet:::mx.init.internal.default("X_beta", c(5, 10, 100), ctx = mx.ctx.default())
+  expect_equal(sample_beta, mx.nd.zeros(c(5, 10, 100)))
+  
+  sample_moving_mean <- mxnet:::mx.init.internal.default("X_moving_mean", c(5, 
+    10, 100), ctx = mx.ctx.default())
+  expect_equal(sample_moving_mean, mx.nd.zeros(c(5, 10, 100)))
+  
+  sample_moving_var <- mxnet:::mx.init.internal.default("X_moving_var", c(5, 10, 
+    100), ctx = mx.ctx.default())
+  expect_equal(sample_moving_var, mx.nd.ones(c(5, 10, 100)))
+  
+  expect_error(mxnet:::mx.init.internal.default("X", c(5, 10, 100), ctx = mx.ctx.default()), 
+    "Unkown initialization pattern for  X")
+})
+
+testthat("mx.init.create", {
+  uniform_init <- mx.init.uniform(scale = 1)
+  expect_equal(typeof(uniform_init), "closure")
+  arrs <- setNames(as.list(c(50000, 100)), c("X_weight", "X_bias"))
+  arr_init <- mx.init.create(uniform_init, arrs, ctx = mx.ctx.default())
+  
+  X_bias <- arr_init$X_bias
+  expect_equal(X_bias, mx.nd.zeros(c(100)))
+  
+  X_weight <- arr_init$X_weight
+  expect_equal(X_weight >= -1, mx.nd.ones(c(50000)))
+  expect_equal(X_weight <= 1, mx.nd.ones(c(50000)))
+  mean_weight <- mean(as.array(X_weight))
+  expect_equal(mean_weight, 0, tolerance = 0.01)
+})
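
The file above is new; to run it in isolation, a sketch (assumes the mxnet and testthat packages are installed):

    library(testthat)
    library(mxnet)
    # run only the new initializer tests
    test_file("R-package/tests/testthat/test_initializer.R")
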
diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R
index d619856cbb9..32f6c58d3cb 100644
--- a/R-package/tests/testthat/test_io.R
+++ b/R-package/tests/testthat/test_io.R
@@ -7,22 +7,15 @@ source("get_data.R")
 test_that("MNISTIter", {
   GetMNIST_ubyte()
   batch.size <- 100
-  train_dataiter <- mx.io.MNISTIter(
-    image = "data/train-images-idx3-ubyte",
-    label = "data/train-labels-idx1-ubyte",
-    data.shape = c(784),
-    batch.size = batch.size,
-    shuffle = TRUE,
-    flat = TRUE,
-    silent = 0,
-    seed = 10
-  )
+  train_dataiter <- mx.io.MNISTIter(image = "data/train-images-idx3-ubyte", label = "data/train-labels-idx1-ubyte", 
+    data.shape = c(784), batch.size = batch.size, shuffle = TRUE, flat = TRUE, 
+    silent = 0, seed = 10)
   train_dataiter$reset()
-  batch_count = 0
+  batch_count <- 0
   while (train_dataiter$iter.next()) {
-    batch_count = batch_count + 1
+    batch_count <- batch_count + 1
   }
-  nbatch = 60000 / batch.size
+  nbatch <- 60000/batch.size
   expect_equal(batch_count, nbatch)
   train_dataiter$reset()
   train_dataiter$iter.next()
@@ -39,21 +32,15 @@ test_that("MNISTIter", {
 
 test_that("Cifar10Rec", {
   GetCifar10()
-  dataiter <- mx.io.ImageRecordIter(
-    path.imgrec     = "./data/cifar/train.rec",
-    path.imglist    = "./data/cifar/train.lst",
-    mean.img        = "./data/cifar/cifar10_mean.bin",
-    batch.size      = 100,
-    data.shape      = c(28, 28, 3),
-    rand.crop       = TRUE,
-    rand.mirror     = TRUE
-  )
-  labelcount = rep(0, 10)
+  dataiter <- mx.io.ImageRecordIter(path.imgrec = "./data/cifar/train.rec", path.imglist = "./data/cifar/train.lst", 
+    mean.img = "./data/cifar/cifar10_mean.bin", batch.size = 100, data.shape = c(28, 
+      28, 3), rand.crop = TRUE, rand.mirror = TRUE)
+  labelcount <- rep(0, 10)
   dataiter$reset()
   while (dataiter$iter.next()) {
-    label = as.array(dataiter$value()$label)
+    label <- as.array(dataiter$value()$label)
     for (i in label) {
-      labelcount[i + 1] = labelcount[i + 1] + 1
+      labelcount[i + 1] <- labelcount[i + 1] + 1
     }
   }
   
@@ -65,20 +52,20 @@ test_that("mx.io.arrayiter", {
   y <- c(1:100)
   dataiter <- mx.io.arrayiter(X, y, batch.size = 20, shuffle = FALSE)
   dataiter$reset()
-  batch_count = 0
+  batch_count <- 0
   while (dataiter$iter.next()) {
-    batch_count = batch_count + 1
+    batch_count <- batch_count + 1
   }
-  expect_equal(batch_count, 100 / 20)
+  expect_equal(batch_count, 100/20)
   
-  y <- round(y / 10)
+  y <- round(y/10)
   dataiter <- mx.io.arrayiter(X, y, batch.size = 30, shuffle = FALSE)
   labelcount <- rep(0, 11)
   dataiter$reset()
   while (dataiter$iter.next()) {
     label <- as.array(dataiter$value()$label)
     for (i in label) {
-      labelcount[i + 1] = labelcount[i + 1] + 1
+      labelcount[i + 1] <- labelcount[i + 1] + 1
     }
   }
   
diff --git a/R-package/tests/testthat/test_model.R b/R-package/tests/testthat/test_model.R
index 6167ed66c41..f4be49d5fdd 100644
--- a/R-package/tests/testthat/test_model.R
+++ b/R-package/tests/testthat/test_model.R
@@ -4,76 +4,64 @@ source("get_data.R")
 
 context("models")
 
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 1) {
+if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
+  1) {
   mx.ctx.default(new = mx.gpu())
   message("Using GPU for testing.")
 }
 
 test_that("MNIST", {
-#   # Network configuration
-   GetMNIST_ubyte()
-   batch.size <- 100
-   data <- mx.symbol.Variable("data")
-   fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-   act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-   fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-   act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-   fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-   softmax <- mx.symbol.Softmax(fc3, name = "sm")
-   
-   dtrain = mx.io.MNISTIter(
-     image="data/train-images-idx3-ubyte",
-     label="data/train-labels-idx1-ubyte",
-     data.shape=c(784),
-     batch.size=batch.size,
-     shuffle=TRUE,
-     flat=TRUE,
-     silent=0,
-     seed=10)
-   
-   dtest = mx.io.MNISTIter(
-     image="data/t10k-images-idx3-ubyte",
-     label="data/t10k-labels-idx1-ubyte",
-     data.shape=c(784),
-     batch.size=batch.size,
-     shuffle=FALSE,
-     flat=TRUE,
-     silent=0)
-   
-   mx.set.seed(0)
-
-   # create the model
-   model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest,
-                                        ctx = mx.ctx.default(), num.round=1,
-                                        learning.rate=0.1, momentum=0.9,
-                                        initializer=mx.init.uniform(0.07),
-                                        epoch.end.callback=mx.callback.save.checkpoint("chkpt"),
-                                        batch.end.callback=mx.callback.log.train.metric(100))
-   
-   # do prediction
-   pred <- predict(model, dtest)
-   label <- mx.io.extract(dtest, "label")
-   dataX <- mx.io.extract(dtest, "data")
-   # Predict with R's array
-   pred2 <- predict(model, X=dataX)
-   
-   accuracy <- function(label, pred) {
-     ypred = max.col(t(as.array(pred)))
-     return(sum((as.array(label) + 1) == ypred) / length(label))
-   }
-
-   expect_equal(accuracy(label, pred), accuracy(label, pred2))
-   
-   file.remove("chkpt-0001.params")
-   file.remove("chkpt-symbol.json")
+  # # Network configuration
+  GetMNIST_ubyte()
+  batch.size <- 100
+  data <- mx.symbol.Variable("data")
+  fc1 <- mx.symbol.FullyConnected(data, name = "fc1", num_hidden = 128)
+  act1 <- mx.symbol.Activation(fc1, name = "relu1", act_type = "relu")
+  fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
+  act2 <- mx.symbol.Activation(fc2, name = "relu2", act_type = "relu")
+  fc3 <- mx.symbol.FullyConnected(act2, name = "fc3", num_hidden = 10)
+  softmax <- mx.symbol.Softmax(fc3, name = "sm")
+  
+  dtrain <- mx.io.MNISTIter(image = "data/train-images-idx3-ubyte", label = "data/train-labels-idx1-ubyte", 
+    data.shape = c(784), batch.size = batch.size, shuffle = TRUE, flat = TRUE, 
+    silent = 0, seed = 10)
+  
+  dtest <- mx.io.MNISTIter(image = "data/t10k-images-idx3-ubyte", label = "data/t10k-labels-idx1-ubyte", 
+    data.shape = c(784), batch.size = batch.size, shuffle = FALSE, flat = TRUE, 
+    silent = 0)
+  
+  mx.set.seed(0)
+  
+  # create the model
+  model <- mx.model.FeedForward.create(softmax, X = dtrain, eval.data = dtest, 
+    ctx = mx.ctx.default(), num.round = 1, learning.rate = 0.1, momentum = 0.9, 
+    initializer = mx.init.uniform(0.07), epoch.end.callback = mx.callback.save.checkpoint("chkpt"), 
+    batch.end.callback = mx.callback.log.train.metric(100))
+  
+  # do prediction
+  pred <- predict(model, dtest)
+  label <- mx.io.extract(dtest, "label")
+  dataX <- mx.io.extract(dtest, "data")
+  # Predict with R's array
+  pred2 <- predict(model, X = dataX)
+  
+  accuracy <- function(label, pred) {
+    ypred <- max.col(t(as.array(pred)))
+    return(sum((as.array(label) + 1) == ypred)/length(label))
+  }
+  
+  expect_equal(accuracy(label, pred), accuracy(label, pred2))
+  
+  file.remove("chkpt-0001.params")
+  file.remove("chkpt-symbol.json")
 })
 
 test_that("Regression", {
   data(BostonHousing, package = "mlbench")
   train.ind <- seq(1, 506, 3)
-  train.x <- data.matrix(BostonHousing[train.ind,-14])
+  train.x <- data.matrix(BostonHousing[train.ind, -14])
   train.y <- BostonHousing[train.ind, 14]
-  test.x <- data.matrix(BostonHousing[-train.ind,-14])
+  test.x <- data.matrix(BostonHousing[-train.ind, -14])
   test.y <- BostonHousing[-train.ind, 14]
   data <- mx.symbol.Variable("data")
   fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1)
@@ -81,16 +69,13 @@ test_that("Regression", {
   
   demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
     pred <- mx.nd.reshape(pred, shape = 0)
-    res <- mx.nd.mean(mx.nd.abs(label-pred))
+    res <- mx.nd.mean(mx.nd.abs(label - pred))
     return(as.array(res))
   })
   mx.set.seed(0)
-  model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
-                                       ctx = mx.ctx.default(), num.round = 5,
-                                       array.batch.size = 20,
-                                       learning.rate = 2e-6,
-                                       momentum = 0.9,
-                                       eval.metric = demo.metric.mae)
+  model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, ctx = mx.ctx.default(), 
+    num.round = 5, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9, 
+    eval.metric = demo.metric.mae)
   
   train.x <- data.matrix(BostonHousing[train.ind, -(13:14)])
   train.y <- BostonHousing[train.ind, c(13:14)]
@@ -98,18 +83,14 @@ test_that("Regression", {
   test.y <- BostonHousing[-train.ind, c(13:14)]
   
   data <- mx.symbol.Variable("data")
-  fc2 <- mx.symbol.FullyConnected(data, num_hidden=2)
+  fc2 <- mx.symbol.FullyConnected(data, num_hidden = 2)
   lro2 <- mx.symbol.LinearRegressionOutput(fc2)
   
   mx.set.seed(0)
-  train_iter = mx.io.arrayiter(data = t(train.x), label = t(train.y))
-  
-  model <- mx.model.FeedForward.create(lro2, X = train_iter,
-                                       ctx = mx.ctx.default(),
-                                       num.round = 50,
-                                       array.batch.size = 20,
-                                       learning.rate = 2e-6,
-                                       momentum = 0.9)
+  train_iter <- mx.io.arrayiter(data = t(train.x), label = t(train.y))
+  
+  model <- mx.model.FeedForward.create(lro2, X = train_iter, ctx = mx.ctx.default(), 
+    num.round = 50, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9)
 })
 
 
@@ -122,23 +103,18 @@ test_that("Classification", {
   test.x <- data.matrix(Sonar[-train.ind, 1:60])
   test.y <- Sonar[-train.ind, 61]
   mx.set.seed(0)
-  model <- mx.mlp(train.x, train.y, hidden_node = 10,
-                  out_node = 2, out_activation = "softmax",
-                  num.round = 5, array.batch.size = 15,
-                  learning.rate = 0.07,
-                  momentum = 0.9,
-                  eval.metric = mx.metric.accuracy)
+  model <- mx.mlp(train.x, train.y, hidden_node = 10, out_node = 2, out_activation = "softmax", 
+    num.round = 5, array.batch.size = 15, learning.rate = 0.07, momentum = 0.9, 
+    eval.metric = mx.metric.accuracy)
 })
 
 test_that("Fine-tune", {
   GetInception()
   GetCatDog()
-  train_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_train.rec",
-                                      batch.size  = 8, data.shape  = c(224, 224, 3),
-                                      rand.crop   = TRUE, rand.mirror = TRUE)
-  val_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_val.rec",
-                                    batch.size  = 8, data.shape  = c(224, 224, 3),
-                                    rand.crop   = FALSE, rand.mirror = FALSE)
+  train_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_train.rec", 
+    batch.size = 8, data.shape = c(224, 224, 3), rand.crop = TRUE, rand.mirror = TRUE)
+  val_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_val.rec", 
+    batch.size = 8, data.shape = c(224, 224, 3), rand.crop = FALSE, rand.mirror = FALSE)
   inception_bn <- mx.model.load("./model/Inception-BN", iteration = 126)
   symbol <- inception_bn$symbol
   internals <- symbol$get.internals()
@@ -148,11 +124,8 @@ test_that("Fine-tune", {
   
   new_fc <- mx.symbol.FullyConnected(data = flatten, num_hidden = 2, name = "fc1")
   new_soft <- mx.symbol.SoftmaxOutput(data = new_fc, name = "softmax")
-  arg_params_new <- mx.model.init.params(symbol = new_soft,
-                                         input.shape = list("data" = c(224, 224, 3, 8)),
-                                         output.shape = NULL,
-                                         initializer = mx.init.uniform(0.1),
-                                         ctx = mx.cpu())$arg.params
+  arg_params_new <- mx.model.init.params(symbol = new_soft, input.shape = list(data = c(224, 
+    224, 3, 8)), output.shape = NULL, initializer = mx.init.uniform(0.1), ctx = mx.cpu())$arg.params
   fc1_weights_new <- arg_params_new[["fc1_weight"]]
   fc1_bias_new <- arg_params_new[["fc1_bias"]]
   
@@ -160,25 +133,22 @@ test_that("Fine-tune", {
   
   arg_params_new[["fc1_weight"]] <- fc1_weights_new
   arg_params_new[["fc1_bias"]] <- fc1_bias_new
-
-  #model <- mx.model.FeedForward.create(symbol = new_soft, X = train_iter, eval.data = val_iter,
-  #                                     ctx = mx.ctx.default(), eval.metric = mx.metric.accuracy,
-  #                                     num.round = 2, learning.rate = 0.05, momentum = 0.9,
-  #                                     wd = 0.00001, kvstore = "local",
-  #                                     batch.end.callback = mx.callback.log.train.metric(50),
-  #                                     initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
-  #                                     optimizer = "sgd",
-  #                                     arg.params = arg_params_new,
-  #                                     aux.params = inception_bn$aux.params)
-})                                       
+  
+  # model <- mx.model.FeedForward.create(symbol = new_soft, X = train_iter,
+  # eval.data = val_iter, ctx = mx.ctx.default(), eval.metric = mx.metric.accuracy,
+  # num.round = 2, learning.rate = 0.05, momentum = 0.9, wd = 0.00001, kvstore =
+  # 'local', batch.end.callback = mx.callback.log.train.metric(50), initializer =
+  # mx.init.Xavier(factor_type = 'in', magnitude = 2.34), optimizer = 'sgd',
+  # arg.params = arg_params_new, aux.params = inception_bn$aux.params)
+})
 
 test_that("Matrix Factorization", {
   
   # Use fake random data instead of GetMovieLens() to remove external dependency
   set.seed(123)
-  user <- sample(943, size = 100000, replace = T)
-  item <- sample(1682, size = 100000, replace = T)
-  score <- sample(5, size = 100000, replace = T)
+  user <- sample(943, size = 1e+05, replace = T)
+  item <- sample(1682, size = 1e+05, replace = T)
+  score <- sample(5, size = 1e+05, replace = T)
   DF <- data.frame(user, item, score)
   
   max_user <- max(DF$user)
@@ -189,95 +159,74 @@ test_that("Matrix Factorization", {
   user <- mx.symbol.Variable("user")
   item <- mx.symbol.Variable("item")
   score <- mx.symbol.Variable("score")
-  user1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user,
-                               output_dim = k, name = "user1")
-  item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item,
-                               output_dim = k, name = "item1")
+  user1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user, 
+    output_dim = k, name = "user1")
+  item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item, 
+    output_dim = k, name = "item1")
   pred <- user1 * item1
   pred1 <- mx.symbol.sum_axis(pred, axis = 1, name = "pred1")
   pred2 <- mx.symbol.Flatten(pred1, name = "pred2")
   pred3 <- mx.symbol.LinearRegressionOutput(data = pred2, label = score, name = "pred3")
-
+  
   mx.set.seed(123)
   
-  CustomIter <- setRefClass( "CustomIter", fields = c("iter1", "iter2"),
-                             contains = "Rcpp_MXArrayDataIter",
-      methods = list(
-        initialize = function(iter1, iter2) {
-          .self$iter1 <- iter1
-          .self$iter2 <- iter2
-          .self
-        },
-        value = function() {
-          user <- .self$iter1$value()$data
-          item <- .self$iter2$value()$data
-          score <- .self$iter1$value()$label
-          list(user = user,
-               item = item,
-               score = score)
-        },
-        iter.next = function() {
-          .self$iter1$iter.next()
-          .self$iter2$iter.next()
-        },
-        reset = function() {
-          .self$iter1$reset()
-          .self$iter2$reset()
-        },
-        num.pad = function() {
-          .self$iter1$num.pad()
-        },
-        finalize = function() {
-          .self$iter1$finalize()
-          .self$iter2$finalize()
-        }
-      )
-    )
-  
-  user_iter = mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k)
-  
-  item_iter = mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k)
+  CustomIter <- setRefClass("CustomIter", fields = c("iter1", "iter2"), contains = "Rcpp_MXArrayDataIter", 
+    methods = list(initialize = function(iter1, iter2) {
+      .self$iter1 <- iter1
+      .self$iter2 <- iter2
+      .self
+    }, value = function() {
+      user <- .self$iter1$value()$data
+      item <- .self$iter2$value()$data
+      score <- .self$iter1$value()$label
+      list(user = user, item = item, score = score)
+    }, iter.next = function() {
+      .self$iter1$iter.next()
+      .self$iter2$iter.next()
+    }, reset = function() {
+      .self$iter1$reset()
+      .self$iter2$reset()
+    }, num.pad = function() {
+      .self$iter1$num.pad()
+    }, finalize = function() {
+      .self$iter1$finalize()
+      .self$iter2$finalize()
+    }))
+  
+  user_iter <- mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k)
+  
+  item_iter <- mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k)
   
   train_iter <- CustomIter$new(user_iter, item_iter)
   
-  model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = mx.ctx.default(),
-                                       num.round = 5, initializer = mx.init.uniform(0.07),
-                                       learning.rate = 0.07,
-                                       eval.metric = mx.metric.rmse,
-                                       momentum = 0.9,
-                                       epoch.end.callback = mx.callback.log.train.metric(1),
-                                       input.names = c("user", "item"),
-                                       output.names = "score")
+  model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = mx.ctx.default(), 
+    num.round = 5, initializer = mx.init.uniform(0.07), learning.rate = 0.07, 
+    eval.metric = mx.metric.rmse, momentum = 0.9, epoch.end.callback = mx.callback.log.train.metric(1), 
+    input.names = c("user", "item"), output.names = "score")
 })
 
 test_that("Captcha", {
   GetCaptcha_data()
   data.shape <- c(80, 30, 3)
   batch_size <- 40
-  train <- mx.io.ImageRecordIter(
-    path.imgrec   = "./data/captcha_example/captcha_train.rec",
-    path.imglist  = "./data/captcha_example/captcha_train.lst",
-    batch.size    = batch_size,
-    label.width   = 4,
-    data.shape    = data.shape,
-    mean.img      = "mean.bin")
-  
-  val <- mx.io.ImageRecordIter(
-    path.imgrec   = "./data/captcha_example/captcha_test.rec",
-    path.imglist  = "./data/captcha_example/captcha_test.lst",
-    batch.size    = batch_size,
-    label.width   = 4,
-    data.shape    = data.shape,
-    mean.img      = "mean.bin")
+  train <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_train.rec", 
+    path.imglist = "./data/captcha_example/captcha_train.lst", batch.size = batch_size, 
+    label.width = 4, data.shape = data.shape, mean.img = "mean.bin")
+  
+  val <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_test.rec", 
+    path.imglist = "./data/captcha_example/captcha_test.lst", batch.size = batch_size, 
+    label.width = 4, data.shape = data.shape, mean.img = "mean.bin")
   
   data <- mx.symbol.Variable("data")
   label <- mx.symbol.Variable("label")
   conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 32)
-  pool1 <- mx.symbol.Pooling(data = conv1, pool_type = "max", kernel = c(2, 2), stride = c(1, 1))
+  pool1 <- mx.symbol.Pooling(data = conv1, pool_type = "max", kernel = c(2, 2), 
+    stride = c(1, 1))
   relu1 <- mx.symbol.Activation(data = pool1, act_type = "relu")
   
   conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(5, 5), num_filter = 32)
-  pool2 <- mx.symbol.Pooling(data = conv2, pool_type = "avg", kernel = c(2, 2), stride = c(1, 1))
+  pool2 <- mx.symbol.Pooling(data = conv2, pool_type = "avg", kernel = c(2, 2), 
+    stride = c(1, 1))
   relu2 <- mx.symbol.Activation(data = pool2, act_type = "relu")
   
   flatten <- mx.symbol.Flatten(data = relu2)
@@ -292,8 +241,8 @@ test_that("Captcha", {
   captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax")
   
   mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) {
-    label = as.array(label)
-    pred = as.array(pred)
+    label <- as.array(label)
+    pred <- as.array(pred)
     ypred <- max.col(t(pred)) - 1
     ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE)
     return(sum(colSums(label == ypred) == 4)/ncol(label))
@@ -305,26 +254,20 @@ test_that("Captcha", {
   train$iter.next()
   
   input.names <- "data"
-  input.shape <- sapply(input.names, function(n){dim(train$value()[[n]])}, simplify = FALSE)
+  input.shape <- sapply(input.names, function(n) {
+    dim(train$value()[[n]])
+  }, simplify = FALSE)
   arg_names <- arguments(captcha_net)
   output.names <- "label"
-  output.shape <- sapply(output.names, function(n){dim(train$value()[[n]])}, simplify = FALSE)
-  params <- mx.model.init.params(captcha_net, input.shape, output.shape, 
-                                 mx.init.Xavier(factor_type = "in", magnitude = 2.34),
-                                 mx.cpu())
-
-  #model <- mx.model.FeedForward.create(
-  #  X                  = train,
-  #  eval.data          = val,
-  #  ctx                = mx.ctx.default(),
-  #  symbol             = captcha_net,
-  #  eval.metric        = mx.metric.acc2,
-  #  num.round          = 1,
-  #  learning.rate      = 1e-04,
-  #  momentum           = 0.9,
-  #  wd                 = 1e-05,
-  #  batch.end.callback = mx.callback.log.train.metric(50),
-  #  initializer        = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
-  #  optimizer          = "sgd",
-  #  clip_gradient      = 10)
+  output.shape <- sapply(output.names, function(n) {
+    dim(train$value()[[n]])
+  }, simplify = FALSE)
+  params <- mx.model.init.params(captcha_net, input.shape, output.shape, mx.init.Xavier(factor_type = "in", 
+    magnitude = 2.34), mx.cpu())
+  
+  # model <- mx.model.FeedForward.create(X = train, eval.data = val, ctx = mx.ctx.default(),
+  #   symbol = captcha_net, eval.metric = mx.metric.acc2, num.round = 1, learning.rate = 1e-04,
+  #   momentum = 0.9, wd = 1e-05, batch.end.callback = mx.callback.log.train.metric(50),
+  #   initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
+  #   optimizer = "sgd", clip_gradient = 10)
 })
diff --git a/R-package/tests/testthat/test_ndarray.R b/R-package/tests/testthat/test_ndarray.R
index 326ea6ca7f3..4850823e29d 100644
--- a/R-package/tests/testthat/test_ndarray.R
+++ b/R-package/tests/testthat/test_ndarray.R
@@ -2,45 +2,46 @@ require(mxnet)
 
 context("ndarray")
 
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 1) {
+if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
+  1) {
   mx.ctx.default(new = mx.gpu())
   message("Using GPU for testing.")
 }
 
 test_that("element-wise calculation for vector", {
-  x = 1:10
-  mat = mx.nd.array(as.array(x), mx.ctx.default())
+  x <- 1:10
+  mat <- mx.nd.array(as.array(x), mx.ctx.default())
   expect_equal(x, as.array(mat))
   expect_equal(x + 1, as.array(mat + 1))
   expect_equal(x - 10, as.array(mat - 10))
   expect_equal(x * 20, as.array(mat * 20))
-  expect_equal(x / 3, as.array(mat / 3), tolerance = 1e-5)
+  expect_equal(x/3, as.array(mat/3), tolerance = 1e-05)
   expect_equal(-1 - x, as.array(-1 - mat))
-  expect_equal(-5 / x, as.array(-5 / mat), tolerance = 1e-5)
+  expect_equal(-5/x, as.array(-5/mat), tolerance = 1e-05)
   expect_equal(x + x, as.array(mat + mat))
-  expect_equal(x / x, as.array(mat / mat))
+  expect_equal(x/x, as.array(mat/mat))
   expect_equal(x * x, as.array(mat * mat))
   expect_equal(x - x, as.array(mat - mat))
   expect_equal(as.array(1 - mat), as.array(1 - mat))
   
-  x <- runif(10,-10, 10)
-  nd = mx.nd.array(as.array(x))
-  expect_equal(sqrt(abs(x)), as.array(mx.nd.sqrt(mx.nd.abs(nd))), tolerance = 1e-6)
-  expect_equal(x ^ 2, as.array(mx.nd.square(nd)), tolerance = 1e-6)
+  x <- runif(10, -10, 10)
+  nd <- mx.nd.array(as.array(x))
+  expect_equal(sqrt(abs(x)), as.array(mx.nd.sqrt(mx.nd.abs(nd))), tolerance = 1e-06)
+  expect_equal(x^2, as.array(mx.nd.square(nd)), tolerance = 1e-06)
 })
 
 test_that("element-wise calculation for matrix", {
-  x = matrix(1:4, 2, 2)
-  mat = mx.nd.array(as.array(x), mx.ctx.default())
+  x <- matrix(1:4, 2, 2)
+  mat <- mx.nd.array(as.array(x), mx.ctx.default())
   expect_equal(x, as.array(mat))
   expect_equal(x + 1, as.array(mat + 1))
   expect_equal(x - 10, as.array(mat - 10))
   expect_equal(x * 20, as.array(mat * 20))
-  expect_equal(x / 3, as.array(mat / 3), tolerance = 1e-5)
+  expect_equal(x/3, as.array(mat/3), tolerance = 1e-05)
   expect_equal(-1 - x, as.array(-1 - mat))
-  expect_equal(-5 / x, as.array(-5 / mat), tolerance = 1e-5)
+  expect_equal(-5/x, as.array(-5/mat), tolerance = 1e-05)
   expect_equal(x + x, as.array(mat + mat))
-  expect_equal(x / x, as.array(mat / mat))
+  expect_equal(x/x, as.array(mat/mat))
   expect_equal(x * x, as.array(mat * mat))
   expect_equal(x - x, as.array(mat - mat))
   expect_equal(as.array(1 - mat), as.array(1 - mat))
@@ -51,20 +52,24 @@ test_that("ndarray ones, zeros, save and load", {
   expect_equal(matrix(0, 10, 5), as.array(mx.nd.zeros(c(10, 5))))
   expect_equal(rep(1, 10), as.array(mx.nd.ones(10)))
   expect_equal(matrix(1, 10, 5), as.array(mx.nd.ones(c(10, 5))))
-  mat = mx.nd.array(1:20)
-  mx.nd.save(mat, 'temp.mat')
-  mat2 = mx.nd.load('temp.mat')
+  mat <- mx.nd.array(1:20)
+  mx.nd.save(mat, "temp.mat")
+  mat2 <- mx.nd.load("temp.mat")
   expect_true(is.mx.ndarray(mat2[[1]]))
   expect_equal(as.array(mat), as.array(mat2[[1]]))
-  file.remove('temp.mat')
+  file.remove("temp.mat")
 })
 
 test_that("ndarray concatenate", {
   shapes <- matrix(c(2, 3, 4, 2, 2, 2, 4, 2, 2, 1, 4, 2), nrow = 3, byrow = TRUE)
-  array_r <- apply(shapes, 2, function(s) { runif(s, -10, 10) })
-  array_nd <- apply(array_r, 1, function(s) { mx.nd.array(matrix(s, nrow = 1)) })
+  array_r <- apply(shapes, 2, function(s) {
+    runif(s, -10, 10)
+  })
+  array_nd <- apply(array_r, 1, function(s) {
+    mx.nd.array(matrix(s, nrow = 1))
+  })
   array_nd_concat <- mx.nd.concat(data = array_nd, num_args = 3, dim = 1)
-  expect_equal(array_r, as.matrix(array_nd_concat), tolerance = 1e-6)
+  expect_equal(array_r, as.matrix(array_nd_concat), tolerance = 1e-06)
   
   x1 <- mx.nd.array(c(1:24))
   x2 <- mx.nd.array(c(25:48))
@@ -74,7 +79,8 @@ test_that("ndarray concatenate", {
   
   x1 <- array(1:24, dim = c(4, 3, 2))
   x2 <- array(25:48, dim = c(4, 3, 2))
-  x3 <- c(1:4, 25:28, 5:8, 29:32, 9:12, 33:36, 13:16, 37:40, 17:20, 41:44, 21:24, 45:48)
+  x3 <- c(1:4, 25:28, 5:8, 29:32, 9:12, 33:36, 13:16, 37:40, 17:20, 41:44, 21:24, 
+    45:48)
   y1 <- mx.nd.array(x1)
   y2 <- mx.nd.array(x2)
   y3 <- mx.nd.concat(data = c(y1, y2), num_args = 2, dim = 2)
@@ -83,8 +89,8 @@ test_that("ndarray concatenate", {
 })
 
 test_that("ndarray clip", {
-  nd <- mx.nd.array(runif(10,-10, 10))
-  nd2 <- mx.nd.clip(nd,-2, 3)
+  nd <- mx.nd.array(runif(10, -10, 10))
+  nd2 <- mx.nd.clip(nd, -2, 3)
   arr <- as.array(nd2)
   expect_equal(arr >= -2 & arr <= 3, rep(TRUE, length(arr)))
 })
@@ -98,7 +104,7 @@ test_that("ndarray dot", {
   B <- mx.nd.array(t(b))
   C <- mx.nd.dot(A, B)
   
-  expect_equal(c, t(as.matrix(C)), tolerance = 1e-6)
+  expect_equal(c, t(as.matrix(C)), tolerance = 1e-06)
 })
 
 test_that("ndarray crop", {
@@ -107,9 +113,10 @@ test_that("ndarray crop", {
   expect_equal(array(1, dim = c(2, 1, 3)), as.array(y))
   
   z <- mx.nd.zeros(c(2, 1, 3))
-  x <- mxnet:::mx.nd.internal.crop.assign(x, z, begin = c(0, 0, 0), end = c(2, 1, 3))
+  x <- mxnet:::mx.nd.internal.crop.assign(x, z, begin = c(0, 0, 0), end = c(2, 
+    1, 3))
   arr_x <- array(1, dim = dim(x))
-  arr_x[c(1:2), 1 , c(1:3)] <- 0
+  arr_x[c(1:2), 1, c(1:3)] <- 0
   
   expect_equal(as.array(x), arr_x)
 })
@@ -118,77 +125,77 @@ test_that("ndarray negate", {
   arr <- array(runif(24, -10, 10), dim = c(2, 3, 4))
   nd <- mx.nd.array(arr)
   
-  expect_equal(arr, as.array(nd), tolerance = 1e-6)
-  expect_equal(-arr, as.array(-nd), tolerance = 1e-6)
-  expect_equal(arr, as.array(nd), tolerance = 1e-6)
+  expect_equal(arr, as.array(nd), tolerance = 1e-06)
+  expect_equal(-arr, as.array(-nd), tolerance = 1e-06)
+  expect_equal(arr, as.array(nd), tolerance = 1e-06)
 })
 
 test_that("ndarray equal", {
   x <- mx.nd.zeros(c(2, 3))
   y <- mx.nd.ones(c(2, 3))
-  z = x == y
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- x == y
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = 0 == x
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- 0 == x
+  expect_equal(as.array(z), array(1, c(2, 3)))
 })
 
 test_that("ndarray not equal", {
   x <- mx.nd.zeros(c(2, 3))
   y <- mx.nd.ones(c(2, 3))
-  z = x != y
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- x != y
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = 0 != x
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- 0 != x
+  expect_equal(as.array(z), array(0, c(2, 3)))
 })
 
 test_that("ndarray greater", {
   x <- mx.nd.zeros(c(2, 3))
   y <- mx.nd.ones(c(2, 3))
-  z = x > y
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- x > y
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = y > 0
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- y > 0
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = 0 > y
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- 0 > y
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = x >= y
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- x >= y
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = y >= 0
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- y >= 0
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = 0 >= y
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- 0 >= y
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = y >= 1
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- y >= 1
+  expect_equal(as.array(z), array(1, c(2, 3)))
 })
 
 test_that("ndarray lesser", {
   x <- mx.nd.zeros(c(2, 3))
   y <- mx.nd.ones(c(2, 3))
-  z = x < y
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- x < y
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = y < 0
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- y < 0
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = 0 < y
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- 0 < y
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = x <= y
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- x <= y
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = y <= 0
-  expect_equal(as.array(z), array(0, c(2,3)))
+  z <- y <= 0
+  expect_equal(as.array(z), array(0, c(2, 3)))
   
-  z = 0 <= y
-  expect_equal(as.array(z), array(1, c(2,3)))
+  z <- 0 <= y
+  expect_equal(as.array(z), array(1, c(2, 3)))
   
-  z = y <= 1
-  expect_equal(as.array(z), array(1, c(2,3)))
-})
\ No newline at end of file
+  z <- y <= 1
+  expect_equal(as.array(z), array(1, c(2, 3)))
+})
diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R
index c6dacaa728b..a02a9edf524 100644
--- a/R-package/tests/testthat/test_optimizer.R
+++ b/R-package/tests/testthat/test_optimizer.R
@@ -1,204 +1,168 @@
 context("optimizer")
 
 test_that("sgd", {
-
-  data = mx.symbol.Variable('data')
-  label = mx.symbol.Variable('label')
-  fc_weight = mx.symbol.Variable('fc_weight')
-  fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1)
-  loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss')
-
-  x <- mx.nd.array(array(1:6, dim=2:3))
+  
+  data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  fc_weight <- mx.symbol.Variable("fc_weight")
+  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
+    name = "fc1", num_hidden = 1)
+  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
+  
+  x <- mx.nd.array(array(1:6, dim = 2:3))
   y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
-
-  exec <- mxnet:::mx.symbol.bind(symbol = loss,
-                                 ctx = mx.cpu(),
-                                 arg.arrays = list(data = x,
-                                                   fc1_weight = w1,
-                                                   label = y),
-                                 aux.arrays = NULL,
-                                 grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("sgd",
-                             learning.rate = 1,
-                             momentum = 0,
-                             wd = 0,
-                             rescale.grad = 1,
-                             clip_gradient = -1)
-
+  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
+  
+  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, 
+    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
+    "null"))
+  
+  optimizer <- mx.opt.create("sgd", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, 
+    clip_gradient = -1)
+  
   updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu())
-
+  
   mx.exec.forward(exec, is.train = T)
   mx.exec.backward(exec)
-
+  
   arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
   mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2,1)), tolerance = 1e-1)
-
+  
+  expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.1)
+  
 })
 
 
 test_that("rmsprop", {
-
-  data = mx.symbol.Variable('data')
-  label = mx.symbol.Variable('label')
-  fc_weight = mx.symbol.Variable('fc_weight')
-  fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1)
-  loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss')
-
-  x <- mx.nd.array(array(1:6, dim=2:3))
+  
+  data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  fc_weight <- mx.symbol.Variable("fc_weight")
+  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
+    name = "fc1", num_hidden = 1)
+  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
+  
+  x <- mx.nd.array(array(1:6, dim = 2:3))
   y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
-
-  exec <- mxnet:::mx.symbol.bind(symbol = loss,
-                                 ctx = mx.cpu(),
-                                 arg.arrays = list(data = x,
-                                                   fc1_weight = w1,
-                                                   label = y),
-                                 aux.arrays = NULL,
-                                 grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("rmsprop", learning.rate = 1,
-                             centered = TRUE,
-                             gamma1 = 0.95,
-                             gamma2 = 0.9,
-                             epsilon = 1e-4,
-                             wd = 0,
-                             rescale.grad = 1,
-                             clip_gradient = -1)
-
+  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
+  
+  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, 
+    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
+    "null"))
+  
+  optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, gamma1 = 0.95, 
+    gamma2 = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1)
+  
   updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu())
-
+  
   mx.exec.forward(exec, is.train = T)
   mx.exec.backward(exec)
-
+  
   arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
   mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2,1)), tolerance = 1e-1)
-
+  
+  expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2, 1)), 
+    tolerance = 0.1)
+  
 })
 
 
 test_that("adam", {
-
-  data = mx.symbol.Variable('data')
-  label = mx.symbol.Variable('label')
-  fc_weight = mx.symbol.Variable('fc_weight')
-  fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1)
-  loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss')
-
-  x <- mx.nd.array(array(1:6, dim=2:3))
+  
+  data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  fc_weight <- mx.symbol.Variable("fc_weight")
+  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
+    name = "fc1", num_hidden = 1)
+  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
+  
+  x <- mx.nd.array(array(1:6, dim = 2:3))
   y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
-
-  exec <- mxnet:::mx.symbol.bind(symbol = loss,
-                                 ctx = mx.cpu(),
-                                 arg.arrays = list(data = x,
-                                                   fc1_weight = w1,
-                                                   label = y),
-                                 aux.arrays = NULL,
-                                 grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("adam",
-                             learning.rate = 1,
-                             beta1 = 0.9,
-                             beta2 = 0.999,
-                             epsilon = 1e-8,
-                             wd = 0,
-                             rescale.grad = 1,
-                             clip_gradient = -1)
-
+  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
+  
+  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, 
+    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
+    "null"))
+  
+  optimizer <- mx.opt.create("adam", learning.rate = 1, beta1 = 0.9, beta2 = 0.999, 
+    epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1)
+  
   updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu())
-
+  
   mx.exec.forward(exec, is.train = T)
   mx.exec.backward(exec)
-
+  
   arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
   mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2,1)), tolerance = 1e-1)
-
+  
+  expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2, 1)), 
+    tolerance = 0.1)
+  
 })
 
 
 test_that("adagrad", {
-
-  data = mx.symbol.Variable('data')
-  label = mx.symbol.Variable('label')
-  fc_weight = mx.symbol.Variable('fc_weight')
-  fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1)
-  loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss')
-
-  x <- mx.nd.array(array(1:6, dim=2:3))
+  
+  data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  fc_weight <- mx.symbol.Variable("fc_weight")
+  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
+    name = "fc1", num_hidden = 1)
+  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
+  
+  x <- mx.nd.array(array(1:6, dim = 2:3))
   y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
-
-  exec <- mxnet:::mx.symbol.bind(symbol = loss,
-                                 ctx = mx.cpu(),
-                                 arg.arrays = list(data = x,
-                                                   fc1_weight = w1,
-                                                   label = y),
-                                 aux.arrays = NULL,
-                                 grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("adagrad",
-                             learning.rate = 1,
-                             epsilon = 1e-8,
-                             wd = 0,
-                             rescale.grad = 1,
-                             clip_gradient = -1)
-
+  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
+  
+  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, 
+    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
+    "null"))
+  
+  optimizer <- mx.opt.create("adagrad", learning.rate = 1, epsilon = 1e-08, wd = 0, 
+    rescale.grad = 1, clip_gradient = -1)
+  
   updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu())
-
+  
   mx.exec.forward(exec, is.train = T)
   mx.exec.backward(exec)
-
+  
   arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
   mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2,1)), tolerance = 1e-1)
-
+  
+  expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2, 1)), tolerance = 0.1)
+  
 })
 
 
 test_that("adadelta", {
-
-  data = mx.symbol.Variable('data')
-  label = mx.symbol.Variable('label')
-  fc_weight = mx.symbol.Variable('fc_weight')
-  fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1)
-  loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss')
-
-  x <- mx.nd.array(array(1:6, dim=2:3))
+  
+  data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  fc_weight <- mx.symbol.Variable("fc_weight")
+  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
+    name = "fc1", num_hidden = 1)
+  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
+  
+  x <- mx.nd.array(array(1:6, dim = 2:3))
   y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
-
-  exec <- mxnet:::mx.symbol.bind(symbol = loss,
-                                 ctx = mx.cpu(),
-                                 arg.arrays = list(data = x,
-                                                   fc1_weight = w1,
-                                                   label = y),
-                                 aux.arrays = NULL,
-                                 grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("adadelta",
-                             rho = 0.90,
-                             epsilon = 1e-5,
-                             wd = 0,
-                             rescale.grad = 1,
-                             clip_gradient = -1)
-
+  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
+  
+  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, 
+    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
+    "null"))
+  
+  optimizer <- mx.opt.create("adadelta", rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1, 
+    clip_gradient = -1)
+  
   updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu())
-
+  
   mx.exec.forward(exec, is.train = T)
   mx.exec.backward(exec)
-
+  
   arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
   mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2,1)), tolerance = 1e-1)
-
+  
+  expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2, 1)), 
+    tolerance = 0.1)
+  
 })
diff --git a/R-package/tests/testthat/test_random.R b/R-package/tests/testthat/test_random.R
index 411d0c768a6..e90011dadb2 100644
--- a/R-package/tests/testthat/test_random.R
+++ b/R-package/tests/testthat/test_random.R
@@ -3,17 +3,17 @@ require(mxnet)
 context("random")
 
 test_that("mx.runif", {
-  X <- mx.runif(shape=50000, min=0, max=1, ctx=mx.ctx.default())
-  expect_equal(X>=0, mx.nd.ones(50000))
-  expect_equal(X<=1, mx.nd.ones(50000))
-  sample_mean = mean(as.array(X))
-  expect_equal(sample_mean, 0.5, tolerance=1e-2)
+  X <- mx.runif(shape = 50000, min = 0, max = 1, ctx = mx.ctx.default())
+  expect_equal(X >= 0, mx.nd.ones(50000))
+  expect_equal(X <= 1, mx.nd.ones(50000))
+  sample_mean <- mean(as.array(X))
+  expect_equal(sample_mean, 0.5, tolerance = 0.01)
 })
 
 test_that("mx.rnorm", {
-  X <- mx.rnorm(shape=50000, mean=5, sd=0.1, ctx=mx.ctx.default())
-  sample_mean = mean(as.array(X))
-  sample_sd = sd(as.array(X))
-  expect_equal(sample_mean, 5, tolerance=1e-2)
-  expect_equal(sample_sd, 0.1, tolerance=1e-2)
+  X <- mx.rnorm(shape = 50000, mean = 5, sd = 0.1, ctx = mx.ctx.default())
+  sample_mean <- mean(as.array(X))
+  sample_sd <- sd(as.array(X))
+  expect_equal(sample_mean, 5, tolerance = 0.01)
+  expect_equal(sample_sd, 0.1, tolerance = 0.01)
 })
diff --git a/R-package/tests/testthat/test_symbol.R b/R-package/tests/testthat/test_symbol.R
index 656d146cd87..4a253fbd3e7 100644
--- a/R-package/tests/testthat/test_symbol.R
+++ b/R-package/tests/testthat/test_symbol.R
@@ -3,71 +3,73 @@ require(mxnet)
 context("symbol")
 
 test_that("basic symbol operation", {
-  data = mx.symbol.Variable('data')
-  net1 = mx.symbol.FullyConnected(data = data, name = 'fc1', num_hidden = 10)
-  net1 = mx.symbol.FullyConnected(data = net1, name = 'fc2', num_hidden = 100)
+  data <- mx.symbol.Variable("data")
+  net1 <- mx.symbol.FullyConnected(data = data, name = "fc1", num_hidden = 10)
+  net1 <- mx.symbol.FullyConnected(data = net1, name = "fc2", num_hidden = 100)
   
-  expect_equal(arguments(net1), c('data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias'))
-  expect_equal(outputs(net1), 'fc2_output')
+  expect_equal(arguments(net1), c("data", "fc1_weight", "fc1_bias", "fc2_weight", 
+    "fc2_bias"))
+  expect_equal(outputs(net1), "fc2_output")
   
-  net2 = mx.symbol.FullyConnected(name = 'fc3', num_hidden = 10)
-  net2 = mx.symbol.Activation(data = net2, act_type = 'relu')
-  net2 = mx.symbol.FullyConnected(data = net2, name = 'fc4', num_hidden = 20)
+  net2 <- mx.symbol.FullyConnected(name = "fc3", num_hidden = 10)
+  net2 <- mx.symbol.Activation(data = net2, act_type = "relu")
+  net2 <- mx.symbol.FullyConnected(data = net2, name = "fc4", num_hidden = 20)
   
-  composed = mx.apply(net2, fc3_data = net1, name = 'composed')
+  composed <- mx.apply(net2, fc3_data = net1, name = "composed")
   
-  expect_equal(arguments(composed), c('data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias', 'fc3_weight', 'fc3_bias', 'fc4_weight', 'fc4_bias'))
-  expect_equal(outputs(composed), 'composed_output')
+  expect_equal(arguments(composed), c("data", "fc1_weight", "fc1_bias", "fc2_weight", 
+    "fc2_bias", "fc3_weight", "fc3_bias", "fc4_weight", "fc4_bias"))
+  expect_equal(outputs(composed), "composed_output")
   
-  multi_out = mx.symbol.Group(c(composed, net1))
-  expect_equal(outputs(multi_out), c('composed_output', 'fc2_output'))
+  multi_out <- mx.symbol.Group(c(composed, net1))
+  expect_equal(outputs(multi_out), c("composed_output", "fc2_output"))
 })
 
 test_that("symbol internal", {
-  data = mx.symbol.Variable('data')
-  oldfc = mx.symbol.FullyConnected(data = data, name = 'fc1', num_hidden = 10)
-  net1 = mx.symbol.FullyConnected(data = oldfc, name = 'fc2', num_hidden = 100)
+  data <- mx.symbol.Variable("data")
+  oldfc <- mx.symbol.FullyConnected(data = data, name = "fc1", num_hidden = 10)
+  net1 <- mx.symbol.FullyConnected(data = oldfc, name = "fc2", num_hidden = 100)
   
-  expect_equal(arguments(net1), c("data", "fc1_weight", "fc1_bias", "fc2_weight", "fc2_bias"))
+  expect_equal(arguments(net1), c("data", "fc1_weight", "fc1_bias", "fc2_weight", 
+    "fc2_bias"))
   
-  internal = internals(net1)
-  fc1 = internal[[match("fc1_output", internal$outputs)]]
+  internal <- internals(net1)
+  fc1 <- internal[[match("fc1_output", internal$outputs)]]
   
   expect_equal(arguments(fc1), arguments(oldfc))
 })
 
 test_that("symbol children", {
-  data = mx.symbol.Variable('data')
-  oldfc = mx.symbol.FullyConnected(data = data,
-                                   name = 'fc1',
-                                   num_hidden = 10)
-  net1 = mx.symbol.FullyConnected(data = oldfc, name = 'fc2', num_hidden = 100)
+  data <- mx.symbol.Variable("data")
+  oldfc <- mx.symbol.FullyConnected(data = data, name = "fc1", num_hidden = 10)
+  net1 <- mx.symbol.FullyConnected(data = oldfc, name = "fc2", num_hidden = 100)
   
-  expect_equal(outputs(children(net1)), c('fc1_output', 'fc2_weight', 'fc2_bias'))
-  expect_equal(outputs(children(children(net1))), c('data', 'fc1_weight', 'fc1_bias'))
+  expect_equal(outputs(children(net1)), c("fc1_output", "fc2_weight", "fc2_bias"))
+  expect_equal(outputs(children(children(net1))), c("data", "fc1_weight", "fc1_bias"))
   
-  net2 = net1$get.children()
-  expect_equal(net2[[match('fc2_weight', net2$outputs)]]$arguments, 'fc2_weight')
+  net2 <- net1$get.children()
+  expect_equal(net2[[match("fc2_weight", net2$outputs)]]$arguments, "fc2_weight")
   
-  data = mx.symbol.Variable('data')
-  sliced = mx.symbol.SliceChannel(data, num_outputs = 3, name = 'slice')
-  expect_equal(outputs(children(sliced)), 'data')
+  data <- mx.symbol.Variable("data")
+  sliced <- mx.symbol.SliceChannel(data, num_outputs = 3, name = "slice")
+  expect_equal(outputs(children(sliced)), "data")
 })
 
 test_that("symbol infer type", {
-  num_hidden = 128
-  num_dim    = 64
-  num_sample = 10
+  num_hidden <- 128
+  num_dim <- 64
+  num_sample <- 10
   
-  data = mx.symbol.Variable('data')
-  prev = mx.symbol.Variable('prevstate')
-  x2h  = mx.symbol.FullyConnected(data = data, name = 'x2h', num_hidden = num_hidden)
-  h2h  = mx.symbol.FullyConnected(data = prev, name = 'h2h', num_hidden = num_hidden)
+  data <- mx.symbol.Variable("data")
+  prev <- mx.symbol.Variable("prevstate")
+  x2h <- mx.symbol.FullyConnected(data = data, name = "x2h", num_hidden = num_hidden)
+  h2h <- mx.symbol.FullyConnected(data = prev, name = "h2h", num_hidden = num_hidden)
   
-  out  = mx.symbol.Activation(data = mx.symbol.elemwise_add(x2h, h2h), name = 'out', act_type = 'relu')
+  out <- mx.symbol.Activation(data = mx.symbol.elemwise_add(x2h, h2h), name = "out", 
+    act_type = "relu")
   
   # shape inference will fail because information is not available for h2h
-  ret = mx.symbol.infer.shape(out, data = c(num_dim, num_sample))
+  ret <- mx.symbol.infer.shape(out, data = c(num_dim, num_sample))
   
   expect_equal(ret, NULL)
 })
@@ -77,7 +79,7 @@ test_that("symbol save/load", {
   fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1)
   lro <- mx.symbol.LinearRegressionOutput(fc1)
   mx.symbol.save(lro, "tmp_r_sym.json")
-  data2 = mx.symbol.load("tmp_r_sym.json")
+  data2 <- mx.symbol.load("tmp_r_sym.json")
   
   expect_equal(data2$as.json(), lro$as.json())
   file.remove("tmp_r_sym.json")
@@ -85,12 +87,12 @@ test_that("symbol save/load", {
 
 test_that("symbol attributes access", {
   str <- "(1, 1, 1, 1)"
-  x = mx.symbol.Variable('x')
+  x <- mx.symbol.Variable("x")
   x$attributes <- list(`__shape__` = str)
   
   expect_equal(x$attributes$`__shape__`, str)
   
-  y = mx.symbol.Variable('y')
+  y <- mx.symbol.Variable("y")
   y$attributes$`__shape__` <- str
   
   expect_equal(y$attributes$`__shape__`, str)
diff --git a/ci/README.md b/ci/README.md
index 548e9cb9b04..69308756943 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -59,6 +59,20 @@ To work inside a container with a shell you can do:
 When building, the artifacts are located in the build/ directory in the project root. In case
 `build.py -a` is invoked, the artifacts are located in build.<platform>/
 
+# Docker container cleanup (Zombie containers)
+Docker has a client-server architecture, so when the program executing the docker client dies or
+receives a signal, the container keeps running, as it was started by the docker daemon.
+We implement signal handlers that catch SIGTERM and SIGINT and clean up containers before exiting.
+In Jenkins there is not enough time between SIGTERM and SIGKILL for this to run, so we additionally
+guarantee that containers are not left running by propagating the environment variables that the
+Jenkins process tree killer uses to identify which processes to kill when a job is stopped. This
+stops the container, since the process inside it is terminated.
+
+To verify that this works properly: on the console, hit ^C while a container is running (not just
+building) and confirm with `docker ps` in another terminal that the container has stopped. In
+Jenkins this has been tested by stopping a job with running containers and verifying with
+`docker ps` that the containers stop shortly afterwards.
+
 ## Add a platform
 
 To add a platform, you should add the appropriate dockerfile in
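
The cleanup mechanism described in the README section above is implemented in ci/build.py further
down in this diff. For illustration only, here is a minimal self-contained sketch of the same idea,
assuming the docker SDK for Python is installed (pip install docker); the image name, command and
timeout are illustrative, not the exact build.py implementation:

    import atexit
    import logging
    import signal
    import sys

    import docker  # docker SDK for Python (assumed installed)

    containers = set()  # containers started by this process

    def cleanup():
        """Stop and remove any containers we started."""
        for container in list(containers):
            try:
                container.stop(timeout=3)
                container.remove()
            except Exception as e:
                logging.exception(e)
        containers.clear()

    def signal_handler(signum, _):
        logging.warning("Signal %d received, cleaning up...", signum)
        cleanup()
        sys.exit(1)

    # Run cleanup both on normal interpreter exit and on SIGTERM/SIGINT.
    atexit.register(cleanup)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    client = docker.from_env()
    container = client.containers.run("ubuntu:16.04", "sleep 600", detach=True)
    containers.add(container)

Note that this covers SIGTERM and SIGINT but not SIGKILL, which is why the Jenkins environment
variable propagation described above is still needed.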
diff --git a/ci/build.py b/ci/build.py
index a9d6a63537f..df9e97bdb5f 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -23,7 +23,7 @@
 """
 
 __author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
-__version__ = '0.1'
+__version__ = '0.3'
 
 import argparse
 import glob
@@ -34,20 +34,63 @@
 import subprocess
 import sys
 import tempfile
-import platform
-from copy import deepcopy
 from itertools import chain
-from subprocess import call, check_call
+from subprocess import check_call, check_output
 from typing import *
 from util import *
+import docker
+import docker.models
+import docker.errors
+import signal
+import atexit
+import pprint
+
+
+class Cleanup:
+    """A class to cleanup containers"""
+    def __init__(self):
+        self.containers = set()
+        self.docker_stop_timeout = 3
+
+    def add_container(self, container: docker.models.containers.Container):
+        assert isinstance(container, docker.models.containers.Container)
+        self.containers.add(container)
+
+    def remove_container(self, container: docker.models.containers.Container):
+        assert isinstance(container, docker.models.containers.Container)
+        self.containers.remove(container)
+
+    def _cleanup_containers(self):
+        if self.containers:
+            logging.warning("Cleaning up containers")
+        else:
+            return
+        # noinspection PyBroadException
+        try:
+            stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self.docker_stop_timeout))
+        except Exception:
+            stop_timeout = 3
+        for container in self.containers:
+            try:
+                container.stop(timeout=stop_timeout)
+                logging.info("☠: stopped container %s", trim_container_id(container.id))
+                container.remove()
+                logging.info("🚽: removed container %s", trim_container_id(container.id))
+            except Exception as e:
+                logging.exception(e)
+        self.containers.clear()
+        logging.info("Cleaning up containers finished.")
 
-CCACHE_MAXSIZE = '500G'
+    def __call__(self):
+        """Perform cleanup"""
+        self._cleanup_containers()
 
-def under_ci() -> bool:
-    """:return: True if we run in Jenkins."""
-    return 'JOB_NAME' in os.environ
 
-def get_platforms(path: Optional[str] = "docker"):
+def get_dockerfiles_path():
+    return "docker"
+
+
+def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
     """Get a list of architectures given our dockerfiles"""
     dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
     dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
@@ -57,10 +100,11 @@ def get_platforms(path: Optional[str] = "docker"):
 
 
 def get_docker_tag(platform: str, registry: str) -> str:
+    """:return: docker tag to be used for the container"""
     return "{0}/build.{1}".format(registry, platform)
 
 
-def get_dockerfile(platform: str, path="docker") -> str:
+def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
     return os.path.join(path, "Dockerfile.build.{0}".format(platform))
 
 
@@ -68,18 +112,18 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
     return "nvidia-docker" if use_nvidia_docker else "docker"
 
 
-def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str:
     """
     Build a container for the given platform
     :param platform: Platform
     :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
     :param num_retries: Number of retries to build the docker image
+    :param use_cache: will pass cache_from to docker to use the previously pulled tag
     :return: Id of the top level image
     """
-
     tag = get_docker_tag(platform=platform, registry=registry)
-    logging.info("Building container tagged '%s' with %s", tag, docker_binary)
+    logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
     #
     # We add a user with the same group as the executing non-root user so files created in the
     # container match permissions of the local user. Same for the group.
@@ -90,35 +134,26 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
     # cache-from is needed so we use the cached images tagged from the remote via
     # docker pull see: docker_cache.load_docker_cache
     #
+    # This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
+    # So to use local caching, we should omit --cache-from by passing the --no-dockerhub-cache
+    # argument to this script.
+    #
     # This doesn't work with multi head docker files.
-    # 
-
-    for i in range(num_retries):
-        logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)
-
-        cmd = [docker_binary, "build",
-               "-f", get_dockerfile(platform),
-               "--build-arg", "USER_ID={}".format(os.getuid()),
-               "--build-arg", "GROUP_ID={}".format(os.getgid()),
-               "--cache-from", tag,
-               "-t", tag,
-               "docker"]
+    #
+    cmd = [docker_binary, "build",
+           "-f", get_dockerfile(platform),
+           "--build-arg", "USER_ID={}".format(os.getuid()),
+           "--build-arg", "GROUP_ID={}".format(os.getgid())]
+    if use_cache:
+        cmd.extend(["--cache-from", tag])
+    cmd.extend(["-t", tag, get_dockerfiles_path()])
+
+    @retry(subprocess.CalledProcessError, tries=num_retries)
+    def run_cmd():
         logging.info("Running command: '%s'", ' '.join(cmd))
-        try:
-            check_call(cmd)
-            # Docker build was successful. Call break to break out of the retry mechanism
-            break
-        except subprocess.CalledProcessError as e:
-            saved_exception = e
-            logging.error('Failed to build docker image')
-            # Building the docker image failed. Call continue to trigger the retry mechanism
-            continue
-    else:
-        # Num retries exceeded
-        logging.exception('Exception during build of docker image', saved_exception)
-        logging.fatal('Failed to build the docker image, aborting...')
-        sys.exit(1)
+        check_call(cmd)
 
+    run_cmd()
     # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
     # check_call would have failed
     image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
@@ -134,24 +169,29 @@ def _get_local_image_id(docker_binary, docker_tag):
     :return: Image id as string or None if tag does not exist
     """
     cmd = [docker_binary, "images", "-q", docker_tag]
-    image_id_b = subprocess.check_output(cmd)
+    image_id_b = check_output(cmd)
     image_id = image_id_b.decode('utf-8').strip()
+    if not image_id:
+        raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag))
     return image_id
 
 
 def buildir() -> str:
     return os.path.join(get_mxnet_root(), "build")
 
+
 def default_ccache_dir() -> str:
+    """:return: ccache directory for the current platform"""
     # Share ccache across containers
     if 'CCACHE_DIR' in os.environ:
+        ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
         try:
-            ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
             os.makedirs(ccache_dir, exist_ok=True)
             return ccache_dir
         except PermissionError:
             logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
     # In osx tmpdir is not mountable by default
+    import platform
     if platform.system() == 'Darwin':
         ccache_dir = "/tmp/_mxnet_ccache"
         os.makedirs(ccache_dir, exist_ok=True)
@@ -159,14 +199,41 @@ def default_ccache_dir() -> str:
     return os.path.join(tempfile.gettempdir(), "ci_ccache")
 
 
+def trim_container_id(cid):
+    """:return: trimmed container id"""
+    return cid[:12]
+
+
 def container_run(platform: str,
-                  docker_binary: str,
+                  nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
                   local_ccache_dir: str,
                   command: List[str],
-                  dry_run: bool = False,
-                  interactive: bool = False) -> str:
+                  cleanup: Cleanup,
+                  dry_run: bool = False) -> int:
+    """Run command in a container"""
+    container_wait_s = 600
+    #
+    # Environment setup
+    #
+    environment = {
+        'CCACHE_MAXSIZE': '500G',
+        'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not shared
+        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is
+                                       # mounted
+        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache
+                                              # verification.
+    }
+    # These variables are passed to the container so that the process tree killer can find
+    # runaway processes inside the container
+    # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
+    # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
+    #
+    jenkins_env_vars = ['BUILD_NUMBER', 'BUILD_ID', 'BUILD_TAG']
+    environment.update({k: os.environ[k] for k in jenkins_env_vars if k in os.environ})
+    environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
+
     tag = get_docker_tag(platform=platform, registry=docker_registry)
     mx_root = get_mxnet_root()
     local_build_folder = buildir()
@@ -174,49 +241,118 @@ def container_run(platform: str,
     os.makedirs(local_build_folder, exist_ok=True)
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
-    runlist = [docker_binary, 'run', '--rm', '-t',
-               '--shm-size={}'.format(shared_memory_size),
-               '-v', "{}:/work/mxnet".format(mx_root),  # mount mxnet root
-               '-v', "{}:/work/build".format(local_build_folder),  # mount mxnet/build for storing build artifacts
-               '-v', "{}:/work/ccache".format(local_ccache_dir),
-               '-u', '{}:{}'.format(os.getuid(), os.getgid()),
-               '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
-               '-e', 'CCACHE_TEMPDIR=/tmp/ccache',  # temp dir should be local and not shared
-               '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the container as /work/ccache is mounted
-               '-e', "CCACHE_LOGFILE=/tmp/ccache.log",  # a container-scoped log, useful for ccache verification.
-               tag]
-    runlist.extend(command)
-    cmd = '\\\n\t'.join(runlist)
+    docker_client = docker.from_env()
+    # Equivalent command
+    docker_cmd_list = [
+        get_docker_binary(nvidia_runtime),
+        'run',
+        '--rm',
+        '--shm-size={}'.format(shared_memory_size),
+        # mount mxnet root
+        '-v', "{}:/work/mxnet".format(mx_root),
+        # mount mxnet/build for storing build
+        '-v', "{}:/work/build".format(local_build_folder),
+        '-v', "{}:/work/ccache".format(local_ccache_dir),
+        '-u', '{}:{}'.format(os.getuid(), os.getgid()),
+        '-e', 'CCACHE_MAXSIZE={}'.format(environment['CCACHE_MAXSIZE']),
+        # temp dir should be local and not shared
+        '-e', 'CCACHE_TEMPDIR={}'.format(environment['CCACHE_TEMPDIR']),
+        # this path is inside the container as /work/ccache is mounted
+        '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
+        # a container-scoped log, useful for ccache verification.
+        '-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
+        '-ti',
+        tag]
+    docker_cmd_list.extend(command)
+    docker_cmd = ' \\\n\t'.join(docker_cmd_list)
+    logging.info("Running %s in container %s", command, tag)
+    logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
+    # return code of the command inside docker
     ret = 0
-    if not dry_run and not interactive:
-        logging.info("Running %s in container %s", command, tag)
-        logging.info("Executing:\n%s\n", cmd)
-        ret = call(runlist)
-
-    docker_run_cmd = ' '.join(runlist)
-    if not dry_run and interactive:
-        into_cmd = deepcopy(runlist)
-        # -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
-        idx = into_cmd.index('-u') + 2
-        into_cmd[idx:idx] = ['-ti']
-        cmd = '\\\n\t'.join(into_cmd)
-        logging.info("Executing:\n%s\n", cmd)
-        docker_run_cmd = ' '.join(into_cmd)
-        ret = call(into_cmd)
-
-    if not dry_run and not interactive and ret != 0:
-        logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd)
-        logging.error("You can get into the container by adding the -i option")
-        raise subprocess.CalledProcessError(ret, cmd)
-
-    return docker_run_cmd
+    if not dry_run:
+        #############################
+        #
+        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
+        # noinspection PyShadowingNames
+        runtime = None
+        if nvidia_runtime:
+            # noinspection PyShadowingNames
+            # runc is default (docker info | grep -i runtime)
+            runtime = 'nvidia'
+
+        container = docker_client.containers.run(
+            tag,
+            runtime=runtime,
+            detach=True,
+            command=command,
+            shm_size=shared_memory_size,
+            user='{}:{}'.format(os.getuid(), os.getgid()),
+            volumes={
+                mx_root:
+                    {'bind': '/work/mxnet', 'mode': 'rw'},
+                local_build_folder:
+                    {'bind': '/work/build', 'mode': 'rw'},
+                local_ccache_dir:
+                    {'bind': '/work/ccache', 'mode': 'rw'},
+            },
+            environment=environment)
+        logging.info("Started container: %s", trim_container_id(container.id))
+        # Race condition:
+        # If the previous call is interrupted then it's possible that the container is not cleaned up
+        # We avoid this by masking the signals temporarily
+        cleanup.add_container(container)
+        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
+        #
+        #############################
+
+        stream = container.logs(stream=True, stdout=True, stderr=True)
+        sys.stdout.flush()
+        for chunk in stream:
+            sys.stdout.buffer.write(chunk)
+            sys.stdout.buffer.flush()
+        sys.stdout.flush()
+        stream.close()
+        try:
+            logging.info("Waiting for status of container %s for %d s.",
+                         trim_container_id(container.id),
+                         container_wait_s)
+            wait_result = container.wait(timeout=container_wait_s)
+            logging.info("Container exit status: %s", wait_result)
+            ret = wait_result.get('StatusCode', 200)
+        except Exception as e:
+            logging.exception(e)
+            ret = 150
+
+        # Stop
+        try:
+            logging.info("Stopping container: %s", trim_container_id(container.id))
+            container.stop()
+        except Exception as e:
+            logging.exception(e)
+            ret = 151
+
+        # Remove
+        try:
+            logging.info("Removing container: %s", trim_container_id(container.id))
+            container.remove()
+        except Exception as e:
+            logging.exception(e)
+            ret = 152
+        cleanup.remove_container(container)
+        containers = docker_client.containers.list()
+        if containers:
+            logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+    return ret
 
 
 def list_platforms() -> str:
-    print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
+    return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))
+
 
 def load_docker_cache(tag, docker_registry) -> None:
+    """Imports tagged container from the given docker registry"""
     if docker_registry:
+        # noinspection PyBroadException
         try:
             import docker_cache
             logging.info('Docker cache download is enabled from registry %s', docker_registry)
@@ -226,19 +362,29 @@ def load_docker_cache(tag, docker_registry) -> None:
     else:
         logging.info('Distributed docker cache disabled')
 
-def main() -> int:
-    # We need to be in the same directory than the script so the commands in the dockerfiles work as
-    # expected. But the script can be invoked from a different path
-    base = os.path.split(os.path.realpath(__file__))[0]
-    os.chdir(base)
 
-    logging.getLogger().setLevel(logging.INFO)
+def log_environment():
+    instance_id = ec2_instance_id_hostname()
+    if instance_id:
+        logging.info("EC2 Instance id: %s", instance_id)
+    pp = pprint.PrettyPrinter(indent=4)
+    logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
 
-    def script_name() -> str:
-        return os.path.split(sys.argv[0])[1]
 
+def script_name() -> str:
+    """:returns: script name with leading paths removed"""
+    return os.path.split(sys.argv[0])[1]
+
+
+def main() -> int:
+    logging.getLogger().setLevel(logging.INFO)
+    logging.getLogger("requests").setLevel(logging.WARNING)
     logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))
 
+    logging.info("MXNet container based build tool.")
+    log_environment()
+    chdir_to_script_directory()
+
     parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
     containers""", epilog="")
     parser.add_argument("-p", "--platform",
@@ -270,10 +416,6 @@ def script_name() -> str:
                         help="print docker run command for manual inspection",
                         action='store_true')
 
-    parser.add_argument("-i", "--interactive",
-                        help="go in a shell inside the container",
-                        action='store_true')
-
     parser.add_argument("-d", "--docker-registry",
                         help="Dockerhub registry name to retrieve cache from. Default is 'mxnetci'",
                         default='mxnetci',
@@ -284,8 +426,10 @@ def script_name() -> str:
                         default=1,
                         type=int)
 
-    parser.add_argument("-c", "--cache", action="store_true",
-                        help="Enable docker registry cache")
+    parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
+                        help="Disables use of --cache-from option on docker build, allowing docker"
+                        " to use local layers for caching. If absent, we use the cache from dockerhub"
+                        " which is the default.")
 
     parser.add_argument("command",
                         help="command to run in the container",
@@ -293,68 +437,94 @@ def script_name() -> str:
 
     parser.add_argument("--ccache-dir",
                         default=default_ccache_dir(),
-                        help="Ccache directory",
+                        help="ccache directory",
                         type=str)
 
     args = parser.parse_args()
+
     def use_cache():
-        return args.cache or under_ci()
+        return not args.no_dockerhub_cache or under_ci()
 
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
-    shared_memory_size = args.shared_memory_size
-    num_docker_build_retires = args.docker_build_retries
+
+    # Cleanup on signals and exit
+    cleanup = Cleanup()
+
+    def signal_handler(signum, _):
+        signal.pthread_sigmask(signal.SIG_BLOCK, {signum})
+        logging.warning("Signal %d received, cleaning up...", signum)
+        cleanup()
+        logging.warning("done. Exiting with error.")
+        sys.exit(1)
+
+    atexit.register(cleanup)
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
 
     if args.list:
-        list_platforms()
+        print(list_platforms())
     elif args.platform:
         platform = args.platform
         tag = get_docker_tag(platform=platform, registry=args.docker_registry)
         if use_cache():
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires)
+        build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
+                     num_retries=args.docker_build_retries, use_cache=use_cache())
         if args.build_only:
             logging.warning("Container was just built. Exiting due to build-only.")
             return 0
 
+        # noinspection PyUnusedLocal
+        ret = 0
         if command:
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=command, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir, interactive=args.interactive)
+            ret = container_run(
+                platform=platform, nvidia_runtime=args.nvidiadocker,
+                shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
         elif args.print_docker_run:
-            print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                                command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
-        elif args.interactive:
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=command, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir, interactive=args.interactive)
-
+            command = []
+            ret = container_run(
+                platform=platform, nvidia_runtime=args.nvidiadocker,
+                shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
+                local_ccache_dir=args.ccache_dir, dry_run=True, cleanup=cleanup)
         else:
             # With no commands, execute a build function for the target platform
-            assert not args.interactive, "when running with -i must provide a command"
-            cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
-            logging.info("No command specified, trying default build: %s", ' '.join(cmd))
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir)
+            command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
+            logging.info("No command specified, trying default build: %s", ' '.join(command))
+            ret = container_run(
+                platform=platform, nvidia_runtime=args.nvidiadocker,
+                shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
+
+        if ret != 0:
+            logging.critical("Execution of %s failed with status: %d", command, ret)
+            return ret
 
     elif args.all:
         platforms = get_platforms()
-        logging.info("Building for all architectures: {}".format(platforms))
+        logging.info("Building for all architectures: %s", platforms)
         logging.info("Artifacts will be produced in the build/ directory.")
         for platform in platforms:
             tag = get_docker_tag(platform=platform, registry=args.docker_registry)
             if use_cache():
                 load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires)
+            build_docker(platform, docker_binary=docker_binary, registry=args.docker_registry,
+                         num_retries=args.docker_build_retries, use_cache=use_cache())
             if args.build_only:
                 continue
-            build_platform = "build_{}".format(platform)
-            cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
             shutil.rmtree(buildir(), ignore_errors=True)
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
-            plat_buildir = os.path.join(get_mxnet_root(), build_platform)
+            build_platform = "build_{}".format(platform)
+            plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
+                                                        "mxnet_{}".format(build_platform)))
+            if os.path.exists(plat_buildir):
+                logging.warning("%s already exists, skipping", plat_buildir)
+                continue
+            command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
+            container_run(
+                platform=platform, nvidia_runtime=args.nvidiadocker,
+                shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
 
@@ -377,10 +547,6 @@ def use_cache():
 
     Will print a docker run command to get inside the container in a shell
 
-./build.py -p armv7 --interactive
-
-    Will execute a shell into the container
-
 ./build.py -a
 
     Builds for all platforms and leaves artifacts in build_<platform>
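The refactored dispatch above relies on `container_run` returning the container's exit status instead of raising on failure, so `main` can log and propagate it. A minimal sketch of that contract, with the docker invocation details elided and the image tag a placeholder:

```python
import logging
import subprocess
from typing import List

def container_run(command: List[str], dry_run: bool = False) -> int:
    # The real helper also wires up volumes, the nvidia runtime, shared
    # memory size and the ccache directory; only the exit-status contract
    # is sketched here.
    runlist = ["docker", "run", "--rm", "build-image-placeholder"] + command
    if dry_run:
        print(" ".join(runlist))
        return 0
    return subprocess.call(runlist)

def main() -> int:
    command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_armv7"]
    ret = container_run(command=command)
    if ret != 0:
        logging.critical("Execution of %s failed with status: %d", command, ret)
    return ret
```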
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 6316270f9cf..2ad3bea519c 100755
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -18,7 +18,7 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
-FROM dockcross/linux-armv7
+FROM mxnetci/dockcross-linux-armv7:08212018
 
 ENV ARCH armv7l
 ENV HOSTCC gcc
diff --git a/ci/docker/install/ubuntu_clang.sh b/ci/docker/install/ubuntu_clang.sh
index 39a5600ce9d..cb0f234a1c1 100755
--- a/ci/docker/install/ubuntu_clang.sh
+++ b/ci/docker/install/ubuntu_clang.sh
@@ -21,11 +21,16 @@
 # the whole docker cache for the image
 
 set -ex
-# Install clang 3.9 (the same version as in XCode 8.*) and 5.0 (latest major release)
+# Install clang 3.9 (the same version as in XCode 8.*) and 6.0 (latest major release)
 wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
     apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-3.9 main" && \
-    apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main" && \
+    apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main" && \
     apt-get update && \
-    apt-get install -y clang-3.9 clang-5.0 && \
+    apt-get install -y clang-3.9 clang-6.0 clang-tidy-6.0 && \
     clang-3.9 --version && \
-    clang-5.0 --version
+    clang-6.0 --version
+
+# Use llvm's master version of run-clang-tidy.py.  This version has mostly minor updates, but
+# importantly will properly return a non-zero exit code when an error is reported in clang-tidy.
+# Please remove the below if we install a clang version higher than 6.0.
+wget https://raw.githubusercontent.com/llvm-mirror/clang-tools-extra/7654135f0cbd155c285fd2a37d87e27e4fff3071/clang-tidy/tool/run-clang-tidy.py -O /usr/lib/llvm-6.0/share/clang/run-clang-tidy.py
diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh
index 4efa1f77e92..7e17295f420 100755
--- a/ci/docker/install/ubuntu_mklml.sh
+++ b/ci/docker/install/ubuntu_mklml.sh
@@ -21,5 +21,5 @@
 # the whole docker cache for the image
 
 set -ex
-wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz
-tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.14/mklml_lnx_2018.0.3.20180406.tgz
+tar -zxf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
diff --git a/ci/docker/install/ubuntu_tvm.sh b/ci/docker/install/ubuntu_tvm.sh
index 4f5cb4251ad..2ee4e534ed9 100755
--- a/ci/docker/install/ubuntu_tvm.sh
+++ b/ci/docker/install/ubuntu_tvm.sh
@@ -25,14 +25,14 @@ cd tvm
 # This is a stable tag that supports the MXNet TVM bridge.
 # We use this since support for the mxnet bridge was only just checked
 # into master and there was not yet a version tag
-git checkout 30eaf463e34d7c301357c31a010945d11df16537
+git checkout v0.4
+
+cp cmake/config.cmake .
+echo set\(USE_CUDA /usr/local/cuda\) >> config.cmake
+echo set\(USE_LLVM llvm-config-5.0\) >> config.cmake
+echo set\(USE_RPC ON\) >> config.cmake
+echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
 
-cp make/config.mk
-echo USE_CUDA=1 >> config.mk
-echo LLVM_CONFIG=llvm-config-5.0 >> config.mk
-echo USE_RPC=1 >> config.mk
-echo USE_GRAPH_RUNTIME=1 >> config.mk
-echo CUDA_PATH=/usr/local/cuda >> config.mk
 make -j$(nproc)
 
 cd python
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 35311396e34..0b6a42c3cda 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -349,11 +349,11 @@ build_ubuntu_cpu_clang39() {
         -j$(nproc)
 }
 
-build_ubuntu_cpu_clang50() {
+build_ubuntu_cpu_clang60() {
     set -ex
 
-    export CXX=clang++-5.0
-    export CC=clang-5.0
+    export CXX=clang++-6.0
+    export CC=clang-6.0
 
     build_ccache_wrappers
 
@@ -365,6 +365,32 @@ build_ubuntu_cpu_clang50() {
         -j$(nproc)
 }
 
+build_ubuntu_cpu_clang_tidy() {
+    set -ex
+
+    export CXX=clang++-6.0
+    export CC=clang-6.0
+    export CLANG_TIDY=/usr/lib/llvm-6.0/share/clang/run-clang-tidy.py
+
+    pushd .
+    cd /work/build
+    cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DUSE_CUDA=OFF \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_OPENCV=ON \
+        -DCMAKE_BUILD_TYPE=Debug \
+        -G Ninja \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+        /work/mxnet
+
+    ninja -v
+    cd /work/mxnet
+    $CLANG_TIDY -p /work/build -j $(nproc) -clang-tidy-binary clang-tidy-6.0 /work/mxnet/src
+    popd
+}
+
 build_ubuntu_cpu_clang39_mkldnn() {
     set -ex
 
@@ -381,11 +407,11 @@ build_ubuntu_cpu_clang39_mkldnn() {
         -j$(nproc)
 }
 
-build_ubuntu_cpu_clang50_mkldnn() {
+build_ubuntu_cpu_clang60_mkldnn() {
     set -ex
 
-    export CXX=clang++-5.0
-    export CC=clang-5.0
+    export CXX=clang++-6.0
+    export CC=clang-6.0
 
     build_ccache_wrappers
 
@@ -567,6 +593,9 @@ build_ubuntu_gpu_cmake() {
     ninja -v
 }
 
+build_ubuntu_blc() {
+    echo "pass"
+}
 
 # Testing
 
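`build_ubuntu_cpu_clang_tidy` above drives `run-clang-tidy.py` against the `compile_commands.json` that `-DCMAKE_EXPORT_COMPILE_COMMANDS=ON` writes into `/work/build`. A hedged sketch of the equivalent invocation from Python, assuming the pinned script is executable:

```python
import multiprocessing
import subprocess

CLANG_TIDY = "/usr/lib/llvm-6.0/share/clang/run-clang-tidy.py"

# run-clang-tidy picks up the checks from .clang-tidy and the compilation
# database from the directory passed via -p; with the pinned master version
# a non-zero exit status means at least one check reported an error, so
# check_call turns that into a CI failure.
subprocess.check_call([
    CLANG_TIDY,
    "-p", "/work/build",
    "-j", str(multiprocessing.cpu_count()),
    "-clang-tidy-binary", "clang-tidy-6.0",
    "/work/mxnet/src",
])
```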
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 7a6d1106d38..bebcb25fb8f 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -30,6 +30,7 @@
 import sys
 import subprocess
 import json
+from typing import *
 import build as build_util
 
 
@@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
     return 1 if is_error else 0
 
 
-def _build_save_container(platform, registry, load_cache) -> str:
+def _build_save_container(platform, registry, load_cache) -> Optional[str]:
     """
     Build image for passed platform and upload the cache to the specified S3 bucket
     :param platform: Platform
@@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> str:
     logging.debug('Building %s as %s', platform, docker_tag)
     try:
         # Increase the number of retries for building the cache.
-        image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10)
+        image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, use_cache=True)
         logging.info('Built %s as %s', docker_tag, image_id)
 
         # Push cache to registry
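The `Optional[str]` annotation makes the existing contract of `_build_save_container` explicit: the platform name on failure, `None` on success. A minimal illustration, with a hypothetical stub, of how a caller such as `build_save_containers` can aggregate that:

```python
from typing import List, Optional

def _build_save_container(platform: str) -> Optional[str]:
    # Hypothetical stub mirroring the annotated signature above:
    # return the platform name on failure, None on success.
    return None

def build_save_containers(platforms: List[str]) -> int:
    failed = [p for p in platforms if _build_save_container(p) is not None]
    return 1 if failed else 0
```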
diff --git a/ci/other/pylintrc b/ci/other/pylintrc
index db3da4cae57..841a3bea13f 100644
--- a/ci/other/pylintrc
+++ b/ci/other/pylintrc
@@ -83,24 +83,15 @@ enable=indexing-exception,old-raise-syntax,undefined-variable
 # no Warning level messages displayed, use"--disable=all --enable=classes
 # --disable=W"
 disable=
-    design,
-    similarities,
     no-self-use,
     attribute-defined-outside-init,
-    locally-disabled,
-    star-args,
-    pointless-except,
     bad-option-value,
     global-statement,
     fixme,
-    suppressed-message,
-    useless-suppression,
-    locally-enabled,
     no-member,
     no-name-in-module,
     import-error,
     unsubscriptable-object,
-    unbalanced-tuple-unpacking,
     protected-access,
     superfluous-parens,
     invalid-name,
@@ -111,15 +102,18 @@ disable=
     chained-comparison,
     consider-using-dict-comprehension,
     consider-using-set-comprehension,
-    invalid-envvar-default,
-    singleton-comparison,
     try-except-raise,
     useless-object-inheritance,
-    useless-return,
     c-extension-no-member,
     deprecated-lambda,
     redefined-builtin,
-    unexpected-keyword-arg
+    too-few-public-methods,
+    too-many-arguments,
+    too-many-branches,
+    too-many-instance-attributes,
+    too-many-locals,
+    too-many-public-methods,
+    too-many-statements
 
 # disable=unicode-builtin,delslice-method,using-cmp-argument,setslice-method,dict-view-method,parameter-unpacking,range-builtin-not-iterating,print-statement,file-builtin,old-raise-syntax,basestring-builtin,execfile-builtin,indexing-exception,import-star-module-level,coerce-method,long-builtin,old-ne-operator,old-division,no-absolute-import,raw_input-builtin,old-octal-literal,oct-method,xrange-builtin,hex-method,unpacking-in-except,nonzero-method,raising-string,intern-builtin,reload-builtin,metaclass-assignment,cmp-method,filter-builtin-not-iterating,apply-builtin,map-builtin-not-iterating,next-method-called,unichr-builtin,buffer-builtin,dict-iter-method,input-builtin,coerce-builtin,getslice-method,useless-suppression,standarderror-builtin,zip-builtin-not-iterating,suppressed-message,cmp-builtin,backtick,long-suffix,reduce-builtin,round-builtin
 
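The checks disabled above (`too-many-arguments` and friends) are turned off repo-wide. For comparison, the same suppression can be scoped to a single definition with an inline pragma, which keeps the check live elsewhere; the signature below is only a hypothetical mirror of the `container_run` parameters seen earlier in the diff:

```python
def container_run(platform, nvidia_runtime, shared_memory_size, command,
                  docker_registry, local_ccache_dir, cleanup=False,
                  dry_run=False):  # pylint: disable=too-many-arguments
    """Pragma on the def line silences the check for this function only."""
```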
diff --git a/tests/travis/travis_after_failure.sh b/ci/travis/install.sh
old mode 100755
new mode 100644
similarity index 70%
rename from tests/travis/travis_after_failure.sh
rename to ci/travis/install.sh
index 50754c9546c..d04dda7e87f
--- a/tests/travis/travis_after_failure.sh
+++ b/ci/travis/install.sh
@@ -17,10 +17,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
-
-if [ ${TASK} == "r_test" ]; then
-    echo "Print the install log..."
-    cat mxnet.Rcheck/*.out
-    echo "Print the check log..."
-    cat mxnet.Rcheck/*.log
+if [ ${TRAVIS_OS_NAME} == "osx" ]; then
+    brew update
+    brew install opencv
+    brew install python3
+    brew install fftw
+    brew install libpng
+    brew install ImageMagick
+    brew install swig
+    python -m pip install --user nose numpy cython scipy requests
+    python3 -m pip install --user nose numpy cython scipy requests
 fi
diff --git a/ci/util.py b/ci/util.py
index 22631f30435..4d68b57a3af 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -17,6 +17,8 @@
 
 import os
 import contextlib
+import logging
+import requests
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -41,3 +43,73 @@ def remember_cwd():
     finally: os.chdir(curdir)
 
 
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+    """Retry calling the decorated function using an exponential backoff.
+
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+    :param target_exception: the exception to check. may be a tuple of
+        exceptions to check
+    :type target_exception: Exception or tuple
+    :param tries: number of times to try (not retry) before giving up
+    :type tries: int
+    :param delay_s: initial delay between retries in seconds
+    :type delay_s: int
+    :param backoff: backoff multiplier e.g. value of 2 will double the delay
+        each retry
+    :type backoff: int
+    """
+    import time
+    from functools import wraps
+
+    def decorated_retry(f):
+        @wraps(f)
+        def f_retry(*args, **kwargs):
+            mtries, mdelay = tries, delay_s
+            while mtries > 1:
+                try:
+                    return f(*args, **kwargs)
+                except target_exception as e:
+                    logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay)
+                    time.sleep(mdelay)
+                    mtries -= 1
+                    mdelay *= backoff
+            return f(*args, **kwargs)
+
+        return f_retry  # true decorator
+
+    return decorated_retry
+
+
+# noinspection SyntaxError
+def under_ci() -> bool:
+    """:return: True if we run in Jenkins."""
+    return 'JOB_NAME' in os.environ
+
+
+def ec2_instance_id_hostname() -> str:
+    if under_ci():
+        result = []
+        try:
+            r = requests.get("http://instance-data/latest/meta-data/instance-id")
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            r = requests.get("http://instance-data/latest/meta-data/public-hostname")
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            return ' '.join(result)
+        except requests.exceptions.ConnectionError:
+            pass
+        return '?'
+    else:
+        return ''
+
+
+def chdir_to_script_directory():
+    # We need to be in the same directory as the script so the commands in the
+    # Dockerfiles work as expected, but the script can be invoked from a different path.
+    base = os.path.split(os.path.realpath(__file__))[0]
+    os.chdir(base)
+
+
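A short usage sketch for the `retry` decorator added above: the wrapped call is attempted up to `tries` times with exponentially growing sleeps (1s, 2s, 4s, ...), and the final attempt propagates the exception. The import path is an assumption:

```python
import requests
from util import retry  # assuming ci/util.py is importable as `util`

@retry(requests.exceptions.ConnectionError, tries=4, delay_s=1, backoff=2)
def fetch_instance_id() -> str:
    # Transient metadata-service hiccups are retried; anything still
    # failing on the final attempt raises to the caller.
    r = requests.get("http://instance-data/latest/meta-data/instance-id",
                     timeout=2)
    r.raise_for_status()
    return r.content.decode()
```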
diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md
index 5e7356caf64..ea678ccf2db 100644
--- a/contrib/clojure-package/README.md
+++ b/contrib/clojure-package/README.md
@@ -107,7 +107,9 @@ The jars from maven with the needed MXNet native binaries in it. On startup, the
 
 ### Build from MXNET Source
 
-Checkout the latest sha from the main package
+First, ensure you have JDK 8 on your system. Later versions may produce cryptic build errors mentioning `scala.reflect.internal.MissingRequirementError`. 
+
+Check out the latest SHA from the main package:
 
 `git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet`
 `cd ~/mxnet`
diff --git a/contrib/clojure-package/ci-test.sh b/contrib/clojure-package/ci-test.sh
index eda3919f5ce..ba2d258e12d 100755
--- a/contrib/clojure-package/ci-test.sh
+++ b/contrib/clojure-package/ci-test.sh
@@ -21,3 +21,4 @@ set -evx
 MXNET_HOME=${PWD}
 cd ${MXNET_HOME}/contrib/clojure-package
 lein test
+lein cloverage --codecov
diff --git a/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj b/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
index 0dd0e4daeb2..bec71dee81f 100644
--- a/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
+++ b/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
@@ -50,7 +50,6 @@ net ;=> #object[org.apache.mxnet.Symbol 0x5c78c8c2 "org.apache.mxnet.Symbol@5c78
 (def b (sym/variable "b"))
 (def c (sym/+ a b))
 
-
 ;; Each symbol takes a (unique) string name. NDArray and Symbol both represent a single tensor. Operators represent the computation between tensors. Operators take symbol (or NDArray) as inputs and might also additionally accept other hyperparameters such as the number of hidden neurons (num_hidden) or the activation type (act_type) and produce the output.
 
 ;; We can view a symbol simply as a function taking several arguments. And we can retrieve those arguments with the following method call:
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj
index e37a8bc8c98..7ca4ede9733 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray.clj
@@ -89,7 +89,7 @@
    (NDArray/arange (float start) ($/option (float stop)) step repeat ctx dtype))
   ([start stop]
    (arange start stop {})))
-
+  
 (defn slice
   "Return a sliced NDArray that shares memory with current one."
   ([ndarray i]
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/profiler.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/profiler.clj
index 48fd0414d82..5b4f9b19813 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/profiler.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/profiler.clj
@@ -36,7 +36,7 @@
   ([state]
    (Profiler/profilerSetState state))
   ([]
-   (profiler-set-state false)))
+   (profiler-set-state "stop")))
 
 (defn dump-profile
   " Dump profile and stop profiler. Use this to save profile
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol.clj
index 42ae034eb6d..12135fb75ca 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol.clj
@@ -135,10 +135,20 @@
   ([start stop  {:keys [step repeat dtype]
                  :or {step (float 1) repeat (int 1) dtype base/MX_REAL_TYPE}
                  :as opts}]
-   (Symbol/arange (float start) ($/option (float stop)) step repeat nil dtype))
+   (Symbol/arange (float start) ($/option (float stop)) step repeat false nil dtype))
   ([start stop]
    (arange start stop {})))
 
+(defn arange-with-inference
+  "Behaves like arange operator, but infers the stop value from the output shape, 
+   which must be known from the rest of the net."
+  ([start {:keys [step repeat dtype]
+           :or {step (float 1) repeat (int 1) dtype base/MX_REAL_TYPE}
+          :as opts}]
+   (Symbol/arange (float start) ($/option nil) step repeat true nil dtype))
+  ([start]
+   (arange-with-inference start {})))
+
 ;;; manually defined because of a conflicting arity of 2 with the auto-gen
 (defn min
   ([sym-name kwargs-map symbol-list kwargs-map-1]
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/initializer_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/initializer_test.clj
new file mode 100644
index 00000000000..288a41496f0
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/initializer_test.clj
@@ -0,0 +1,45 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.initializer-test
+  (:require [org.apache.clojure-mxnet.initializer :as initializer]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [clojure.test :refer :all]))
+
+(defn exercise-initializer [init]
+  (-> init
+      (initializer/init-weight "test-weight" (ndarray/zeros [3 3])))
+
+  (is (number?
+       (-> init
+           (initializer/apply "test-weight" (ndarray/zeros [3 3]))
+           (ndarray/->vec)
+           (first)))))
+
+(deftest test-uniform
+  (exercise-initializer (initializer/uniform))
+  (exercise-initializer (initializer/uniform 0.8)))
+
+(deftest test-normal
+  (exercise-initializer (initializer/normal))
+  (exercise-initializer (initializer/normal 0.2)))
+
+(deftest test-xavier
+  (exercise-initializer (initializer/xavier))
+  (exercise-initializer (initializer/xavier {:rand-type "gaussian"
+                                             :factor-type "in"
+                                             :magnitude 2})))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/lr_scheduler_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/lr_scheduler_test.clj
new file mode 100644
index 00000000000..c60389a8702
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/lr_scheduler_test.clj
@@ -0,0 +1,24 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.lr-scheduler-test
+  (:require [org.apache.clojure-mxnet.lr-scheduler :as lr-scheduler]
+            [clojure.test :refer :all]))
+
+(deftest test-factor-scheduler
+  ;; just exercising
+  (lr-scheduler/factor-scheduler 2 0.3))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
index a71a312e1ae..1b4b2ea2fbe 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
@@ -222,6 +222,17 @@
     (is (= 0 (count (executor/grad-arrays exec))))
     (is (approx= 1e-4 result (-> (executor/outputs exec) (first))))))
 
+(deftest test-arange-with-inference
+  (let [arange (sym/arange-with-inference 0)
+        data (sym/variable "data")
+        added (sym/+ arange data)
+        result (range 0 4)
+        data-tmp (ndarray/zeros [4])
+        exec (sym/bind added (context/default-context) {"data" data-tmp})]
+    (executor/forward exec)
+    (is (= 0 (count (executor/grad-arrays exec))))
+    (is (approx= 1e-4 result (-> (executor/outputs exec) (first))))))
+
 (deftest test-scalar-pow
   (let [data (sym/variable "data")
         shape-vec [1 1]
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/profiler_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/profiler_test.clj
new file mode 100644
index 00000000000..f4b74343fa1
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/profiler_test.clj
@@ -0,0 +1,31 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.profiler-test
+  (:require [org.apache.clojure-mxnet.profiler :as profiler]
+            [clojure.test :refer :all]))
+
+;; Just exercising the interop
+
+(deftest test-profiler
+  (do
+    (profiler/profiler-set-config  {:filename "test-profile.json"
+                                    :profile-symbolic 1})
+    (profiler/profiler-set-state "run")
+    (profiler/profiler-set-state "stop")
+    (profiler/profiler-set-state)
+    (profiler/dump-profile 0)))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
index 6df2a10f888..89b51237d3a 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
@@ -17,9 +17,12 @@
 
 (ns org.apache.clojure-mxnet.symbol-test
   (:require [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.executor :as executor]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
             [org.apache.clojure-mxnet.symbol :as sym]
             [org.apache.clojure-mxnet.util :as util]
-            [clojure.test :refer :all]))
+            [clojure.test :refer :all]
+            [org.apache.clojure-mxnet.context :as context]))
 
 (deftest test-compose
   (let [data (sym/variable "data")
@@ -61,3 +64,30 @@
   (let [data (sym/variable "data")
         data2 (sym/clone data)]
     (is (= (sym/to-json data) (sym/to-json data2)))))
+
+(deftest test-basic-bind
+  (let [a (sym/variable "a")
+        b (sym/variable "b")
+        c (sym/+ a b)
+        ex (sym/bind c {"a" (ndarray/ones [2 2]) "b" (ndarray/ones [2 2])})]
+    (is (= [2.0 2.0 2.0 2.0] (-> (executor/forward ex)
+                                 (executor/outputs)
+                                 (first)
+                                 (ndarray/->vec))))))
+
+(deftest test-simple-bind
+  (let [a (sym/ones [3])
+        b (sym/ones [3])
+        c (sym/+ a b)
+        ex (sym/simple-bind c (context/default-context))]
+    (is (= [2.0 2.0 2.0]  (-> (executor/forward ex)
+                              (executor/outputs)
+                              (first)
+                              (ndarray/->vec))))))
+
+(deftest test-infer-shape
+  (let [a (sym/variable "a")
+        b (sym/variable "b")
+        c (sym/+ a b)
+        [arg-shapes out-shapes] (sym/infer-shape c {"a" [2 2] "b" [2 2]})]
+    (is (= [[2 2] [2 2]] arg-shapes))
+    (is (= [[2 2]] out-shapes))))
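For readers more familiar with the Python frontend, `test-infer-shape` above corresponds to `Symbol.infer_shape`, which additionally returns auxiliary-state shapes:

```python
import mxnet as mx

a = mx.sym.Variable("a")
b = mx.sym.Variable("b")
c = a + b
arg_shapes, out_shapes, aux_shapes = c.infer_shape(a=(2, 2), b=(2, 2))
assert arg_shapes == [(2, 2), (2, 2)]
assert out_shapes == [(2, 2)]
assert aux_shapes == []  # no auxiliary states for a plain add
```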
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/test_util.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/test_util.clj
index dcdbea64579..ecd54ca7277 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/test_util.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/test_util.clj
@@ -22,6 +22,8 @@
   (if (and (number? x) (number? y))
     (let [diff (Math/abs (- x y))]
       (< diff tolerance))
-    (reduce (fn [x y] (and x y))
-            (map #(approx= tolerance %1 %2) x y))))
+    (and
+      (= (count x) (count y))
+      (reduce (fn [x y] (and x y))
+              (map #(approx= tolerance %1 %2) x y)))))
 
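The fix above guards the element-wise comparison with a length check; without it, collections of different lengths are compared only over the shorter one and can wrongly pass. The same idea in Python, mirroring the new test cases:

```python
def approx_eq(tolerance, x, y):
    if isinstance(x, (int, float)) and isinstance(y, (int, float)):
        return abs(x - y) < tolerance
    # zip() truncates silently, so unequal lengths must be rejected first --
    # exactly the case the Clojure fix handles.
    return len(x) == len(y) and all(
        approx_eq(tolerance, a, b) for a, b in zip(x, y))

assert not approx_eq(1e-9, [1, 1, 1, 1], [1, 1, 1, 1, 9, 9, 9, 9])
assert approx_eq(2, [1, 1, 1, 1], [1, 1, 1, 2])
```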
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
index 5551fab435f..de3480827ba 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
@@ -21,6 +21,7 @@
             [org.apache.clojure-mxnet.util :as util]
             [org.apache.clojure-mxnet.ndarray :as ndarray]
             [org.apache.clojure-mxnet.symbol :as sym]
+            [org.apache.clojure-mxnet.test-util :as test-util]
             [clojure.spec.alpha :as s])
   (:import (org.apache.mxnet Shape NDArrayFuncReturn NDArray)
            (scala.collection Map Set)
@@ -183,3 +184,10 @@
 (deftest test-validate
   (is (nil? (util/validate! string? "foo" "Not a string!")))
   (is (thrown-with-msg? Exception #"Not a string!" (util/validate! ::x 1 "Not a string!"))))
+
+(deftest test-approx=
+  (let [data1 [1 1 1 1]
+        data2 [1 1 1 1 9 9 9 9]
+        data3 [1 1 1 2]]
+    (is (not (test-util/approx= 1e-9 data1 data2)))
+    (is (test-util/approx= 2 data1 data3))))
\ No newline at end of file
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj
new file mode 100644
index 00000000000..a2bea947839
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj
@@ -0,0 +1,32 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.visualization-test
+  (:require [org.apache.clojure-mxnet.symbol :as sym]
+            [org.apache.clojure-mxnet.visualization :as viz]
+            [clojure.test :refer :all])
+  (:import (org.apache.mxnet Visualization$Dot)))
+
+(deftest test-plot-network
+  (let [to-plot-sym (as-> (sym/variable "data") data
+                      (sym/flatten "fl" {:data data})
+                      (sym/softmax-output "softmax" {:data data}))
+        dot (viz/plot-network to-plot-sym
+                              {"data" [1 1 28 28]}
+                              {:title "foo"
+                               :node-attrs {:shape "oval" :fixedsize "false"}})]
+    (is (instance? Visualization$Dot dot))))
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index 87e50b8aa86..b43f3919a31 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -8,7 +8,7 @@ $(document).ready(function () {
     function label(lbl) {
         return lbl.replace(/[ .]/g, '-').toLowerCase();
     }
-   
+
     function urlSearchParams(searchString) {
         let urlDict = new Map();
         let searchParams = searchString.substring(1).split("&");
@@ -45,11 +45,11 @@ $(document).ready(function () {
         showContent();
         if (window.location.href.indexOf("/install/index.html") >= 0) {
             if (versionSelect.indexOf(defaultVersion) >= 0) {
-                history.pushState(null, null, '/install/index.html?platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, 'index.html?platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             } else {
-                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+                history.pushState(null, null, 'index.html?version=' + versionSelect + '&platform=' + platformSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
             }
-        } 
+        }
     }
 
     function showContent() {
@@ -73,22 +73,22 @@ $(document).ready(function () {
             $('.current-version').html( $(this).text() + ' <span class="caret"></span></button>' );
             if ($(this).text().indexOf(defaultVersion) < 0) {
                 if (window.location.search.indexOf("version") < 0) {
-                    history.pushState(null, null, '/install/index.html' + window.location.search.concat( '&version=' + $(this).text() ));
+                    history.pushState(null, null, 'index.html' + window.location.search.concat( '&version=' + $(this).text() ));
                 } else {
-                    history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('version'), $(this).text() ));
+                    history.pushState(null, null, 'index.html' + window.location.search.replace( urlParams.get('version'), $(this).text() ));
                 }
             } else if (window.location.search.indexOf("version") >= 0) {
-                  history.pushState(null, null, '/install/index.html' + window.location.search.replace( 'version', 'prev' ));
+                  history.pushState(null, null, 'index.html' + window.location.search.replace( 'version', 'prev' ));
               }
         }
         else if ($(this).hasClass("platforms")) {
-            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('platform'), $(this).text() ));
+            history.pushState(null, null, 'index.html' + window.location.search.replace( urlParams.get('platform'), $(this).text() ));
         }
         else if ($(this).hasClass("languages")) {
-            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('language'), $(this).text() ));
+            history.pushState(null, null, 'index.html' + window.location.search.replace( urlParams.get('language'), $(this).text() ));
         }
         else if ($(this).hasClass("processors")) {
-            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('processor'), $(this).text() ));
+            history.pushState(null, null, 'index.html' + window.location.search.replace( urlParams.get('processor'), $(this).text() ));
         }
         showContent();
         //window.location.search = window.location.search.replace( urlParams.get('version'), $(this).text() );
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index 3647e23a736..c8417ef023c 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -1,8 +1,8 @@
 <div id="splash">
   <div class="container">
     <div class="row">
-      <div class="col-lg-12">
-          <div id="banner-title" class='col-sm-6 col-xs-12'><span>Apache MXNet (Incubating)</span>
+      <div class="col-lg-6">
+          <div id="banner-title"><span>Apache MXNet (Incubating)</span>
             <p id="landing-title">A flexible and efficient library for deep learning.</p>
             <div id='landing-btn-blk'>
                 <div id="install_blk">
@@ -12,7 +12,10 @@
                     <a href="faq/why_mxnet.html" id="why_mxnet_btn">Learn More</a>
                 </div>
             </div>
-        </div>
+          </div>
+      </div>
+      <div class="col-lg-6" style="padding-top: 60px;">
+        <a href="http://www.apachecon.com/acna18/" class="section-tout-promo"><img src="http://www.apachecon.com/acna18/banners/acna-sleek-highres.png" width="65%" alt="apachecon"/></a>
       </div>
     </div>
   </div>
@@ -31,10 +34,10 @@ <h3>MXNet 1.2.1 Released</h3>
         <a href="https://github.com/apache/incubator-mxnet/releases/tag/1.2.1">Learn More</a>
       </div>
       <div class="col-lg-4 col-sm-12">
-          <h3>Introducing the Scala Inference API</h3>
-          <p>A model loading and inference API is now available for Scala developers. Try out the examples for single shot detection and loading models for image classification.
-          </p>
-          <a href="api/scala/index.html">Learn More</a>
+        <h3>Introducing the Scala Inference API</h3>
+        <p>A model loading and inference API is now available for Scala developers. Try out the examples for single shot detection and loading models for image classification.
+        </p>
+        <a href="api/scala/index.html">Learn More</a>
       </div>
     </div>
   </div>
diff --git a/docs/_static/searchtools_custom.js b/docs/_static/searchtools_custom.js
index dcc147329b1..5f8c30a24f1 100644
--- a/docs/_static/searchtools_custom.js
+++ b/docs/_static/searchtools_custom.js
@@ -8,14 +8,14 @@
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
- * 
+ *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
- * 
+ *
  * * Redistributions in binary form must reproduce the above copyright
  *   notice, this list of conditions and the following disclaimer in the
  *   documentation and/or other materials provided with the distribution.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -523,7 +523,7 @@ var Search = {
             displayNextItem();
           });
         } else if (DOCUMENTATION_OPTIONS.HAS_SOURCE) {
-          $.ajax({url: DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' + item[0] + '.txt',
+          $.ajax({url: DOCUMENTATION_OPTIONS.URL_ROOT + '_sources/' + item[0] + '.md.txt',
                   dataType: "text",
                   complete: function(jqxhr, textstatus) {
                     var data = jqxhr.responseText;
diff --git a/docs/api/clojure/index.md b/docs/api/clojure/index.md
index 3eeffff0a4e..32abbe06ad7 100644
--- a/docs/api/clojure/index.md
+++ b/docs/api/clojure/index.md
@@ -1,9 +1,20 @@
 # MXNet - Clojure API
+
 MXNet supports the Clojure programming language. The MXNet Clojure package brings flexible and efficient GPU
 computing and state-of-art deep learning to Clojure. It enables you to write seamless tensor/matrix computation with multiple GPUs in Clojure. It also lets you construct and customize the state-of-art deep learning models in Clojure, and apply them to tasks, such as image classification and data science challenges.
 
 See the [MXNet Clojure API Documentation](docs/index.html) for detailed API information.
 
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   kvstore.md
+   module.md
+   ndarray.md
+   symbol_in_pictures.md
+   symbol.md
+```
 
 ## Tensor and Matrix Computations
 You can perform tensor or matrix computation in pure Clojure:
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 00000000000..eff6807678e
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,14 @@
+# MXNet APIs
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   c++/index.md
+   clojure/index.md
+   julia/index.md
+   perl/index.md
+   python/index.md
+   r/index.md
+   scala/index.md
+```
diff --git a/docs/api/python/contrib/onnx.md b/docs/api/python/contrib/onnx.md
index d7c34ec1e01..f8210ad6a00 100644
--- a/docs/api/python/contrib/onnx.md
+++ b/docs/api/python/contrib/onnx.md
@@ -22,10 +22,9 @@ This document describes all the ONNX-MXNet APIs.
 .. autosummary::
     :nosignatures:
 
-    mxnet.contrib.onnx.import_model
-    mxnet.contrib.onnx.get_model_metadata
-    mxnet.contrib.onnx.import_to_gluon
-    mxnet.contrib.onnx.export_model
+    mxnet.contrib.onnx.onnx2mx.import_model
+    mxnet.contrib.onnx.onnx2mx.import_to_gluon
+    mxnet.contrib.onnx.mx2onnx.export_model
 ```
 
 ## ONNX Tutorials
@@ -33,8 +32,9 @@ This document describes all the ONNX-MXNet APIs.
 ```eval_rst
 .. toctree::
    :maxdepth: 1
-   
+
    /tutorials/onnx/super_resolution.md
+   /tutorials/onnx/export_mxnet_to_onnx.md
    /tutorials/onnx/inference_on_onnx_model.md
    /tutorials/onnx/fine_tuning_gluon.md
 ```
@@ -42,7 +42,7 @@ This document describes all the ONNX-MXNet APIs.
 ## ONNX Examples
 
 * Face Recognition with [ArcFace](https://github.com/onnx/models/tree/master/models/face_recognition/ArcFace)
-* Image Classification with [MobileNet](https://github.com/onnx/models/tree/master/models/image_classification/mobilenet), [ResNet](https://github.com/onnx/models/tree/master/models/image_classification/resnet), [SqueezeNet](https://github.com/onnx/models/tree/master/models/image_classification/squeezenet), [VGG](https://github.com/onnx/models/tree/master/models/image_classification/vgg) 
+* Image Classification with [MobileNet](https://github.com/onnx/models/tree/master/models/image_classification/mobilenet), [ResNet](https://github.com/onnx/models/tree/master/models/image_classification/resnet), [SqueezeNet](https://github.com/onnx/models/tree/master/models/image_classification/squeezenet), [VGG](https://github.com/onnx/models/tree/master/models/image_classification/vgg)
 
 ## API Reference
 
@@ -50,11 +50,12 @@ This document describes all the ONNX-MXNet APIs.
 
 ```eval_rst
 
-.. automodule:: mxnet.contrib.onnx.import_model
-.. automodule:: mxnet.contrib.onnx.get_model_metadata
-.. automodule:: mxnet.contrib.onnx.import_to_gluon
-.. automodule:: mxnet.contrib.onnx.export_model
-
+.. automodule:: mxnet.contrib.onnx.onnx2mx.import_model
+    :members: import_model, get_model_metadata
+.. automodule:: mxnet.contrib.onnx.onnx2mx.import_to_gluon
+    :members: import_to_gluon
+.. automodule:: mxnet.contrib.onnx.mx2onnx.export_model
+    :members: export_model
 ```
 
 <script>auto_index("api-reference");</script>
diff --git a/docs/api/python/index.md b/docs/api/python/index.md
index 420f4c9b72f..42c4af9e46b 100644
--- a/docs/api/python/index.md
+++ b/docs/api/python/index.md
@@ -17,58 +17,41 @@ Code examples are placed throughout the API documentation and these can be run a
 ```eval_rst
 
 .. note:: A convenient way to execute code examples is using the ``%doctest_mode`` mode of
-    Jupyter notebook, which allows for pasting multi-line examples containing
-    ``>>>`` while preserving indentation. Run ``%doctest_mode?`` in Jupyter notebook
-    for more details.
+   Jupyter notebook, which allows for pasting multi-line examples containing
+   ``>>>`` while preserving indentation. Run ``%doctest_mode?`` in Jupyter notebook
+   for more details.
 
 ```
 
 \* Some old references to Model API may exist, but this API has been deprecated.
 
-## NDArray API
-
-```eval_rst
-.. toctree::
-   :maxdepth: 1
-
-   ndarray/ndarray.md
-   ndarray/random.md
-   ndarray/linalg.md
-   ndarray/sparse.md
-   ndarray/contrib.md
-```
-
-## Symbol API
+## Autograd API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   symbol/symbol.md
-   symbol/random.md
-   symbol/linalg.md
-   symbol/sparse.md
-   symbol/contrib.md
-   symbol/rnn.md
+   autograd/autograd.md
 ```
 
-## Module API
+## Callback API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   module/module.md
-   executor/executor.md
+   callback/callback.md
 ```
 
-## Autograd API
+## Contrib Package
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   autograd/autograd.md
+   contrib/contrib.md
+   contrib/text.md
+   contrib/onnx.md
 ```
 
 ## Gluon API
@@ -86,6 +69,15 @@ Code examples are placed throughout the API documentation and these can be run a
    gluon/contrib.md
 ```
 
+## Image API
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   image/image.md
+```
+
 ## IO API
 
 ```eval_rst
@@ -95,40 +87,54 @@ Code examples are placed throughout the API documentation and these can be run a
    io/io.md
 ```
 
-## Image API
+## KV Store API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   image/image.md
+   kvstore/kvstore.md
 ```
 
-## Optimization API
+## Metric API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   optimization/optimization.md
+   metric/metric.md
 ```
 
-## Callback API
+## Module API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   callback/callback.md
+   module/module.md
+   executor/executor.md
 ```
 
-## Metric API
+## NDArray API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   metric/metric.md
+   ndarray/ndarray.md
+   ndarray/random.md
+   ndarray/linalg.md
+   ndarray/sparse.md
+   ndarray/contrib.md
+```
+
+## Optimization API
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   optimization/optimization.md
 ```
 
 ## Profiler API
@@ -144,18 +150,30 @@ Code examples are placed throughout the API documentation and these can be run a
 
 ```eval_rst
 .. toctree::
-   :maxdepth 1
+   :maxdepth: 1
 
    rtc/rtc.md
 ```
 
-## Contrib Package
+## Symbol API
 
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
-   contrib/contrib.md
-   contrib/text.md
-   contrib/onnx.md
+   symbol/symbol.md
+   symbol/random.md
+   symbol/linalg.md
+   symbol/sparse.md
+   symbol/contrib.md
+   symbol/rnn.md
+```
+
+## Symbol in Pictures API
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   symbol_in_pictures/symbol_in_pictures.md
 ```
diff --git a/docs/api/python/ndarray/sparse.md b/docs/api/python/ndarray/sparse.md
index 85d33b193a6..2ade059a70c 100644
--- a/docs/api/python/ndarray/sparse.md
+++ b/docs/api/python/ndarray/sparse.md
@@ -16,7 +16,7 @@ This document lists the routines of the *n*-dimensional sparse array package:
 ```
 
 The `CSRNDArray` and `RowSparseNDArray` API, defined in the `ndarray.sparse` package, provides
-imperative sparse tensor operations on **CPU**.
+imperative sparse tensor operations.
 
 A `CSRNDArray` inherits from `NDArray` and represents a two-dimensional, fixed-size array in compressed sparse row format.
 
@@ -63,16 +63,13 @@ A detailed tutorial is available at
 
 ```eval_rst
 
-.. note:: ``mxnet.ndarray.sparse.RowSparseNDArray`` and ``mxnet.ndarray.sparse.CSRNDArray`` DO NOT support the ``mxnet.gluon`` high-level interface yet.
-
 .. note:: ``mxnet.ndarray.sparse`` is similar to ``mxnet.ndarray`` in some aspects. But the differences are not negligible. For instance:
 
-   - Only a subset of operators in ``mxnet.ndarray`` have specialized implementations in ``mxnet.ndarray.sparse``.
-     Operators such as Convolution and broadcasting do not have sparse implementations yet.
+   - Only a subset of operators in ``mxnet.ndarray`` have efficient sparse implementations in ``mxnet.ndarray.sparse``.
+   - If an operator does not occur in the ``mxnet.ndarray.sparse`` namespace, it does not have an efficient sparse implementation yet. If sparse inputs are passed to such an operator, it converts the inputs to the dense format and falls back to the available dense implementation.
    - The storage types (``stype``) of sparse operators' outputs depend on the storage types of inputs.
      By default the operators not available in ``mxnet.ndarray.sparse`` infer "default" (dense) storage type for outputs.
      Please refer to the [API Reference](#api-reference) section for further details on specific operators.
-   - GPU support for ``mxnet.ndarray.sparse`` is experimental. Only a few sparse operators are supported on GPU such as ``sparse.dot``.
 
 .. note:: ``mxnet.ndarray.sparse.CSRNDArray`` is similar to ``scipy.sparse.csr_matrix`` in some aspects. But they differ in a few aspects:
 
@@ -559,7 +556,6 @@ We summarize the interface for each class in the following sections.
     sgd_update
     sgd_mom_update
     adam_update
-    ftrl_update
     adagrad_update
 ```
 
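The revised wording describes the dense fallback for operators without a sparse implementation and the rule that output storage types follow input storage types. A small illustration with the Python sparse NDArray API:

```python
import mxnet as mx

a = mx.nd.ones((2, 2)).tostype('csr')        # sparse (CSR) input
b = mx.nd.sparse.dot(a, mx.nd.ones((2, 2)))  # op with a sparse implementation
print(a.stype)  # 'csr'
print(b.stype)  # 'default' -- dot(csr, dense) infers a dense output
```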
diff --git a/docs/api/python/symbol/sparse.md b/docs/api/python/symbol/sparse.md
index d26ba07853d..cd8272cedd7 100644
--- a/docs/api/python/symbol/sparse.md
+++ b/docs/api/python/symbol/sparse.md
@@ -16,7 +16,7 @@ This document lists the routines of the sparse symbolic expression package:
 ```
 
 The `Sparse Symbol` API, defined in the `symbol.sparse` package, provides
-sparse neural network graphs and auto-differentiation on CPU.
+sparse neural network graphs and auto-differentiation.
 
 The storage type of a variable is specified by the `stype` attribute of the variable.
 The storage type of a symbolic expression is inferred based on the storage types of the variables and the operators.
@@ -43,12 +43,11 @@ array([ 1.,  1.],
 .. note:: most operators provided in ``mxnet.symbol.sparse`` are similar to those in
    ``mxnet.symbol`` although there are a few differences:
 
-   - Only a subset of operators in ``mxnet.symbol`` have specialized implementations in ``mxnet.symbol.sparse``.
-     Operators such as reduction and broadcasting do not have sparse implementations yet.
+   - Only a subset of operators in ``mxnet.symbol`` have efficient sparse implementations in ``mxnet.symbol.sparse``.
+   - If an operator does not occur in the ``mxnet.symbol.sparse`` namespace, it does not have an efficient sparse implementation yet. If sparse inputs are passed to such an operator, it converts the inputs to the dense format and falls back to the available dense implementation.
    - The storage types (``stype``) of sparse operators' outputs depend on the storage types of inputs.
      By default the operators not available in ``mxnet.symbol.sparse`` infer "default" (dense) storage type for outputs.
      Please refer to the API reference section for further details on specific operators.
-   - GPU support for ``mxnet.symbol.sparse`` is experimental.
 
 ```
 
diff --git a/docs/api/scala/index.md b/docs/api/scala/index.md
index e96892b5800..8b32c9fe9e2 100644
--- a/docs/api/scala/index.md
+++ b/docs/api/scala/index.md
@@ -1,9 +1,23 @@
 # MXNet - Scala API
+
 MXNet supports the Scala programming language. The MXNet Scala package brings flexible and efficient GPU
 computing and state-of-art deep learning to Scala. It enables you to write seamless tensor/matrix computation with multiple GPUs in Scala. It also lets you construct and customize the state-of-art deep learning models in Scala, and apply them to tasks, such as image classification and data science challenges.
 
 See the [MXNet Scala API Documentation](docs/index.html#org.apache.mxnet.package) for detailed API information.
 
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   infer.md
+   io.md
+   kvstore.md
+   model.md
+   module.md
+   ndarray.md
+   symbol_in_pictures.md
+   symbol.md
+```
 
 ## Image Classification with the Scala Infer API
 The Infer API can be used for single and batch image classification. More information can be found at the following locations:
diff --git a/docs/architecture/index.md b/docs/architecture/index.md
index 91fb5f51d7b..189e76e62fa 100644
--- a/docs/architecture/index.md
+++ b/docs/architecture/index.md
@@ -15,9 +15,15 @@ Mainly, they focus on the following 3 areas:
 abstraction, optimization, and trade-offs between efficiency and flexibility.
 Additionally, we provide an overview of the complete MXNet system.
 
-* [MXNet System Overview](http://mxnet.io/architecture/overview.html)
-* [Deep Learning Programming Style: Symbolic vs Imperative](http://mxnet.io/architecture/program_model.html)
-* [Dependency Engine for Deep Learning](http://mxnet.io/architecture/note_engine.html)
-* [Optimizing the Memory Consumption in Deep Learning](http://mxnet.io/architecture/note_memory.html)
-* [Efficient Data Loading Module for Deep Learning](http://mxnet.io/architecture/note_data_loading.html)
-* [Exception Handling in MXNet](http://mxnet.io/architecture/exception_handling.html)
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   overview.md
+   program_model.md
+   note_engine.md
+   note_memory.md
+   note_data_loading.md
+   exception_handling.md
+   rnn_interface.md
+```
diff --git a/docs/architecture/release_note_0_9.md b/docs/architecture/release_note_0_9.md
deleted file mode 100644
index afcc091d7cc..00000000000
--- a/docs/architecture/release_note_0_9.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# MXNet 0.9 (NNVM) Release Note
-
-Version 0.9 brings a number of important features and changes, including a back-end refactor to adopt the [NNVM](https://github.com/dmlc/nnvm) framework, a profiler for analyzing performance, a fast image IO and augmentation module that bypasses GIL, and various other changes.
-
-## NNVM Refactor
-
-NNVM is a library for neural network graph construction, optimization, and operator registration. It serves as an intermediary layer between the front-end (MXNet user API) and the back-end (computation on the device). After version 0.9, MXNet fully adopts the NNVM framework. Now it's easier to create operators. You can also register "pass"es that process and optimizes the graph when `bind` is called on the symbol. For more discussion on how to create operators with NNVM, please refer to [How to Create New Operators](../faq/new_op.md)
-
-Other changes brought by NNVM include:
-- Backward shape inference is now supported
-- All operators can now be used with both symbolic and ndarray API. For example, `mx.nd.Activation(x, act_type='relu')` works now.
-- Optional cython API for mx.symbol and mx.ndarray is now available. Use `make cython` to activate it for accelerated communication with the back-end.
-
-## Profiler
-
-![MLP Profile](https://cloud.githubusercontent.com/assets/17693755/18035938/0a43484a-6d93-11e6-80d4-241c6ca552ea.png)
-
-MXNet now provides a native profiler for analyzing the performance of operators. This feature compliments general profiling tools like nvprof and gprof by summarizing at the operator level, instead of function, kernel, or instruction level.
-
-To use this feature, first set `USE_PROFILER = 1` in `config.mk` and rebuild mxnet. Then add three lines at the beginning and end of the section of your program you want to profile:
-```python
-mx.profiler.profiler_set_config(mode=scope, filename=fname)
-profiler.profiler_set_state('run')
-
-# do computation ...
-
-profiler.profiler_set_state('stop')
-```
-`scope` can be 'symbolic' (to only include symbolic operations) or 'all' (to include all operations), and `fname` is the path to save profiler output.
-
-After program finishes, navigate to [chrome://tracing](chrome://tracing) in a Chrome browser and load profiler output to see the results.
-
-## Image IO
-
-MXNet already has `mx.io.ImageRecordIter` for loading and preprocessing images. However, some tasks need more flexible image processing API. Detection, for example, requires transforming labels together with images. Usually, people write custom data iterators in python to handle this. But due to the infamous Global Interpreter Lock (GIL), python scripts cannot use multithreading to speed up processing.
-
-`mx.image` provides a set of fast image processing API that leverage MXNet Engine to automatically parallelize processing. You can write
-```python
-imgs = [mx.image.imdecode(open(f).read()) for f in img_paths]
-```
-and decoding will be automatically run in parallel.
-
-## Miscellaneous
-
-- sgd and adam optimizer are now implemented with a single imperative call. They should be as fast and memory efficient as cc optimizers. ccsgd is now deprecated and redirects to sgd.
-- Layout support is added. Use `mx.io.DataDesc(..., layout='NHWC')` in provide_data to specify data layout. use `mx.sym.YourSymbol(..., __layout__='NHWC')` to specify output layout. `layout` option is now available for Convolution layer.
-- element_mask is removed. Please use src*mask.reshape((mask.size, 1, 1, ..., 1)) directly as binary ops now support broadcasting.
-- sum_axis, max_axis, and min_axis are deprecated. Please use mx.nd.max(src, axis=n) instead.
-- symbol attributes are now limited to ctx_group, lr_mult, wd_mult, force_mirroring. All other custom attributes need to be in __xxx__ format (start and end with double underscore) or an error will be triggered during attribute parsing.
diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh
index d36f1f5edc6..350c75b084b 100755
--- a/docs/build_version_doc/build_all_version.sh
+++ b/docs/build_version_doc/build_all_version.sh
@@ -114,7 +114,7 @@ function checkout () {
   # Overriding configs later will cause a conflict here, so stashing...
   git stash
   # Fails to checkout if not available locally, so try upstream
-  git checkout "$repo_folder" || git branch $repo_folder "upstream/$repo_folder"
+  git checkout "$repo_folder" || git branch $repo_folder "upstream/$repo_folder" && git checkout "$repo_folder" || exit 1
   if [ $tag == 'master' ]; then
     git pull
   fi
@@ -174,4 +174,4 @@ done
 
 echo "Now you may want to run update_all_version.sh to create the production layout with the versions dropdown and other per-version corrections."
 echo "The following pattern is recommended (tags, default tag, url base):"
-echo "./update_all_version.sh "$tags_to_display " master http://mxnet.incubator.apache.org/"
+echo "./update_all_version.sh \"$2\" master http://mxnet.incubator.apache.org/"
diff --git a/docs/community/index.md b/docs/community/index.md
new file mode 100644
index 00000000000..7bdb1c21350
--- /dev/null
+++ b/docs/community/index.md
@@ -0,0 +1,11 @@
+# MXNet Community
+
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   contribute.md
+   ecosystem.md
+   powered_by.md
+   mxnet_channels.md
+```
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 07dd9b9d7ca..1b4a95d3f33 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -1,5 +1,13 @@
 # MXNet FAQ
 
+```eval_rst
+.. toctree::
+   :hidden:
+   :glob:
+
+   *
+```
+
 This section addresses common questions about how to use _MXNet_. These include performance issues, e.g., how to train with multiple GPUs.
 They also include workflow questions, e.g., how to visualize a neural network computation graph.
 These answers are fairly focused. For more didactic, self-contained introductions to neural networks
diff --git a/docs/get_started/index.md b/docs/get_started/index.md
deleted file mode 100644
index a743930b33d..00000000000
--- a/docs/get_started/index.md
+++ /dev/null
@@ -1,8 +0,0 @@
-
-<html lang="en-US">
-    <head>
-        <meta charset="UTF-8">
-        <meta http-equiv="refresh" content="0; url=why_mxnet.html">
-        <title>Page Redirection</title>
-    </head>
-</html>
diff --git a/docs/gluon/index.md b/docs/gluon/index.md
index 4bea06edcab..c0d9053cd2c 100644
--- a/docs/gluon/index.md
+++ b/docs/gluon/index.md
@@ -1,9 +1,11 @@
-![](https://github.com/dmlc/web-data/blob/master/mxnet/image/image-gluon-logo.png?raw=true)
+# About Gluon
+
+![gluon logo](https://github.com/dmlc/web-data/blob/master/mxnet/image/image-gluon-logo.png?raw=true)
 
 Based on the [the Gluon API specification](https://github.com/gluon-api/gluon-api), the new Gluon library in Apache MXNet provides a clear, concise, and simple API for deep learning. It makes it easy to prototype, build, and train deep learning models without sacrificing training speed. Install the latest version of MXNet to get access to Gluon by either following these easy steps or using this simple command:
 
-```python
-    pip install mxnet --pre --user
+```bash
+    pip install mxnet
 ```
 <br/>
 <div class="boxed">
@@ -39,8 +41,8 @@ Use plug-and-play neural network building blocks, including predefined layers, o
 
 ```python
 net = gluon.nn.Sequential()
-# When instantiated, Sequential stores a chain of neural network layers. 
-# Once presented with data, Sequential executes each layer in turn, using 
+# When instantiated, Sequential stores a chain of neural network layers.
+# Once presented with data, Sequential executes each layer in turn, using
 # the output of one layer as the input for the next
 with net.name_scope():
     net.add(gluon.nn.Dense(256, activation="relu")) # 1st layer (256 nodes)
@@ -81,7 +83,7 @@ def forward(self, F, inputs, tree):
 <br/>
 **__High Performance__**
 
-Easily cache the neural network to achieve high performance by defining your neural network with ``HybridSequential`` and calling the ``hybridize`` method: 
+Easily cache the neural network to achieve high performance by defining your neural network with ``HybridSequential`` and calling the ``hybridize`` method:
 
 ```python
 net = nn.HybridSequential()
diff --git a/docs/index.md b/docs/index.md
index 7e251131fee..ab6a95dc0dd 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,14 +1,15 @@
-Contents
---------
-These are used to generate the indexes for search functionality.
+# MXNet
 
-- [Python Documents](api/python/index.md)
-- [R Documents](api/r/index.md)
-- [Julia Documents](api/julia/index.md)
-- [C++ Documents](api/c++/index.md)
-- [Scala Documents](api/scala/index.md)
-- [Perl Documents](api/perl/index.md)
-- [HowTo Documents](faq/index.md)
-- [System Documents](architecture/index.md)
-- [Tutorials](tutorials/index.md)
-- [Community](community/contribute.md)
+```eval_rst
+.. toctree::
+   :maxdepth: 1
+
+   api/index.md
+   architecture/index.md
+   community/index.md
+   faq/index.md
+   gluon/index.md
+   install/index.md
+   model_zoo/index.md
+   tutorials/index.md
+```
diff --git a/docs/install/build_from_source.md b/docs/install/build_from_source.md
index b22ff8833e9..6c0a4dab251 100644
--- a/docs/install/build_from_source.md
+++ b/docs/install/build_from_source.md
@@ -1,22 +1,23 @@
 # Build MXNet from Source
 
-**NOTE:** For MXNet with Python installation, please refer to the [new install guide](http://mxnet.io/install/index.html).
-
-This document explains how to build MXNet from sources. Building MXNet from sources is a 2 step process.
+This document explains how to build MXNet from source code. Building MXNet from source is a two-step process.
 
 1. Build the MXNet shared library, `libmxnet.so`, from [C++ source files](#build-the-shared-library)
-2. Install the language binding for MXNet. MXNet supports -
-   [C++](#build-the-c-package),
-   [Scala](#build-the-scala-package), [R](#build-the-r-package), and
-   [Julia](#build-the-julia-package).
+2. Install the [language bindings](#installing-mxnet-language-bindings) for MXNet. MXNet supports the following languages:
+    - Python
+    - C++
+    - Clojure
+    - Julia
+    - Perl
+    - R
+    - Scala
 
-## Build the shared library
+## Prerequisites
 
-### Prerequisites
+You need C++ build tools and a BLAS library to build the MXNet shared library. If you want to run MXNet with GPUs, you will need to install [NVIDIA CUDA and cuDNN](https://developer.nvidia.com/cuda-downloads) first.
 
-You need C++ build tools and BLAS library to build MXNet shared library. If you want to run MXNet on GPUs, you need to install CUDA and CuDNN.
 
-#### C++ build tools
+### C++ build tools
 
 1. A C++ compiler that supports C++ 11.
 [G++ (4.8 or later)](https://gcc.gnu.org/gcc-4.8/) or
@@ -24,311 +25,138 @@ You need C++ build tools and BLAS library to build MXNet shared library. If you
 
 2. [Git](https://git-scm.com/downloads) for downloading the sources from the GitHub repository.
 
-3. [GNU Make](https://www.gnu.org/software/make/) ([cmake](https://cmake.org/)
-   for Windows) to build the library.
-
-
-Select your preferences and follow the instructions to install MXNet from sources.
-<div class="btn-group opt-group" role="group">
-<button type="button" class="btn btn-default opt active">Linux</button>
-<button type="button" class="btn btn-default opt">macOS</button>
-<button type="button" class="btn btn-default opt">Windows</button>
-</div>
-<script type="text/javascript" src='../../_static/js/options.js'></script>
-
-<div class="linux">
-
-Then select the Linux distribution:
-<div class="btn-group opt-group" role="group">
-<button type="button" class="btn btn-default opt active">Ubuntu</button>
-<button type="button" class="btn btn-default opt">CentOS</button>
-<button type="button" class="btn btn-default opt">Others</button>
-</div>
-
-- **Ubuntu** for systems supporting the `apt-get`
-  package management program
-- **CentOS** for systems supporting the `yum` package
-  management program
-- **Others** for general Linux-like systems building dependencies from scratch.
-
-<div class="ubuntu">
-
-Install build tools and git on `Ubuntu >= 13.10` and `Debian >= 8`.
-
-```bash
-sudo apt-get update && sudo apt-get install build-essential git
-```
-
-</div>
-
-<div class="centos">
-
-Install build tools and git on `CentOS >= 7` and `Fedora >= 19`.
-
-```bash
-sudo yum groupinstall -y "Development Tools" && sudo yum install -y git
-```
-
-</div>
-
-<div class="others">
-
-Installing both `git` and `make` by following instructions on the websites is
-straightforward. Here we provide the instructions to build `gcc-4.8` from source codes.
-
-1. Install the 32-bit `libc` with one of the following system-specific commands:
-
-   ```bash
-   sudo apt-get install libc6-dev-i386 # In Ubuntu
-   sudo yum install glibc-devel.i686   # In RHEL (Red Hat Linux)
-   sudo yum install glibc-devel.i386   # In CentOS 5.8
-   sudo yum install glibc-devel.i686   # In CentOS 6/7
-   ```
-
-2. Download and extract the `gcc` source code with the prerequisites:
-
-   ```bash
-   wget http://mirrors.concertpass.com/gcc/releases/gcc-4.8.5/gcc-4.8.5.tar.gz
-   tar -zxf gcc-4.8.5.tar.gz
-   cd gcc-4.8.5
-   ./contrib/download_prerequisites
-   ```
-
-3. Build `gcc` by using 10 threads and then install to `/usr/local`
-
-   ```bash
-   mkdir release && cd release
-   ../configure --prefix=/usr/local --enable-languages=c,c++
-   make -j10
-   sudo make install
-   ```
-
-4. Add the lib path to your configure file such as `~/.bashrc`:
-
-   ```bash
-   export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib64
-   ```
+3. [cmake](https://cmake.org/) is recommended. You may also use [GNU Make](https://www.gnu.org/software/make/) to build the library.
 
-</div>
-</div> <!-- linux -->
 
-<div class="windows">
-
-1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/downloads/) is not already installed, download and install it. You can download and install the free community edition.
-2. Download and Install [CMake](https://cmake.org/) if it is not already installed.
-
-</div>
-
-<div class="macos">
-
-Install [Xcode](https://developer.apple.com/xcode/).
-
-</div>
-
-#### BLAS library
+### BLAS library
 
 MXNet relies on the
 [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) (Basic
-Linear Algebra Subprograms) library for numerical computations. You can install
-any one among [ATLAS](http://math-atlas.sourceforge.net/),
-[OpenBLAS](http://www.openblas.net/) and
-[MKL](https://software.intel.com/en-us/intel-mkl).
-
-<div class="linux">
-<div class="ubuntu">
+Linear Algebra Subprograms) library for numerical computations.
+These can be extended with [LAPACK (Linear Algebra Package)](https://github.com/Reference-LAPACK/lapack), an additional set of mathematical functions.
 
-```bash
-sudo apt-get install libatlas-base-dev
-```
+MXNet supports multiple mathematical backends for computations on the CPU:
 
-</div>
+* [Apple Accelerate](https://developer.apple.com/documentation/accelerate)
+* [ATLAS](http://math-atlas.sourceforge.net/)
+* [MKL](https://software.intel.com/en-us/intel-mkl) (MKL, MKLML)
+* [MKLDNN](https://github.com/intel/mkl-dnn)
+* [OpenBLAS](http://www.openblas.net/)
 
-<div class="centos">
+Usage of these is covered in more detail in the [build configurations](#build-configurations) section.
 
-```bash
-sudo yum install atlas-devel
-```
 
-</div>
+### Optional
 
-<div class="linux">
+These dependencies are optional, but typically desirable.
 
-You can follow this link to build
-[OpenBlas from source](https://github.com/xianyi/OpenBLAS#installation-from-source).
+* [OpenCV](http://opencv.org/) for Image Loading and Augmentation
+* [NVIDIA CUDA and cuDNN](https://developer.nvidia.com/cuda-downloads) for running MXNet with GPUs
 
-</div>
-</div>
 
-<div class="macos">
+## Build Instructions by Operating System
 
-macOS users can skip this step as `xcode` ships with a BLAS library.
+Detailed instructions are provided per operating system.
+You may jump to those, but it is recommended that you continue reading first to understand the more general build-from-source options.
 
-</div>
+| | | | |
+|---|---|---|---|
+| [macOS](osx_setup.html) | [Ubuntu](ubuntu_setup.html) | [CentOS / other Linux](centos_setup.html) | [Windows](windows_setup.html) |
+| [Raspbian](raspian_setup.html) | [TX2](tx2_setup.html) | | |
 
-<div class="windows">
 
-1. Download pre-built binaries for [OpenBLAS](https://sourceforge.net/projects/openblas/files/)
-2. Set the environment variable `OpenBLAS_HOME` to point to the OpenBLAS
-   directory that contains the `include/` and `lib/` directories. Typically, you
-   can find the directory in `C:\Program files (x86)\OpenBLAS\`.
 
-</div>
-
-#### Optional: [OpenCV](http://opencv.org/) for Image Loading and Augmentation
-
-<div class="linux">
-<div class="ubuntu">
+## Build
 
+1. Clone the MXNet project.
 ```bash
-sudo apt-get install libopencv-dev
-```
-
-</div>
-
-<div class="centos">
-
-```bash
-sudo apt-get install opencv-devel
+git clone --recursive https://github.com/apache/incubator-mxnet mxnet
+cd mxnet
 ```
 
-</div>
-
-<div class="others">
-
-To build OpenCV from source code, you need the [cmake](https://cmake.org) library.
-
-1. If you don't have cmake or if your version of cmake is earlier than 3.6.1, run the following commands to install a newer version of cmake:
-
-   ```bash
-   wget https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.tar.gz
-   tar -zxvf cmake-3.6.1-Linux-x86_64.tar.gz
-   alias cmake="cmake-3.6.1-Linux-x86_64/bin/cmake"
-   ```
-
-2. To download and extract the OpenCV source code, run the following commands:
-
-   ```bash
-   wget https://codeload.github.com/opencv/opencv/zip/2.4.13
-   unzip 2.4.13
-   cd opencv-2.4.13
-   mkdir release
-   cd release/
-   ```
-
-3. Build OpenCV. The following commands build OpenCV with 10 threads. We
-   disabled GPU support, which might significantly slow down an MXNet program
-   running on a GPU processor. It also disables 1394 which might generate a
-   warning. Then install it on `/usr/local`.
-
-   ```bash
-   cmake -D BUILD_opencv_gpu=OFF -D WITH_CUDA=OFF -D WITH_1394=OFF -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local ..
-   make -j10
-   sudo make install
-   ```
+There is a configuration file for make,
+[`make/config.mk`](https://github.com/apache/incubator-mxnet/blob/master/make/config.mk), that contains all the compilation options. You can edit it and then run `make`.
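+
+As an illustrative sketch (the flags below are the same ones used in the usage examples later in this guide; adjust the values to your setup), you can copy the template, append overrides, and build:
+
+```bash
+# Copy the config template to the project root and append overrides (illustrative values)
+cp make/config.mk .
+echo "USE_OPENCV = 1" >> config.mk
+echo "USE_BLAS = openblas" >> config.mk
+# Build using one job per CPU core
+make -j"$(nproc)"
+```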
 
-4. Add the lib path to your configuration such as `~/.bashrc`.
 
-   ```bash
-   export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/lib/pkgconfig/
-   ```
+## Build Configurations
 
-</div>
-</div>
+`cmake` is recommended for building MXNet; however, you may use `make` instead.
 
-<div class="windows">
 
-First download and install [OpenCV](http://opencv.org/releases.html), then set
-the environment variable `OpenCV_DIR` to point to the OpenCV build directory.
+### Math Library Selection
+It is useful to consider your math library selection first.
 
-</div>
+If several libraries are found, the default order of choice goes from the most
+performant (recommended) to the least performant backend.
+The following lists show this order by library and `cmake` switch.
 
-#### Optional: [CUDA](https://developer.nvidia.com/cuda-downloads)/[cuDNN](https://developer.nvidia.com/cudnn) for Nvidia GPUs
+For desktop platforms (x86_64):
 
-MXNet is compatible with both CUDA 7.5 and 8.0. It is recommended to use cuDNN 5.
+1. MKLDNN (submodule) | `USE_MKLDNN`
+2. MKL | `USE_MKL_IF_AVAILABLE`
+3. MKLML (downloaded) | `USE_MKLML`
+4. Apple Accelerate | `USE_APPLE_ACCELERATE_IF_AVAILABLE` | Mac only
+5. OpenBLAS | `BLAS` | Options: Atlas, Open, MKL, Apple
 
-<div class="linux">
-<div class="ubuntu">
+Note: If `USE_MKL_IF_AVAILABLE` is set to False, MKLML and MKLDNN will be disabled as well to keep the
+configuration backwards compatible.
 
-Install CUDA 7.5 and cuDNN 5 on Ubuntu 14.04
+For embedded platforms (all others, and when cross compiling):
 
-```bash
-wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb
-sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb
-echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64 /" | sudo tee /etc/apt/sources.list.d/nvidia-ml.list
-sudo apt-get update
-sudo apt-get install -y linux-image-extra-`uname -r` linux-headers-`uname -r` linux-image-`uname -r`
-sudo apt-get install -y cuda libcudnn5-dev=5.0.5-1+cuda7.5
-```
-
-</div>
-</div>
-
-### Build
-
-<div class="linux macos">
-
-First clone the recent codes
-
-```bash
-git clone --recursive https://github.com/dmlc/mxnet
-cd mxnet
-```
-
-File
-[`make/config.mk`](https://github.com/dmlc/mxnet/blob/master/make/config.mk)
-contains all the compilation options. You can edit it and then `make`. There are
-some example build options
-
-If you want to build MXNet with C++ language binding, please make sure you read [Build the C++ package](#build-the-c-package) first.
-
-</div>
-
-<div class="linux">
+1. OpenBLAS | `BLAS` | Options: Atlas, Open, MKL, Apple
 
-- Build without using OpenCV. `-j` runs multiple jobs against multi-core CPUs.
+You can set the BLAS library explicitly by setting the BLAS variable to:
 
-  ```bash
-  make -j USE_OPENCV=0
-  ```
+* Atlas
+* Open
+* MKL
+* Apple
 
-- Build with both GPU and OpenCV support
+See the [cmake/ChooseBLAS.cmake](https://github.com/apache/incubator-mxnet/blob/master/cmake/ChooseBlas.cmake) file for the options.
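+
+For example, a minimal `cmake` configuration that selects OpenBLAS explicitly might look like this (an illustrative sketch using the switches listed above):
+
+```bash
+mkdir -p build && cd build
+# Pick the BLAS flavor explicitly and skip the MKL auto-detection
+cmake -DBLAS=Open -DUSE_MKL_IF_AVAILABLE=OFF ..
+make -j"$(nproc)"
+```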
 
-  ```bash
-  make -j USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1
-  ```
+Intel's [MKL (Math Kernel Library)](https://software.intel.com/en-us/mkl) is one of the most powerful math libraries.
 
-</div>
+It has the following flavors:
 
-<div class="macos">
+* MKL is a complete math library, containing all the functionality found in ATLAS, OpenBLAS and LAPACK. It is free under
+  community support licensing (https://software.intel.com/en-us/articles/free-mkl),
+  but needs to be downloaded and installed manually.
 
-- Build with the default BLAS library and clang installed with `xcode` (OPENMP
-  is disabled because it is not supported in default by clang).
+* MKLML is a subset of MKL. It contains a smaller number of functions to reduce the
+  size of the download and reduce the number of dynamic libraries a user needs.
 
-  ```bash
-  make -j USE_BLAS=apple USE_OPENCV=0 USE_OPENMP=0
-  ```
+  <!-- [Removed until #11148 is merged.] This is the most effective option since it can be downloaded and installed automatically
+  by the cmake script (see cmake/DownloadMKLML.cmake).-->
 
-</div>
+* MKLDNN is a separate open-source library; it can be used independently of MKL or MKLML. It is
+  shipped as a subrepo with MXNet source code (see 3rdparty/mkldnn or the [mkl-dnn project](https://github.com/intel/mkl-dnn))
 
-<div class="windows">
+Since the full MKL library is almost always faster than any other BLAS library, it is turned on by default;
+however, it needs to be downloaded and installed manually before running the `cmake` configuration.
+Register and download it on the [Intel performance libraries website](https://software.seek.intel.com/performance-libraries).
 
-Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build```.
+Note: MKL is supported only for desktop builds and the framework itself supports the following
+hardware:
 
-In Visual Studio, open the solution file,```.sln```, and compile it.
-These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
+* Intel® Xeon Phi™ processor
+* Intel® Xeon® processor
+* Intel® Core™ processor family
+* Intel Atom® processor
 
-</div>
+If you have a different processor you can still try to use MKL, but performance results are
+unpredictable.
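+
+As a sketch, enabling the MKLDNN submodule through `cmake` could look like this (illustrative; `USE_MKLDNN` is the switch listed above):
+
+```bash
+mkdir -p build && cd build
+# Build against the MKLDNN backend shipped as a subrepo under 3rdparty/mkldnn
+cmake -DUSE_MKLDNN=ON ..
+make -j"$(nproc)"
+```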
 
-<div class="linux ubuntu">
 
-## Build MXNet using NCCL
+### Build MXNet with NCCL
 - Download and install the latest NCCL library from NVIDIA.
 - Note the directory path in which NCCL libraries and header files are installed.
 - Ensure that the installation directory contains ```lib``` and ```include``` folders.
-- Ensure that the prerequisites for using NCCL such as Cuda libraries are met. 
+- Ensure that the prerequisites for using NCCL such as Cuda libraries are met.
 - Append the ```config.mk``` file with the following, in addition to the CUDA-related options.
 - USE_NCCL=1
 - USE_NCCL_PATH=path-to-nccl-installation-folder
+
 ``` bash
 echo "USE_NCCL=1" >> make/config.mk
 echo "USE_NCCP_PATH=path-to-nccl-installation-folder" >> make/config.mk
@@ -339,7 +167,7 @@ cp make/config.mk .
 make -j"$(nproc)"
 ```
 
-## Validation
+#### Validating NCCL
 - Follow the steps to install MXNet Python binding.
 - Comment out the following line in the ```test_nccl.py``` file at ```incubator-mxnet/tests/python/gpu/test_nccl.py```
 ``` bash
@@ -350,143 +178,56 @@ make -j"$(nproc)"
 nosetests --verbose tests/python/gpu/test_nccl.py
 ```
 
-## Recommendation for best performance
+**Recommendation to get the best performance out of NCCL:**
 It is recommended to set the environment variable NCCL_LAUNCH_MODE to PARALLEL when using NCCL version 2.1 or newer.
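+
+For example, in a POSIX shell (the training script name below is a placeholder):
+
+```bash
+# Ask NCCL to launch kernels in parallel for the duration of this session
+export NCCL_LAUNCH_MODE=PARALLEL
+python train.py  # your own training script
+```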
 
 
-</div>
-
-## Build the C++ package
-The C++ package has the same prerequisites as the MXNet library, you should also have `python` installed. (Both `python` 2 and 3 are supported)
-
-To enable C++ package, just add `USE_CPP_PACKAGE=1` in the build options when building the MXNet shared library.
-
-## Build the R package
+### Build MXNet with Language Packages
+* To enable the C++ package, add `USE_CPP_PACKAGE=1` when you run `make` or `cmake`.
 
-The R package requires `R` to be installed.
 
-<div class="ubuntu">
-
-Follow the below instructions to install the latest R on Ubuntu 14.04 (trusty) and also the libraries used
-to build other R package dependencies.
+### Usage Examples
+* `-j` runs multiple jobs against multi-core CPUs. For example, to use all cores on Linux:
 
 ```bash
-echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list
-gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9
-gpg -a --export E084DAB9 | apt-key add -
-
-apt-get update
-apt-get install -y r-base r-base-dev libxml2-dev libxt-dev libssl-dev
+make -j$(nproc)
 ```
 
-</div>
-
-Install the required R package dependencies:
+* Build without using OpenCV:
 
 ```bash
-cd R-package
-Rscript -e "install.packages('devtools', repo = 'https://cran.rstudio.com')"
-Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cran.rstudio.com')); install_deps(dependencies = TRUE)"
+make USE_OPENCV=0
 ```
 
-Next, build and install the MXNet R package:
+* Build with OpenBLAS, GPU, and OpenCV support:
 
 ```bash
-cd ..
-make rpkg
+make -j USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1
 ```
 
-## Build the Scala package
-
-Both JDK and Maven are required to build the Scala package.
-
-<div class="ubuntu">
+* Build on **macOS** with the default BLAS library (Apple Accelerate) and Clang installed with `xcode` (OPENMP is disabled because it is not supported by the Apple version of Clang):
 
 ```bash
-apt-get install -y software-properties-common
-add-apt-repository -y ppa:webupd8team/java
-apt-get update
-echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections
-apt-get install -y oracle-java8-installer
-apt-get install -y oracle-java8-set-default
-apt-get install -y maven
+make -j USE_BLAS=apple USE_OPENCV=0 USE_OPENMP=0
 ```
 
-</div>
-
-The following command builds the `.jar` package:
+* To use OpenMP on **macOS** you need to install the LLVM Clang compiler via `llvm` (the Clang provided by Apple does not support OpenMP):
 
 ```bash
-make scalapkg
+brew install llvm
+make -j USE_BLAS=apple USE_OPENMP=1
 ```
 
-which can be found by `ls scala-package/assembly/*/target/*SNAPSHOT.jar`.
-
-Optionally, we can install Scala for the interactive interface.
+## Installing MXNet Language Bindings
+After building MXNet's shared library, you can install the other language bindings. (C++ is the exception: the C++ package must be enabled when you build MXNet from source.)
 
-<div class="ubuntu">
-
-```bash
-wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb
-dpkg -i scala-2.11.8.deb
-rm scala-2.11.8.deb
-```
-
-</div>
-
-Then we can start `scala` with `mxnet` imported by
-
-```bash
-scala -cp scala-package/assembly/*/target/*SNAPSHOT.jar
-```
-
-## Build the Julia package
-
-We need to first install Julia.
-
-<div class="ubuntu centos linux">
-
-The following commands install Julia 0.5.1
-
-```bash
-wget -q https://julialang.s3.amazonaws.com/bin/linux/x64/0.5/julia-0.5.1-linux-x86_64.tar.gz
-tar -zxf julia-0.5.1-linux-x86_64.tar.gz
-rm julia-0.5.1-linux-x86_64.tar.gz
-ln -s $(pwd)/julia-6445c82d00/bin/julia /usr/bin/julia
-```
-
-</div>
-
-Next set the environment variable `MXNET_HOME=/path/to/mxnet` so that Julia
-can find the pre-built library.
-
-Install the Julia package for MXNet with:
-
-```bash
-julia -e 'Pkg.add("MXNet")'
-```
-
-### Build the Perl package
-
-Run the following command from the MXNet source root directory to build the MXNet Perl package:
-
-```bash
-    sudo apt-get install libmouse-perl pdl cpanminus swig libgraphviz-perl
-    cpanm -q -L "${HOME}/perl5" Function::Parameters Hash::Ordered PDL::CCS
-
-    MXNET_HOME=${PWD}
-    export LD_LIBRARY_PATH=${MXNET_HOME}/lib
-    export PERL5LIB=${HOME}/perl5/lib/perl5
-
-    cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
-
-    cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
-
-    cd ${MXNET_HOME}/perl-package/AI-MXNet/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
-```
+The following table provides links to each language binding by operating system:
+
+|   | Linux | macOS | Windows |
+|---|---|---|---|
+| Python | [Linux](ubuntu_setup.html#install-mxnet-for-python) | [macOS](osx_setup.html) | [Windows](windows_setup.html#install-mxnet-for-python) |
+| C++ | [Linux](c_plus_plus.html) | [macOS](c_plus_plus.html) | [Windows](c_plus_plus.html) |
+| Clojure | [Linux](https://github.com/apache/incubator-mxnet/tree/master/contrib/clojure-package) | [macOS](https://github.com/apache/incubator-mxnet/tree/master/contrib/clojure-package) | n/a |
+| Julia | [Linux](ubuntu_setup.html#install-the-mxnet-package-for-julia) | [macOS](osx_setup.html#install-the-mxnet-package-for-julia) | [Windows](windows_setup.html#install-the-mxnet-package-for-julia) |
+| Perl | [Linux](ubuntu_setup.html#install-the-mxnet-package-for-perl) | [macOS](osx_setup.html#install-the-mxnet-package-for-perl) | n/a |
+| R | [Linux](ubuntu_setup.html#install-the-mxnet-package-for-r) | [macOS](osx_setup.html#install-the-mxnet-package-for-r) | [Windows](windows_setup.html#install-the-mxnet-package-for-r) |
+| Scala | [Linux](scala_setup.html) | [macOS](scala_setup.html) | n/a |
diff --git a/docs/install/c_plus_plus.md b/docs/install/c_plus_plus.md
new file mode 100644
index 00000000000..6078877c27c
--- /dev/null
+++ b/docs/install/c_plus_plus.md
@@ -0,0 +1,29 @@
+# Build the C++ package
+The C++ package has the same prerequisites as the MXNet library.
+
+To enable the C++ package, add `USE_CPP_PACKAGE=1` to the [build from source](build_from_source.html) options when building the MXNet shared library.
+
+For example, to build MXNet with GPU support, the C++ package, OpenCV, and OpenBLAS, run the following from the project root:
+
+```bash
+make -j USE_CPP_PACKAGE=1 USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1
+```
+
+You may also want to add the MXNet shared library to your `LD_LIBRARY_PATH`:
+
+```bash
+export LD_LIBRARY_PATH=~/incubator-mxnet/lib
+```
+
+Setting the `LD_LIBRARY_PATH` is required to run the examples mentioned in the following section.
+
+## C++ Example Code
+You can find C++ code examples in the `cpp-package/example` folder of the MXNet project. Refer to the [cpp-package's README](https://github.com/apache/incubator-mxnet/tree/master/cpp-package) for instructions on building the examples.
+
+## Tutorials
+
+* [MXNet C++ API Basics](https://mxnet.incubator.apache.org/tutorials/c++/basics.html)
+
+## Related Topics
+
+* [Image Classification using MXNet's C Predict API](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp)
diff --git a/docs/install/centos_setup.md b/docs/install/centos_setup.md
index 42a4fcb0eb8..f63099bcf62 100644
--- a/docs/install/centos_setup.md
+++ b/docs/install/centos_setup.md
@@ -1,8 +1,90 @@
-<!-- This page should be deleted after sometime (Allowing search engines
-to update links) -->
-<meta http-equiv="refresh" content="3; url=http://mxnet.io/install/index.html" />
-<!-- Just in case redirection does not work -->
-<p>
-  <a href="http://mxnet.io/install/index.html">
-    This content is moved to a new MXNet install page. Redirecting... </a>
-</p>
+# Installing MXNet on CentOS and other non-Ubuntu Linux systems
+
+1. Install build tools and git on `CentOS >= 7` and `Fedora >= 19`:
+
+```bash
+sudo yum groupinstall -y "Development Tools" && sudo yum install -y git
+```
+
+2. Install Atlas:
+
+```bash
+sudo yum install atlas-devel
+```
+
+Installing both `git` and `cmake` or `make` by following the instructions on their websites is
+straightforward. Here we provide the instructions to build `gcc-4.8` from source.
+
+3. Install the 32-bit `libc` with one of the following system-specific commands:
+
+```bash
+sudo apt-get install libc6-dev-i386 # In Ubuntu
+sudo yum install glibc-devel.i686   # In RHEL (Red Hat Linux)
+sudo yum install glibc-devel.i386   # In CentOS 5.8
+sudo yum install glibc-devel.i686   # In CentOS 6/7
+```
+
+4. Download and extract the `gcc` source code with the prerequisites:
+
+```bash
+wget http://mirrors.concertpass.com/gcc/releases/gcc-4.8.5/gcc-4.8.5.tar.gz
+tar -zxf gcc-4.8.5.tar.gz
+cd gcc-4.8.5
+./contrib/download_prerequisites
+```
+
+5. Build `gcc` by using 10 threads and then install it to `/usr/local`:
+
+```bash
+mkdir release && cd release
+../configure --prefix=/usr/local --enable-languages=c,c++
+make -j10
+sudo make install
+```
+
+6. Add the lib path to your configuration file, such as `~/.bashrc`:
+
+```bash
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib64
+```
+
+7. Build [OpenBLAS from source](https://github.com/xianyi/OpenBLAS#installation-from-source).
+
+8. Build OpenCV
+
+To build OpenCV from source code, you need the [cmake](https://cmake.org) library.
+
+* If you don't have cmake or if your version of cmake is earlier than 3.6.1, run the following commands to install a newer version of cmake:
+
+   ```bash
+   wget https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.tar.gz
+   tar -zxvf cmake-3.6.1-Linux-x86_64.tar.gz
+   alias cmake="cmake-3.6.1-Linux-x86_64/bin/cmake"
+   ```
+
+* To download and extract the OpenCV source code, run the following commands:
+
+   ```bash
+   wget https://codeload.github.com/opencv/opencv/zip/2.4.13
+   unzip 2.4.13
+   cd opencv-2.4.13
+   mkdir release
+   cd release/
+   ```
+
+* Build OpenCV. The following commands build OpenCV with 10 threads. GPU support
+   is disabled because it might significantly slow down an MXNet program
+   running on a GPU processor, and 1394 support is disabled because it might generate a
+   warning. OpenCV is then installed to `/usr/local`.
+
+   ```bash
+   cmake -D BUILD_opencv_gpu=OFF -D WITH_CUDA=OFF -D WITH_1394=OFF -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local ..
+   make -j10
+   sudo make install
+   ```
+
+* Add the lib path to your configuration file, such as `~/.bashrc`.
+
+   ```bash
+   export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/lib/pkgconfig/
+   ```
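+
+As a quick sanity check (assuming `pkg-config` is installed), confirm that the OpenCV install is visible:
+
+```bash
+pkg-config --modversion opencv  # should print 2.4.13
+```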
diff --git a/docs/install/index.md b/docs/install/index.md
index 833bedf08af..4a6af31cee3 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -1,6 +1,6 @@
 # Installing MXNet
 
-Indicate your preferred configuration. Then, follow the customized commands to install *MXNet*.
+Indicate your preferred configuration. Then, follow the customized commands to install MXNet.
 
 <div class="dropdown">
   <button class="btn current-version btn-primary dropdown-toggle" type="button" data-toggle="dropdown">v1.2.1
@@ -63,14 +63,13 @@ Indicate your preferred configuration. Then, follow the customized commands to i
 <div class="cpu gpu">
 <div class="btn-group opt-group" role="group">
   <button type="button" class="btn btn-default environs opt active">Pip</button>
-  <button type="button" class="btn btn-default environs opt">Virtualenv</button>
   <button type="button" class="btn btn-default environs opt">Docker</button>
   <button type="button" class="btn btn-default environs opt">Build from Source</button>
 </div>
 </div>
 </div>
 </div>
-
+<hr>
 <!-- END - Main Menu -->
 
 <!-- START - Linux Python CPU Installation Instructions -->
@@ -78,88 +77,27 @@ Indicate your preferred configuration. Then, follow the customized commands to i
 <div class="linux">
 <div class="python">
 <div class="cpu">
-
-The following installation instructions have been tested on Ubuntu 14.04 and 16.04.
-
-
 <div class="pip">
-<br/>
-
-**Step 1**  Install prerequisites - wget and latest pip.
-
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command in the terminal.
-
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y wget python gcc
-$ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
-```
-
 <div class="v1-2-1">
 
-**Step 2** Install MXNet with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet
-```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
 ```
-
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl
+$ pip install mxnet
 ```
 
 </div> <!-- End of v1-2-1 -->
 
 <div class="v1-1-0">
 
-**Step 2** Install MXNet with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==1.1.0
-```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
 ```
-
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl==1.1.0
+$ pip install mxnet==1.1.0
 ```
 
 </div> <!-- End of v1-1-0-->
 
 <div class="v1-0-0">
 
-**Step 2** Install MXNet with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==1.0.0
-```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
 ```
-
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl==1.0.0
+$ pip install mxnet==1.0.0
 ```
 
 </div> <!-- End of v1-0-0-->
@@ -167,210 +105,159 @@ $ pip install mxnet-mkl==1.0.0
 
 <div class="v0-12-1">
 
-
-**Step 2** Install MXNet with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==0.12.1
-```
-
-For MXNet 0.12.0 -
-
-```bash
-$ pip install mxnet==0.12.0
 ```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
+$ pip install mxnet==0.12.1
 ```
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+For MXNet 0.12.0:
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl==0.12.1
 ```
-
-For MXNet 0.12.0 -
-
-```bash
-$ pip install mxnet-mkl==0.12.0
+$ pip install mxnet==0.12.0
 ```
 
 </div> <!-- End of v0-12-1-->
 
 <div class="v0-11-0">
 
-
-**Step 2** Install MXNet with OpenBLAS acceleration.
-
-```bash
+```
 $ pip install mxnet==0.11.0
 ```
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
+</div> <!-- End of v0-11-0-->
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+<div class="master">
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl==0.11.0
+```
+$ pip install mxnet --pre
 ```
 
-</div> <!-- End of v0-11-0-->
+</div> <!-- End of master-->
+<hr> <!-- pip footer -->
+Most MXNet versions offer an experimental MKL pip package that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to <a href="https://pypi.org/project/mxnet/">PyPI for other MXNet pip packages</a>, or <a href="validate_mxnet.html">validate your MXNet installation</a>.
 
-<div class="master">
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png" alt="pip packages"/>
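+
+As a quick smoke test of the pip install (a minimal sketch; it multiplies a small NDArray and prints the result):
+
+```
+$ python -c "import mxnet as mx; print((mx.nd.ones((2, 3)) * 2).asnumpy())"
+```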
 
+</div> <!-- End of pip -->
 
-**Step 2** Install MXNet with OpenBLAS acceleration.
 
-```bash
-$ pip install mxnet --pre
-```
+<div class="docker">
+<br/>
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
+Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-mkl --pre
-```
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
 
-</div> <!-- End of master-->
+**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
 
-</div> <!-- End of pip -->
+Follow the four steps in this [docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user) to allow managing docker containers without *sudo*.
 
-<div class="virtualenv">
-<br/>
+If you skip this step, you need to use *sudo* each time you invoke Docker.
 
-**Step 1**  Install virtualenv for Ubuntu.
+**Step 3** Pull the MXNet docker image.
 
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y python-dev python-virtualenv
+```
+$ docker pull mxnet/python # Use sudo if you skip Step 2
 ```
 
-**Step 2**  Create and activate virtualenv environment for MXNet.
+You can list the docker images to see if the mxnet/python image pull was successful.
 
-Following command creates a virtualenv environment at `~/mxnet` directory. However, you can choose any directory by replacing `~/mxnet` with a directory of your choice.
+```
+$ docker images # Use sudo if you skip Step 2
 
-```bash
-$ virtualenv --system-site-packages ~/mxnet
+REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
+mxnet/python        latest              00d026968b3c        3 weeks ago         1.41 GB
 ```
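+
+To try the image out, you can start an interactive shell in the container and launch `python` from there (an illustrative invocation, assuming the image ships with a shell):
+
+```
+$ docker run -it mxnet/python bash # Use sudo if you skip Step 2
+```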
 
-Activate the virtualenv environment created for *MXNet*.
+**Step 4** <a href="validate_mxnet.html">Validate the installation</a>.
 
-```bash
-$ source ~/mxnet/bin/activate
-```
+</div> <!-- END of docker -->
 
-After activating the environment, you should see the prompt as below.
+<div class="build-from-source">
+<br/>
 
-```bash
-(mxnet)$
-```
+To build from source, refer to the <a href="ubuntu_setup.html">MXNet Ubuntu installation guide</a>.
 
-**Step 3**  Install MXNet in the active virtualenv environment.
+</div><!-- END of build from source -->
 
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command.
+</div><!-- END of CPU -->
+<!-- END - Linux Python CPU Installation Instructions -->
 
-```bash
-$ pip install --upgrade pip
-```
+<!-- START - Linux Python GPU Installation Instructions -->
 
+<div class="gpu">
+<div class="pip">
 <div class="v1-2-1">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet
+```
+$ pip install mxnet-cu92
 ```
 
 </div> <!-- End of v1-2-1-->
 
 <div class="v1-1-0">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==1.1.0
+```
+$ pip install mxnet-cu91==1.1.0
 ```
 
 </div> <!-- End of v1-1-0-->
 
 <div class="v1-0-0">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==1.0.0
+```
+$ pip install mxnet-cu90==1.0.0
 ```
 
 </div> <!-- End of v1-0-0-->
 
-
 <div class="v0-12-1">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==0.12.1
 ```
-
-For *MXNet* 0.12.0 -
-
-```bash
-$ pip install mxnet==0.12.0
+$ pip install mxnet-cu90==0.12.1
 ```
 
 </div> <!-- End of v0-12-1-->
 
 <div class="v0-11-0">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet==0.11.0
+```
+$ pip install mxnet-cu80==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-Install *MXNet* with OpenBLAS acceleration.
-
-```bash
-$ pip install mxnet --pre
+```
+$ pip install mxnet-cu92 --pre
 ```
 
 </div> <!-- End of master-->
+<hr> <!-- pip footer -->
+Most MXNet versions offer an experimental MKL pip package that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to <a href="https://pypi.org/project/mxnet/">PyPI for other MXNet pip packages</a>, or <a href="validate_mxnet.html">validate your MXNet installation</a>.
 
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png" alt="pip packages"/>
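+
+As a quick smoke test against the first GPU (a minimal sketch; it assumes a working CUDA install):
+
+```
+$ python -c "import mxnet as mx; print((mx.nd.ones((2, 3), ctx=mx.gpu(0)) * 2).asnumpy())"
+```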
 
-**Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
+**NOTES:**
 
-**Step 5**  Validate the installation by running simple *MXNet* code described [here](#validate-mxnet-installation).
+CUDA should be installed first. Instructions can be found in the <a href="ubuntu_setup.html#cuda-dependencies">CUDA dependencies section of the MXNet Ubuntu installation guide</a>.
 
-**Note**  You can read more about virtualenv [here](https://virtualenv.pypa.io/en/stable/userguide/).
+**Important:** Make sure your installed CUDA version matches the CUDA version in the pip package. Check your CUDA version with the following command:
+
+```
+nvcc --version
+```
 
-</div> <!-- END of virtualenv -->
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
 
+</div> <!-- END of pip -->
 
 <div class="docker">
+
 <br/>
 
 Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
@@ -385,2146 +272,806 @@ Follow the four steps in this [docker documentation](https://docs.docker.com/eng
 
 If you skip this step, you need to use *sudo* each time you invoke Docker.
 
-**Step 3** Pull the MXNet docker image.
+**Step 3** Install *nvidia-docker-plugin* following the [installation instructions](https://github.com/NVIDIA/nvidia-docker/wiki/Installation). *nvidia-docker-plugin* is required to enable the usage of GPUs from the docker containers.
 
-```bash
-$ docker pull mxnet/python # Use sudo if you skip Step 2
+**Step 4** Pull the MXNet docker image.
+
+```
+$ docker pull mxnet/python:gpu # Use sudo if you skip Step 2
 ```
 
 You can list the docker images to see if the mxnet/python image pull was successful.
 
-```bash
+```
 $ docker images # Use sudo if you skip Step 2
 
 REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
-mxnet/python        latest              00d026968b3c        3 weeks ago         1.41 GB
+mxnet/python        gpu                 493b2683c269        3 weeks ago         4.77 GB
 ```
 
-**Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+**Step 5** <a href="validate_mxnet.html">Validate the installation</a>.
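+
+To try the GPU image interactively (an illustrative invocation; `nvidia-docker` comes from the plugin installed in Step 3):
+
+```
+$ nvidia-docker run -it mxnet/python:gpu bash # Use sudo if you skip Step 2
+```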
 
 </div> <!-- END of docker -->
 
 <div class="build-from-source">
 <br/>
+Refer to the <a href="ubuntu_setup.html">MXNet Ubuntu installation guide</a>.
 
-Building *MXNet* from source is a 2 step process.
-1. Build the *MXNet* core shared library, `libmxnet.so`, from the C++ sources.
-2. Build the language specific bindings. Example - Python bindings, Scala bindings.
 
-**Minimum Requirements**
-1. [GCC 4.8](https://gcc.gnu.org/gcc-4.8/) or later to compile C++ 11.
-2. [GNU Make](https://www.gnu.org/software/make/)
+</div> <!-- END of build from source -->
+</div> <!-- END of GPU -->
+</div> <!-- END of Python -->
+<!-- END - Linux Python GPU Installation Instructions -->
 
-<br/>
 
-**Build the MXNet core shared library**
+<div class="r">
+<div class="cpu">
 
-**Step 1** Install build tools and git.
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y build-essential git
-```
+The default version of R installed with `apt-get` is too old. You will first need to [install R v3.4.4+ and build MXNet from source](ubuntu_setup.html#install-the-mxnet-package-for-r).
 
-**Step 2** Install OpenBLAS.
+After you have set up R v3.4.4+ and MXNet, you can build and install the MXNet R bindings as follows, assuming that `incubator-mxnet` is the source directory you used to build MXNet:
 
-*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machine. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
-```bash
-$ sudo apt-get install -y libopenblas-dev liblapack-dev
+```
+$ cd incubator-mxnet
+$ make rpkg
 ```
 
-**Step 3** Install OpenCV.
+</div> <!-- END of CPU -->
 
-*MXNet* uses [OpenCV](http://opencv.org/) for efficient image loading and augmentation operations.
-```bash
-$ sudo apt-get install -y libopencv-dev
-```
 
-**Step 4** Download MXNet sources and build MXNet core shared library. You can clone the repository as described in the following code block, or you may try the <a href="download.html">download links</a> for your desired MXNet version.
+<div class="gpu">
+
+The default version of R installed with `apt-get` is too old. You will first need to [install R v3.4.4+ and build MXNet from source](ubuntu_setup.html#install-the-mxnet-package-for-r).
+
+After you have set up R v3.4.4+ and MXNet, you can build and install the MXNet R bindings as follows, assuming that `incubator-mxnet` is the source directory you used to build MXNet:
 
-```bash
-$ git clone --recursive https://github.com/apache/incubator-mxnet
+```
 $ cd incubator-mxnet
-$ make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas
+$ make rpkg
 ```
 
-*Note* - USE_OPENCV and USE_BLAS are make file flags to set compilation options to use OpenCV and BLAS library. You can explore and use more compilation options in `make/config.mk`.
+</div> <!-- END of GPU -->
+</div> <!-- END of R -->
 
-<br/>
 
-**Build the MXNet Python binding**
+<div class="scala">
+<div class="gpu">
+<br/>
+You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-**Step 1** Install prerequisites - python, setup-tools, python-pip and libfortran (required for Numpy).
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
 
-```bash
-$ sudo apt-get install -y python-dev python-setuptools python-pip libgfortran3
+```html
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+</dependency>
 ```
+<br>
+</div> <!-- End of gpu -->
 
-**Step 2** Install the MXNet Python binding.
-
-```bash
-$ cd python
-$ pip install -e .
-```
+<div class="cpu">
+<br/>
+You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the package installed.
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
+```html
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+</dependency>
 ```
+<br>
+</div> <!-- End of cpu -->
+</div> <!-- End of scala -->
 
-**Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
 
-</div><!-- END of build from source -->
-</div><!-- END of CPU -->
-<!-- END - Linux Python CPU Installation Instructions -->
+<div class="julia">
+<div class="cpu gpu">
+<br/>
+Refer to the <a href="ubuntu_setup.html#install-the-mxnet-package-for-julia">Julia section of the MXNet Ubuntu installation guide</a>.
 
-<!-- START - Linux Python GPU Installation Instructions -->
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of julia -->
 
-<div class="gpu">
+<div class="perl">
+<div class="cpu gpu">
+<br/>
+Refer to the <a href="ubuntu_setup.html#install-the-mxnet-package-for-perl">Perl section of the MXNet Ubuntu installation guide</a>.
 
-The following installation instructions have been tested on Ubuntu 14.04 and 16.04.
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of perl -->
 
 
-**Prerequisites**
 
-Install the following NVIDIA libraries to setup *MXNet* with GPU support:
+<div class="cpp">
+<div class="cpu gpu">
+<br/>
+<p>To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+<br/>Refer to the <a href="c_plus_plus.html">MXNet C++ setup guide</a> for more info.</p>
+<br/>
+</div> <!-- End of cpu gpu -->
+</div> <!-- END - C++-->
+<hr>
+For more installation options, refer to the <a href="ubuntu_setup.html">MXNet Ubuntu installation guide</a>.
 
-1. Install CUDA 9.0 following the NVIDIA's [installation guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
-2. Install cuDNN 7 for CUDA 9.0 following the NVIDIA's [installation guide](https://developer.nvidia.com/cudnn). You may need to register with NVIDIA for downloading the cuDNN library.
+</div> <!-- END - Linux -->
 
-**Note:** Make sure to add CUDA install path to `LD_LIBRARY_PATH`.
 
-Example - *export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH*
+<!-- START - MacOS Python CPU Installation Instructions -->
 
+<div class="macos">
+<div class="python">
+<div class="cpu">
 <div class="pip">
-<br/>
-
-**Step 1**  Install prerequisites - wget and latest pip.
-
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command in the terminal.
+<div class="v1-2-1">
 
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y wget python
-$ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
+```
+$ pip install mxnet
 ```
 
-<div class="v1-2-1">
+</div> <!-- End of v1-2-1 -->
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.2
 
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
+<div class="v1-1-0">
 
-```bash
-nvcc --version
 ```
-
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
-
-```bash
-$ pip install mxnet-cu92
+$ pip install mxnet==1.1.0
 ```
 
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+</div> <!-- End of v1-1-0-->
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+<div class="v1-0-0">
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-cu90mkl
+```
+$ pip install mxnet==1.0.0
 ```
 
-</div> <!-- End of v1-2-1-->
-
-
-<div class="v1-1-0">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.1
+</div> <!-- End of v1-0-0-->
 
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
+<div class="v0-12-1">
 
-```bash
-nvcc --version
+```
+$ pip install mxnet==0.12.1
 ```
 
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+</div> <!-- End of v0-12-1-->
 
-```bash
-$ pip install mxnet-cu91==1.1.0
-```
 
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+<div class="v0-11-0">
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
 ```
+$ pip install mxnet==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+<div class="master">
 
-**Experimental Choice** If You would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-cu91mkl==1.1.0
+```
+$ pip install mxnet --pre
 ```
 
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+</div> <!-- End of master-->
+<hr> <!-- pip footer -->
+Most MXNet versions offer an experimental MKL pip package that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to <a href="https://pypi.org/project/mxnet/">PyPI for other MXNet pip packages</a>, or <a href="validate_mxnet.html">validate your MXNet installation</a>.
 
-</div> <!-- End of v1-1-0-->
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png" alt="pip packages"/>
 
 
-<div class="v1-0-0">
+</div> <!-- END of pip -->
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
-
-```bash
-$ pip install mxnet-cu90==1.0.0
-```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
-
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-cu90mkl==1.0.0
-```
-
-</div> <!-- End of v1-0-0-->
 
+<div class="docker">
+<br/>
 
-<div class="v0-12-1">
+Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/docker-for-mac/install/#install-and-run-docker-for-mac).
 
-```bash
-$ pip install mxnet-cu90==0.12.1
-```
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
 
-For *MXNet* 0.12.0 -
+**Step 2** Pull the MXNet docker image.
 
-```bash
-$ pip install mxnet-cu90==0.12.0
 ```
-
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
+$ docker pull mxnet/python
 ```
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+You can list the docker images to see if the mxnet/python image pull was successful.
 
-**Experimental Choice** If You would like to install mxnet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-cu90mkl==0.12.1
 ```
+$ docker images
 
-For *MXNet* 0.12.0 -
-
-```bash
-$ pip install mxnet-cu90mkl==0.12.0
+REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
+mxnet/python        latest              00d026968b3c        3 weeks ago         1.41 GB
 ```
 
-</div> <!-- End of v0-12-1-->
-
-
-<div class="v0-11-0">
+**Step 3** <a href="validate_mxnet.html">Validate the installation</a>.
 
-**Step 2**  Install *MXNet* with GPU support using CUDA 8.0
+</div> <!-- END of docker -->
 
-```bash
-$ pip install mxnet-cu80==0.11.0
-```
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
+<div class="build-from-source">
+<br/>
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+To build from source, refer to the <a href="osx_setup.html">MXNet macOS installation guide</a>.
 
-**Experimental Choice** If You would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
-```bash
-$ pip install mxnet-cu80mkl==0.11.0
-```
+MXNet developers should refer to the MXNet wiki's <a href="https://cwiki.apache.org/confluence/display/MXNET/MXNet+Developer+Setup+on+Mac">Developer Setup on Mac</a>.
+<br/>
+</div> <!-- END of build from source -->
+</div> <!-- END of CPU -->
 
-</div> <!-- End of v0-11-0-->
 
+<!-- START - Mac OS Python GPU Installation Instructions -->
+<div class="gpu">
+<div class="pip docker">
+<br/>
+This option is only available by building from source. Refer to the <a href="osx_setup.html">MXNet macOS installation guide</a>.
+<br/>
+</div>
 
-</div> <!-- END of pip -->
+<div class="build-from-source">
+<br/>
 
-<div class="virtualenv">
+Refer to the <a href="osx_setup.html">MXNet macOS installation guide</a>.
 
+MXNet developers should refer to the MXNet wiki's <a href="https://cwiki.apache.org/confluence/display/MXNET/MXNet+Developer+Setup+on+Mac">Developer Setup on Mac</a>.
 <br/>
+</div> <!-- END of build from source -->
+</div> <!-- END of GPU -->
+</div> <!-- END of Python -->
 
-**Step 1**  Install virtualenv for Ubuntu.
 
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y python-dev python-virtualenv
-```
+<!-- START - MacOS R CPU Installation Instructions -->
 
-**Step 2**  Create and activate virtualenv environment for MXNet.
+<div class="r">
+<div class="cpu">
+<br/>
 
-Following command creates a virtualenv environment at `~/mxnet` directory. However, you can choose any directory by replacing `~/mxnet` with a directory of your choice.
+Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/macosx/).
+You can [build MXNet-R from source](osx_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary:
 
-```bash
-$ virtualenv --system-site-packages ~/mxnet
+```r
+cran <- getOption("repos")
+cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
+options(repos = cran)
+install.packages("mxnet")
 ```
 
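+Once installed, you can check that the package loads (assuming the `Rscript` that ships with the R install above is on your PATH):
+
+```
+$ Rscript -e 'library(mxnet)'
+```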
-Activate the virtualenv environment created for *MXNet*.
+</div> <!-- END of CPU -->
 
-```bash
-$ source ~/mxnet/bin/activate
-```
 
-After activating the environment, you should see the prompt as below.
+<div class="gpu">
+<br/>
+Will be available soon.
 
-```bash
-(mxnet)$
-```
+</div> <!-- END of GPU -->
+</div> <!-- END of R -->
 
-**Step 3**  Install MXNet in the active virtualenv environment.
+<div class="scala">
+<div class="cpu">
+<br/>
+You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command.
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-mac cpu-green.svg" alt="maven badge"/></a>
 
-```bash
-(mxnet)$ pip install --upgrade pip
+```html
+<dependency>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+</dependency>
 ```
+<br>
+</div> <!-- End of cpu  -->
+<div class="gpu">
 
+Not available at this time. <br>
 
-<div class="v1-2-1">
+</div> <!-- End of gpu -->
+</div> <!-- End of scala -->
 
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
 
-```bash
-nvcc --version
-```
 
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+<div class="julia">
+<div class="cpu gpu">
+<br/>
+Refer to the <a href="osx_setup.html#install-the-mxnet-package-for-julia">Julia section of the MXNet macOS installation guide</a>.
 
-Install *MXNet* with GPU support using CUDA 9.2:
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of julia -->
 
-```bash
-(mxnet)$ pip install mxnet-cu92
-```
+<div class="perl">
+<div class="cpu gpu">
+<br/>
+Refer to the <a href="osx_setup.html#install-the-mxnet-package-for-perl">Perl section of the MXNet macOS installation guide</a>.
 
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of perl -->
 
-</div> <!-- End of v1-2-1-->
 
 
-<div class="v1-1-0">
+<div class="cpp">
+<br/>
+<p>To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+<br/>Refer to the <a href="c_plus_plus.html">MXNet C++ setup guide</a> for more info.</p>
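+A minimal sketch of such a build invocation, assuming the usual make-based build described in that guide (all other flags left at their defaults):
+
+```
+$ make -j $(nproc) USE_CPP_PACKAGE=1
+```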
+<br/>
+</div>
+<hr>
+For more installation options, refer to the <a href="osx_setup.html">MXNet macOS installation guide</a>.
+</div> <!-- END - Mac OS -->
 
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
 
-```bash
-nvcc --version
+
+<div class="windows">
+<div class="python">
+<div class="cpu">
+<div class="pip">
+<div class="v1-2-1">
+
+```
+$ pip install mxnet
 ```
 
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
+</div> <!-- End of v1-2-1-->
 
-Install *MXNet* with GPU support using CUDA 9.1:
+<div class="v1-1-0">
 
-```bash
-(mxnet)$ pip install mxnet-cu91==1.1.0
 ```
-
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+$ pip install mxnet==1.1.0
+```
 
 </div> <!-- End of v1-1-0-->
 
-
 <div class="v1-0-0">
 
-Install *MXNet* with GPU support using CUDA 9.0.
-
-```bash
-(mxnet)$ pip install mxnet-cu90==1.0.0
 ```
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+$ pip install mxnet==1.0.0
+```
 
 </div> <!-- End of v1-0-0-->
 
-
 <div class="v0-12-1">
 
-Install *MXNet* with GPU support using CUDA 9.0.
-
-```bash
-(mxnet)$ pip install mxnet-cu90==0.12.1
 ```
-
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
+$ pip install mxnet==0.12.1
+```
 
 </div> <!-- End of v0-12-1-->
 
-
 <div class="v0-11-0">
 
-Install *MXNet* with GPU support using CUDA 8.0.
-
-```bash
-(mxnet)$ pip install mxnet-cu80==0.11.0
+```
+$ pip install mxnet==0.11.0
 ```
 
 </div> <!-- End of v0-11-0-->
 
 <div class="master">
 
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
-
-```bash
-nvcc --version
 ```
-
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
-
-Install *MXNet* with GPU support using CUDA 9.2.
-
-```bash
-(mxnet)$ pip install mxnet-cu92 --pre
+$ pip install mxnet --pre
 ```
 
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
-
 </div> <!-- End of master-->
+<hr> <!-- pip footer -->
+Most MXNet versions offer an experimental MKL pip package that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to <a href="https://pypi.org/project/mxnet/">PyPI for other MXNet pip packages</a>, or <a href="validate_mxnet.html">validate your MXNet installation</a>.
 
-**Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
-
-**Step 5**  Validate the installation by running simple *MXNet* code described [here](#validate-mxnet-installation).
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png" alt="pip packages"/>
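+To see exactly which MXNet package and version ended up installed, you can ask pip (substitute the variant name, e.g. `mxnet-mkl`, if you installed one of the other packages):
+
+```
+$ pip show mxnet
+```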
 
-**Note**  You can read more about virtualenv [here](https://virtualenv.pypa.io/en/stable/userguide/).
 
-</div> <!-- END of virtualenv -->
+</div> <!-- End of pip -->
 
-<div class="docker">
 
+<div class="docker build-from-source">
 <br/>
 
-Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
+Refer to the <a href="windows_setup.html">MXNet Windows installation guide</a>.
 
-**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
 
-*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+</div> <!-- End of docker build-from-source -->
+</div> <!-- End of CPU -->
 
-**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
 
-Follow the four steps in this [docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user) to allow managing docker containers without *sudo*.
+<div class="gpu">
+<div class="pip">
+<div class="v1-2-1">
 
-If you skip this step, you need to use *sudo* each time you invoke Docker.
+```
+$ pip install mxnet-cu92
+```
 
-**Step 3** Install *nvidia-docker-plugin* following the [installation instructions](https://github.com/NVIDIA/nvidia-docker/wiki/Installation). *nvidia-docker-plugin* is required to enable the usage of GPUs from the docker containers.
+</div> <!-- End of v1-2-1-->
 
-**Step 4** Pull the MXNet docker image.
+<div class="v1-1-0">
 
-```bash
-$ docker pull mxnet/python:gpu # Use sudo if you skip Step 2
+```
+$ pip install mxnet-cu91==1.1.0
 ```
 
-You can list docker images to see if mxnet/python docker image pull was successful.
+</div> <!-- End of v1-1-0-->
 
-```bash
-$ docker images # Use sudo if you skip Step 2
+<div class="v1-0-0">
 
-REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
-mxnet/python        gpu                 493b2683c269        3 weeks ago         4.77 GB
+```
+$ pip install mxnet-cu90==1.0.0
 ```
 
-**Step 5** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+</div> <!-- End of v1-0-0-->
 
-</div> <!-- END of docker -->
+<div class="v0-12-1">
 
-<div class="build-from-source">
+```
+$ pip install mxnet-cu90==0.12.1
+```
 
-<br/>
+</div> <!-- End of v0-12-1-->
 
-Building *MXNet* from source is a 2 step process.
-1. Build the *MXNet* core shared library, `libmxnet.so`, from the C++ sources.
-2. Build the language specific bindings. Example - Python bindings, Scala bindings.
+<div class="v0-11-0">
 
-**Minimum Requirements**
-1. [GCC 4.8](https://gcc.gnu.org/gcc-4.8/) or later to compile C++ 11.
-2. [GNU Make](https://www.gnu.org/software/make/)
+```
+$ pip install mxnet-cu80==0.11.0
+```
 
-<br/>
+</div> <!-- End of v0-11-0-->
 
-**Build the MXNet core shared library**
+<div class="master">
 
-**Step 1** Install build tools and git.
-```bash
-$ sudo apt-get update
-$ sudo apt-get install -y build-essential git
 ```
-**Step 2** Install OpenBLAS.
-
-*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries for accelerated numerical computations on CPU machine. There are several flavors of BLAS/LAPACK libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
-```bash
-$ sudo apt-get install -y libopenblas-dev liblapack-dev
+$ pip install mxnet-cu92 --pre
 ```
 
-**Step 3** Install OpenCV.
+</div> <!-- End of master-->
+<hr> <!-- pip footer -->
+Most MXNet versions offer an experimental MKL pip package that will be much faster when running on Intel hardware.
+Check the chart below for other options, refer to <a href="https://pypi.org/project/mxnet/">PyPI for other MXNet pip packages</a>, or <a href="validate_mxnet.html">validate your MXNet installation</a>.
 
-*MXNet* uses [OpenCV](http://opencv.org/) for efficient image loading and augmentation operations.
-```bash
-$ sudo apt-get install -y libopencv-dev
-```
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png" alt="pip packages"/>
 
-**Step 4** Download MXNet sources and build MXNet core shared library. You can clone the repository as described in the following code block, or you may try the <a href="download.html">download links</a> for your desired MXNet version.
+**NOTES:**
 
-```bash
-$ git clone --recursive https://github.com/apache/incubator-mxnet
-$ cd incubator-mxnet
-$ make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1
-```
+[Anaconda](https://www.anaconda.com/download/) is recommended for managing your Python environment.
 
-*Note* - USE_OPENCV, USE_BLAS, USE_CUDA, USE_CUDA_PATH AND USE_CUDNN are make file flags to set compilation options to use OpenCV, OpenBLAS, CUDA and cuDNN libraries. You can explore and use more compilation options in `make/config.mk`. Make sure to set USE_CUDA_PATH to right CUDA installation path. In most cases it is - */usr/local/cuda*.
+CUDA should be installed first. Instructions can be found in the <a href="ubuntu_setup.html#cuda-dependencies">CUDA dependencies section of the MXNet Ubuntu installation guide</a>.
 
-<br/>
+**Important:** Make sure your installed CUDA version matches the CUDA version in the pip package. Check your CUDA version with the following command:
 
-**Install the MXNet Python binding**
+```
+nvcc --version
+```
 
-**Step 1** Install prerequisites - python, setup-tools, python-pip and libfortran (required for Numpy)..
+Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for status on CUDA 9.1 support.
 
-```bash
-$ sudo apt-get install -y python-dev python-setuptools python-pip libgfortran3
-```
+You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
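+Once installed, you can confirm that the GPU build actually works by allocating a small array on the GPU (a minimal check, assuming a CUDA-capable GPU and a matching driver are present):
+
+```
+$ python -c "import mxnet as mx; print(mx.nd.ones((2,2), ctx=mx.gpu(0)))"
+```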
 
-**Step 2** Install the MXNet Python binding.
+</div> <!-- End of pip -->
 
-```bash
-$ cd python
-$ pip install -e .
-```
+<div class="build-from-source">
+<br/>
 
-Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the package installed.
+To build from source, refer to the <a href="windows_setup.html">MXNet Windows installation guide</a>.
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-sudo apt-get install graphviz
-pip install graphviz
-```
 
-**Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+</div> <!-- End of build-from-source -->
+</div> <!-- End of GPU -->
+</div> <!-- End of Python -->
 
-</div> <!-- END of build from source -->
-</div> <!-- END of GPU -->
-</div> <!-- END of Python -->
-<!-- END - Linux Python GPU Installation Instructions -->
 
+<!-- START - Windows R CPU Installation Instructions -->
 
 <div class="r">
 <div class="cpu">
+<br/>
 
-The default version of R that is installed with `apt-get` is insufficient. You will need to first [install R v3.4.4+ and build MXNet from source](ubuntu_setup.html#install-the-mxnet-package-for-r).
-
-After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the following, assuming that `incubator-mxnet` is the source directory you used to build MXNet as follows:
+Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/windows/).
+You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary:
 
-```bash
-$ cd incubator-mxnet
-$ make rpkg
+```r
+cran <- getOption("repos")
+cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
+options(repos = cran)
+install.packages("mxnet")
 ```
 
-</div> <!-- END of CPU -->
-
+</div> <!-- END - Windows R CPU -->
 
 <div class="gpu">
+<br/>
 
-The default version of R that is installed with `apt-get` is insufficient. You will need to first [install R v3.4.4+ and build MXNet from source](ubuntu_setup.html#install-the-mxnet-package-for-r).
-
-After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the following, assuming that `incubator-mxnet` is the source directory you used to build MXNet as follows:
+You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary:
 
-```bash
-$ cd incubator-mxnet
-$ make rpkg
+```r
+  cran <- getOption("repos")
+  cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92"
+  options(repos = cran)
+  install.packages("mxnet")
 ```
+Change cu92 to cu80, cu90, or cu91 based on your CUDA toolkit version; these are the CUDA versions MXNet currently supports.
 
 </div> <!-- END of GPU -->
-</div> <!-- END of R -->
-
+</div> <!-- END - Windows R -->
 
 <div class="scala">
-<div class="gpu">
-
-You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
-
-<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
-
-```html
-<dependency>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
-</dependency>
-```
-<br>
-</div> <!-- End of gpu -->
-
-<div class="cpu">
-
-You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
-
-<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
-
-```html
-<dependency>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
-</dependency>
-```
-<br>
-</div> <!-- End of cpu -->
+<div class="cpu gpu">
+<br/>
+MXNet-Scala for Windows is not yet available.
+<br/>
+</div> <!-- End of cpu gpu -->
 </div> <!-- End of scala -->
 
-
-<div class="julia perl">
+<div class="julia">
 <div class="cpu gpu">
-
-Follow the installation instructions [in this guide](./ubuntu_setup.md) to set up MXNet.
+<br/>
+Refer to the <a href="windows_setup.html#install-the-mxnet-package-for-julia">Julia section of the MXNet Windows installation guide</a>.
 
 </div> <!-- End of cpu gpu -->
-</div> <!-- End of julia perl -->
+</div> <!-- End of julia -->
+
+<div class="perl">
+<div class="cpu gpu">
+<br/>
+Refer to the <a href="windows_setup.html#install-the-mxnet-package-for-perl">Perl section of the MXNet Windows installation guide</a>.
 
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of perl -->
 
 <div class="cpp">
 <div class="cpu gpu">
-<p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
+<br/>
+<p>To enable the C++ package, build from source using `make USE_CPP_PACKAGE=1`.
+<br/>Refer to the <a href="c_plus_plus.html">MXNet C++ setup guide</a> for more info.</p>
 <br/>
 </div> <!-- End of cpu gpu -->
-</div> <!-- END - C++-->
-</div> <!-- END - Linux -->
-
-
-<!-- START - MacOS Python CPU Installation Instructions -->
-
-<div class="macos">
-<div class="python">
-<div class="cpu">
-
-The following installation instructions have been tested on OSX Sierra and El Capitan.
+</div> <!-- End of C++ -->
+<hr>
+For more installation options, refer to the <a href="windows_setup.html">MXNet Windows installation guide</a>.
+</div> <!-- End of Windows -->
 
 
-<div class="pip">
-<br/>
+<!-- START - Cloud Python Installation Instructions -->
 
-**Step 1**  Install prerequisites - Homebrew, python development tools.
+<div class="cloud">
 
-```bash
-# Install Homebrew
-$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
-$ export PATH=/usr/local/bin:/usr/local/sbin:$PATH
+AWS Marketplace distributes Deep Learning AMIs (Amazon Machine Image) with MXNet pre-installed. You can launch one of these Deep Learning AMIs by following instructions in the [AWS Deep Learning AMI Developer Guide](http://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html).
 
-# Install python development tools - python2.7, pip, python-setuptools
-$ brew install python
-```
+You can also run distributed deep learning with *MXNet* on AWS using [Cloudformation Template](https://github.com/awslabs/deeplearning-cfn/blob/master/README.md).
 
-**Step 2** Install MXNet with OpenBLAS acceleration.
+</div> <!-- END - Cloud Python Installation Instructions -->
 
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command.
 
-```bash
-$ pip install --upgrade pip
-$ pip install --upgrade setuptools
-```
+<!-- DEVICES -->
+<div class="devices">
+  <div class="raspberry-pi">
 
-<div class="v1-2-1">
+MXNet supports Raspbian, the Debian-based operating system for ARM, so you can run MXNet on Raspberry Pi devices.
 
-Then use pip to install MXNet:
+These instructions will walk through how to build MXNet for the Raspberry Pi and install the Python bindings for the library.
 
-```bash
-$ pip install mxnet
-```
-</div> <!-- End of v1-2-1 -->
+You can do a dockerized cross-compilation build on your local machine or a native build on-device.
 
+The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free space.
 
-<div class="v1-1-0">
+**Cross compilation build (Experimental)**
 
-Then use pip to install MXNet:
+## Docker installation
+**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
 
-```bash
-$ pip install mxnet==1.1.0
-```
+*Note* - You can install Community Edition (CE) to get started with *MXNet*.
 
-</div> <!-- End of v1-1-0-->
+**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
 
+Follow the four steps in this [docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user) to allow managing docker containers without *sudo*.
 
-<div class="v1-0-0">
+## Build
 
-Then use pip to install MXNet:
+The following command builds a container with the required dependencies and tools, and then compiles MXNet for
+ARMv7. The resulting artifact will be located in `build/mxnet-x.x.x-py2.py3-none-any.whl`; copy this
+file to your Raspberry Pi.
 
-```bash
-$ pip install mxnet==1.0.0
+```
+ci/build.py -p armv7
 ```
 
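+One way to copy the wheel over is `scp` (a sketch; `pi@raspberrypi.local` is a placeholder for your own user and host, and the wheel name will carry the actual version number):
+
+```
+$ scp build/mxnet-x.x.x-py2.py3-none-any.whl pi@raspberrypi.local:~
+```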
-</div> <!-- End of v1-0-0-->
-
-<div class="v0-12-1">
+## Install
 
-Then use pip to install MXNet:
+Create a virtualenv and install the package we created previously.
 
-```bash
-$ pip install mxnet=0.12.1
 ```
-
-For MXNet 0.12.0 -
-
-```bash
-$ pip install mxnet=0.12.0
+virtualenv -p `which python3` mxnet_py3
+source mxnet_py3/bin/activate
+pip install mxnet-x.x.x-py2.py3-none-any.whl
 ```
 
 
-</div> <!-- End of v0-12-1-->
-
-
-<div class="v0-11-0">
-
-Then use pip to install MXNet:
-
-```bash
-$ pip install mxnet==0.11.0
-```
+**Native Build**
 
-</div> <!-- End of v0-11-0-->
+Installing MXNet is a two-step process:
 
-<div class="master">
+1. Build the shared library from the MXNet C++ source code.
+2. Install the supported language-specific packages for MXNet.
 
-Then use pip to install MXNet:
+**Step 1** Build the Shared Library
 
-```bash
-$ pip install mxnet --pre
-```
+On Raspbian versions Wheezy and later, you need the following dependencies:
 
-</div> <!-- End of master-->
+- Git (to pull code from GitHub)
 
-**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-$ brew install graphviz
-$ pip install graphviz
-```
+- libblas (for linear algebraic operations)
 
-**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+- libopencv (for computer vision operations; optional, skip it to save RAM and disk space)
 
-</div> <!-- END of pip -->
+- A C++ compiler that supports C++ 11. The C++ compiler compiles and builds MXNet source code. Supported compilers include the following:
 
+    - [G++ (4.8 or later)](https://gcc.gnu.org/gcc-4.8/). Make sure to use gcc 4.x and not 5 or 6, as there are known bugs with those compilers.
+    - [Clang (3.9 - 6)](https://clang.llvm.org/)
 
-<div class="virtualenv">
-<br/>
+Install these dependencies using the following commands in any directory:
 
-**Step 1**  Install prerequisites - Homebrew, python development tools.
+```
+    sudo apt-get update
+    sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev virtualenv
+```
 
-```bash
-# Install Homebrew
-$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
-$ export PATH=/usr/local/bin:/usr/local/sbin:$PATH
+Clone the MXNet source code repository using the following `git` command in your home directory:
+```
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
+    cd incubator-mxnet
+```
 
-# Install python development tools - python2.7, pip, python-setuptools
-$ brew install python
+Build:
+```
+    mkdir -p build && cd build
+    cmake \
+        -DUSE_SSE=OFF \
+        -DUSE_CUDA=OFF \
+        -DUSE_OPENCV=ON \
+        -DUSE_OPENMP=ON \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -GNinja ..
+    ninja -j$(nproc)
 ```
+Some compilation units require close to 1GB of memory, so it's recommended that you enable swap as
+explained below and be cautious about increasing the number of build jobs (`-j`).
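+For example, if the build is killed part-way through, you can rerun just the compile step with a single job (slower, but far less likely to exhaust RAM):
+
+```
+$ ninja -j1
+```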
 
-**Step 2**  Install virtualenv for macOS.
+Executing these commands starts the build process, which can take up to a couple of hours, and creates a file called `libmxnet.so` in the build directory.
 
-```bash
-$ pip install virtualenv
+If you are getting build errors in which the compiler is being killed, it is likely that the
+compiler is running out of memory, especially if you are on a Raspberry Pi 1, 2 or Zero, which have
+less than 1GB of RAM. This can often be rectified by increasing the swapfile size on the Pi: edit
+the file `/etc/dphys-swapfile`, change the line `CONF_SWAPSIZE=100` to `CONF_SWAPSIZE=1024`, and
+then run:
+```
+  sudo /etc/init.d/dphys-swapfile stop
+  sudo /etc/init.d/dphys-swapfile start
+  free -m # to verify the swapfile size has been increased
 ```
 
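+If you prefer to make that change non-interactively, a `sed` one-liner can do it (this assumes the default `CONF_SWAPSIZE=100` line is present):
+
+```
+$ sudo sed -i 's/^CONF_SWAPSIZE=100$/CONF_SWAPSIZE=1024/' /etc/dphys-swapfile
+```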
-**Step 3**  Create and activate virtualenv environment for MXNet.
+**Step 2** Install MXNet Python Bindings
 
-Following command creates a virtualenv environment at `~/mxnet` directory. However, you can choose any directory by replacing `~/mxnet` with a directory of your choice.
+To install the Python bindings, run the following commands in the MXNet directory:
 
-```bash
-$ virtualenv --system-site-packages ~/mxnet
 ```
-
-Activate the virtualenv environment created for *MXNet*.
-
-```bash
-$ source ~/mxnet/bin/activate
+    cd python
+    pip install --upgrade pip
+    pip install -e .
 ```
 
-After activating the environment, you should see the prompt as below.
+Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the package installed.
 
-```bash
-(mxnet)$
+Alternatively, you can create a wheel (`.whl`) package installable with pip using the following command:
+```
+ci/docker/runtime_functions.sh build_wheel python/ $(realpath build)
 ```
 
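+The resulting wheel can then be installed like any other (a sketch; the exact file name and output location depend on the version that was built, so adjust the path accordingly):
+
+```
+$ pip install mxnet-x.x.x-py2.py3-none-any.whl
+```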
-**Step 4**  Install MXNet in the active virtualenv environment.
 
-Installing *MXNet* with pip requires a latest version of `pip`. Install the latest version of `pip` by issuing the following command.
+You are now ready to run MXNet on your Raspberry Pi device. You can get started by following the tutorial on [Real-time Object Detection with MXNet On The Raspberry Pi](http://mxnet.io/tutorials/embedded/wine_detector.html).
 
-```bash
-(mxnet)$ pip install --upgrade pip
-(mxnet)$ pip install --upgrade setuptools
-```
+*Note - Because the complete MXNet library takes up a significant amount of the Raspberry Pi's limited RAM, when loading training data or large models into memory, you might have to turn off the GUI and terminate running processes to free RAM.*
 
-<div class="v1-2-1">
+</div> <!-- End of raspberry pi -->
 
-Install *MXNet* with OpenBLAS acceleration.
 
-```bash
-(mxnet)$ pip install mxnet
-```
+<div class="nvidia-jetson">
 
-</div> <!-- End of v1-2-1-->
+# NVIDIA Jetson TX family
 
-<div class="v1-1-0">
+MXNet supports Ubuntu on AArch64-based systems, so you can run MXNet on NVIDIA Jetson devices.
 
-Install *MXNet* with OpenBLAS acceleration.
+These instructions will walk through how to build MXNet for the Pascal-based [NVIDIA Jetson TX2](http://www.nvidia.com/object/embedded-systems-dev-kits-modules.html) and install the corresponding Python language bindings.
 
-```bash
-(mxnet)$ pip install mxnet==1.1.0
-```
+For the purposes of this install guide we will assume that CUDA is already installed on your Jetson device.
 
-</div> <!-- End of v1-1-0-->
+**Install MXNet**
 
-<div class="v1-0-0">
+Installing MXNet is a two-step process:
 
-Install *MXNet* with OpenBLAS acceleration.
+1. Build the shared library from the MXNet C++ source code.
+2. Install the supported language-specific packages for MXNet.
 
-```bash
-(mxnet)$ pip install mxnet==1.0.0
-```
+**Step 1** Build the Shared Library
 
-</div> <!-- End of v1-0-0-->
+You need the following additional dependencies:
 
+- Git (to pull code from GitHub)
 
-<div class="v0-12-1">
+- libatlas (for linear algebraic operations)
 
-Install *MXNet* with OpenBLAS acceleration.
+- libopencv (for computer vision operations)
 
-```bash
-(mxnet)$ pip install mxnet==0.12.1
-```
+- python pip (to load relevant python packages for our language bindings)
 
-For *MXNet* 0.12.0 -
+Install these dependencies using the following commands in any directory:
 
-```bash
-(mxnet)$ pip install mxnet==0.12.0
+```
+    sudo apt-get update
+    sudo apt-get -y install git build-essential libatlas-base-dev libopencv-dev graphviz python-pip
+    sudo pip install pip --upgrade
+    sudo pip install setuptools numpy --upgrade
+    sudo pip install graphviz jupyter
 ```
 
+Clone the MXNet source code repository using the following `git` command in your home directory:
+```
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
+    cd incubator-mxnet
+```
 
-</div> <!-- End of v0-12-1-->
-
-<div class="v0-11-0">
+Edit the Makefile to build MXNet with CUDA bindings, so it can leverage the GPU on the Jetson:
+```
+    cp make/crosscompile.jetson.mk config.mk
+```
 
-Install *MXNet* with OpenBLAS acceleration.
+To ensure MXNet builds with Pascal's hardware-level low-precision acceleration, edit the Mshadow Makefile `3rdparty/mshadow/make/mshadow.mk` and add the following after line 122:
+```
+MSHADOW_CFLAGS += -DMSHADOW_USE_PASCAL=1
+```
 
-```bash
-(mxnet)$ pip install mxnet==0.11.0
+Now you can build the complete MXNet library with the following command:
+```
+    make -j $(nproc)
 ```
 
-</div> <!-- End of v0-11-0-->
+Executing this command creates a file called `libmxnet.so` in the mxnet/lib directory.
 
-<div class="master">
+**Step 2** Install MXNet Python Bindings
 
-Install *MXNet* with OpenBLAS acceleration.
+To install the Python bindings, run the following commands in the MXNet directory:
 
-```bash
-(mxnet)$ pip install mxnet --pre
 ```
-
-</div> <!-- End of master-->
-
-
-**Step 5**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
-```bash
-$ brew install graphviz
-(mxnet)$ pip install graphviz
+    cd python
+    pip install --upgrade pip
+    pip install -e .
 ```
 
-**Step 6**  Validate the installation by running simple *MXNet* code described [here](#validate-mxnet-installation).
-
-**Note**  You can read more about virtualenv [here](https://virtualenv.pypa.io/en/stable/userguide/).
+Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the package installed.
 
-</div> <!-- End of virtualenv -->
+Add the mxnet folder to your Python path:
 
+```
+    cd ..
+    export MXNET_HOME=$(pwd)
+    echo "export PYTHONPATH=$MXNET_HOME/python:$PYTHONPATH" >> ~/.rc
+    source ~/.rc
+```
 
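+To confirm the path is picked up, you can print where Python resolves the package from (the path should point into your MXNet clone):
+
+```
+$ python -c "import mxnet; print(mxnet.__file__)"
+```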
-<div class="docker">
-<br/>
+You are now ready to run MXNet on your NVIDIA Jetson TX2 device.
 
-Docker images with *MXNet* are available at [Docker Hub](https://hub.docker.com/r/mxnet/).
+</div> <!-- End of jetson -->
+</div> <!-- End of devices -->
 
-**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/docker-for-mac/install/#install-and-run-docker-for-mac).
 
-*Note* - You can install Community Edition (CE) to get started with *MXNet*.
+<!-- This # tag restarts the page and allows reuse
+ of the div classes for validation sections, etc -->
 
-**Step 2** Pull the MXNet docker image.
 
-```bash
-$ docker pull mxnet/python
-```
-
-You can list docker images to see if mxnet/python docker image pull was successful.
-
-```bash
-$ docker images
-
-REPOSITORY          TAG                 IMAGE ID            CREATED             SIZE
-mxnet/python        latest              00d026968b3c        3 weeks ago         1.41 GB
-```
-
-**Step 4** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-</div> <!-- END of docker -->
-
-
-<div class="build-from-source">
-<br/>
-
-**Prerequisites**
-
-If not already installed, [download and install Xcode](https://developer.apple.com/xcode/) (or [insall it from the App Store](https://itunes.apple.com/us/app/xcode/id497799835)) for macOS. [Xcode](https://en.wikipedia.org/wiki/Xcode) is an integrated development environment for macOS containing a suite of software development tools like C/C++ compilers, BLAS library and more.
-
-<br/>
-
-Building *MXNet* from source is a 2 step process.
-1. Build the *MXNet* core shared library, `libmxnet.so`, from the C++ sources.
-2. Build the language specific bindings. Example - Python bindings, Scala bindings.
-
-Make sure you have installed Xcode before proceeding further.
-
-<br/>
-
-All the instructions to build *MXNet* core shared library and *MXNet* Python bindings are compiled as one helper *bash* script. You can use [this bash script](https://raw.githubusercontent.com/dmlc/mxnet/master/setup-utils/install-mxnet-osx-python.sh) to build *MXNet* for Python, from source, on macOS.
-
-**Step 1** Download the bash script for building MXNet from source.
-
-```bash
-$ curl -O https://raw.githubusercontent.com/dmlc/mxnet/master/setup-utils/install-mxnet-osx-python.sh
-```
-
-**Step 2** Run the script to get latest MXNet source and build.
-
-```bash
-# Make the script executable
-$ chmod 744 install-mxnet-osx-python.sh
-
-# Run the script. It takes around 5 mins.
-$ bash install-mxnet-osx-python.sh
-```
-
-**Step 3** Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
-
-</div> <!-- END of build from source -->
-</div> <!-- END of CPU -->
-
-
-<!-- START - Mac OS Python GPU Installation Instructions -->
-<div class="gpu">
-<div class="pip virtualenv docker">
-</br>
-
-Try the **Build from Source** option for now.
-
-</div>
-
-<div class="build-from-source">
-
-**Step 1**  Install prerequisites - Homebrew, python development tools.
-
-```bash
-# Install Homebrew
-$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
-$ export PATH=/usr/local/bin:/usr/local/sbin:$PATH
-
-# Install python development tools - python2.7, pip, python-setuptools
-$ brew install python pkg-config graphviz
-```
-
-**Step 2**  Install optional components - OpenCV
-
-If you want to use OpenCV you should install it first, then build MXNet with the `USE_OPENCV=1` option in the later steps.
-
-```bash
-brew tap homebrew/science
-brew install opencv
-
-```
-
-**Step 3**  Install CUDA and cuDNN
-
-The following instructions are for CUDA 9.1 and cuDNN 7 for macOS 10.12+ and a CUDA-capable GPU. They summarize confirmed successful builds in [#9217](https://github.com/apache/incubator-mxnet/issues/9217).
-Alternatively, you may follow the [CUDA installation instructions for macOS](https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html).
-
-1. [Download Xcode 8.3.3 from Apple](https://developer.apple.com/download/more/). This is the version [NVIDIA specifies in its instructions for macOS](https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html). Unzip and rename to `Xcode8.3.3.app`.
-
-2. Run `sudo xcode-select -s /Applications/Xcode8.3.3.app` or to wherever you have placed Xcode.
-
-3. Run `xcode-select --install` to install all command line tools, compilers, etc.
-
-4. Run `sudo xcodebuild -license accept` to accept Xcode's licensing terms.
-
-5. Install CUDA for macOS. Specific steps are provided in NVIDIA's [CUDA installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#installation).
-
-6. [Download](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#download-mac) and [install](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac) cuDNN for macOS. You will need to [create a free developer account](https://developer.nvidia.com/accelerated-computing-developer) with NVIDIA prior to getting the download link.
-
-**Step 4**  Build MXNet
-
-1. Run `git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet` to get the latest version.
-
-2. Run `cd mxnet`.
-
-3. Edit the `make/osx.mk` file to set the following parameters:
-
-    ```
-    USE_CUDA = 1
-    USE_CUDA_PATH = /usr/local/cuda
-    USE_CUDNN = 1
-    USE_OPENCV = 0   # set to 1 if you want to build with OpenCV
-    ```
-
-4. Copy the `make/osx.mk` to `config.mk`
-
-5. Run `make`. If you previously attempted to compile you might want to do `make clean_all` first. You can also run `make -j` with the number of processors you have to compile with multithreading. There'll be plenty of warnings, but there should be no errors.
-
-6. Once finished, you should have a file called `libmxnet.so` in `lib/`.
-
-7. Do `cd python`.
-
-8. Run `sudo pip install -e .` **Note**: the `.` is part of the command.
-
-</div> <!-- END of build from source -->
-</div> <!-- END of GPU -->
-</div> <!-- END of Python -->
-
-
-<!-- START - MacOS R CPU Installation Instructions -->
-
-<div class="r">
-<div class="cpu">
-
-Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/macosx/).
-You can [build MXNet-R from source](osx_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary:
-
-```r
-cran <- getOption("repos")
-cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
-options(repos = cran)
-install.packages("mxnet")
-```
-
-</div> <!-- END of CPU -->
-
-
-<div class="gpu">
-
-Will be available soon.
-
-</div> <!-- END of GPU -->
-</div> <!-- END of R -->
-
-<div class="scala">
-<div class="cpu">
-
-You can use the Maven packages defined in the following `dependency` to include MXNet in your Scala project. Please refer to the <a href="scala_setup.html">MXNet-Scala setup guide</a> for a detailed set of instructions to help you with the setup process.
-
-<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-mac cpu-green.svg" alt="maven badge"/></a>
-
-```html
-<dependency>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
-</dependency>
-```
-<br>
-</div> <!-- End of cpu  -->
-<div class="gpu">
-
-Not available at this time. <br>
-
-</div>
-</div> <!-- End of scala -->
-
-
-<div class="julia perl">
-<div class="cpu gpu">
-
-Follow the installation instructions [in this guide](./osx_setup.md) to set up MXNet.
-
-</div> <!-- End of cpu gpu -->
-</div> <!-- End of julia perl -->
-
-
-<div class="cpp">
-<p>To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>.</p>
-<br/>
-</div>
-</div> <!-- END - Mac OS -->
-
-
-
-
-
-
-
-
-
-<div class="windows">
-<div class="python">
-<div class="cpu">
-<div class="pip">
-
-<br/>
-
-**Step 1**  Install Python.
-
-[Anaconda](https://www.anaconda.com/download/) is recommended.
-
-<div class="v1-2-1">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet
-```
-
-</div> <!-- End of v1-2-1-->
-
-<div class="v1-1-0">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet==1.1.0
-```
-
-</div> <!-- End of v1-1-0-->
-
-<div class="v1-0-0">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet==1.0.0
-```
-
-</div> <!-- End of v1-0-0-->
-
-
-<div class="v0-12-1">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet==0.12.1
-```
-
-For *MXNet* 0.12.0 -
-
-```bash
-$ pip install mxnet==0.12.0
-```
-
-
-</div> <!-- End of v0-12-1-->
-
-<div class="v0-11-0">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet==0.11.0
-```
-
-
-</div> <!-- End of v0-11-0-->
-
-<div class="master">
-
-**Step 2**  Install *MXNet*.
-
-```bash
-$ pip install mxnet --pre
-```
-
-</div> <!-- End of master-->
-
-
-</div> <!-- End of pip -->
-
-
-<div class="virtualenv docker build-from-source">
-
-Follow the installation instructions [in this guide](./windows_setup.md) to set up MXNet.
-
-</div> <!-- End of virtualenv docker build-from-source -->
-</div> <!-- End of CPU -->
-
-
-<div class="gpu">
-<div class="pip">
-
-<br/>
-
-**Step 1**  Install Python.
-
-[Anaconda](https://www.anaconda.com/download/) is recommended.
-
-
-<div class="v1-2-1">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
-
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
-
-```bash
-nvcc --version
-```
-
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
-
-```bash
-$ pip install mxnet-cu92
-```
-
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
-
-</div> <!-- End of v1-2-1-->
-
-<div class="v1-1-0">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.1.
-
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
-
-```bash
-nvcc --version
-```
-
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
-
-```bash
-$ pip install mxnet-cu91==1.1.0
-```
-
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
-
-</div> <!-- End of v1-1-0-->
-
-<div class="v1-0-0">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
-
-```bash
-$ pip install mxnet-cu90==1.0.0
-```
-
-</div> <!-- End of v1-0-0-->
-
-<div class="v0-12-1">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
-
-```bash
-$ pip install mxnet-cu90==0.12.1
-```
-
-Install *MXNet* 0.12.0 with GPU support using CUDA 9.0.
-
-```bash
-$ pip install mxnet-cu90==0.12.0
-```
-
-</div> <!-- End of v0-12-1-->
-
-<div class="v0-11-0">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 8.0.
-
-```bash
-$ pip install mxnet-cu80==0.11.0
-```
-
-</div> <!-- End of v0-11-0-->
-
-<div class="master">
-
-**Step 2**  Install *MXNet* with GPU support using CUDA 9.2.
-
-**Important**: Make sure your installed CUDA version matches the CUDA version in the pip package.
-Check your CUDA version with the following command:
-
-```bash
-nvcc --version
-```
-
-You can either upgrade your CUDA install or install the MXNet package that supports your CUDA version.
-
-```bash
-$ pip install mxnet-cu92 --pre
-```
-
-Refer to [pypi for older packages](https://pypi.org/project/mxnet/).
-
-</div> <!-- End of master-->
-
-Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for status on CUDA 9.1 support.
-
-</div>
-<div class="build-from-source">
-<br/>
-
-We provide both options to build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), and [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/).
-
-**Option 1**
-
-To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies:
-
-1. If [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) is not already installed, download and install it. You can download and install the free community edition.
-2. Download and install [CMake](https://cmake.org/files/v3.11/cmake-3.11.0-rc4-win64-x64.msi) if it is not already installed.
-3. Download and install [OpenCV](https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.4.1/opencv-3.4.1-vc14_vc15.exe/download).
-4. Unzip the OpenCV package.
-5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g., ```OpenCV_DIR = C:\utils\opencv\build```).
-6. If you don’t have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](https://sourceforge.net/projects/openblas/files/v0.2.20/OpenBLAS%200.2.20%20version.zip/download).
-7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories (e.g., ```OpenBLAS_HOME = C:\utils\OpenBLAS```).
-8. Download and install CUDA: Install [CUDA](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal), and Download the base installer (e.g., ```cuda_9.1.85_win10.exe```).
-9. Download and install cuDNN. To get access to the download link, register as an NVIDIA community user. Then Follow the [link](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#install-windows) to install the cuDNN.
-10. Download and install [git](https://git-for-windows.github.io/).
-
-After you have installed all of the required dependencies, build the MXNet source code:
-
-1. Start ```cmd``` in windows.
-
-2. Download the MXNet source code from GitHub by using following command:
-
-```r
-cd C:\
-git clone https://github.com/apache/incubator-mxnet.git --recursive
-```
-
-3. Follow [this link](https://docs.microsoft.com/en-us/visualstudio/install/modify-visual-studio) to modify ```Individual components```, and check ```VC++ 2017 version 15.4 v14.11 toolset```, and click ```Modify```.
-
-4. Change the version of the Visual studio 2017 to v14.11 using the following command (by default the VS2017 is installed in the following path):
-
-```r
-"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11
-```
-
-5. Create a build dir using the following command and go to the directory, for example:
-
-```r
-mkdir C:\build
-cd C:\build
-```
-
-6. CMake the MXNet source code by using following command:
-
-```r
-cmake -G "Visual Studio 15 2017 Win64" -T cuda=9.1,host=x64 -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_LIST=Common -DCUDA_TOOLSET=9.1 -DCUDNN_INCLUDE=C:\cuda\include -DCUDNN_LIBRARY=C:\cuda\lib\x64\cudnn.lib "C:\incubator-mxnet"
-```
-
-NOTE: make sure the DCUDNN_INCLUDE and DCUDNN_LIBRARY pointing to the “include” and “cudnn.lib” of your CUDA installed location, and the ```C:\incubator-mxnet``` is the location of the source code you just git in the previous step
-
-7. After the CMake successfully completed, compile the the MXNet source code by using following command:
-
-```r
-msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
-```
-
-**Option 2**
-
-To build and install MXNet yourself using [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/), you need the following dependencies. Install the required dependencies:
-
-1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition. At least Update 3 of Microsoft Visual Studio 2015 is required to build MXNet from source. Upgrade via it's ```Tools -> Extensions and Updates... | Product Updates``` menu.
-2. Download and install [CMake](https://cmake.org/) if it is not already installed.
-3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
-4. Unzip the OpenCV package.
-5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable.
-6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBlas](http://sourceforge.net/projects/openblas/files/v0.2.14/).
-7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```.
-8. Download and install [CUDA](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64) and [cuDNN](https://developer.nvidia.com/cudnn). To get access to the download link, register as an NVIDIA community user.
-9. Set the environment variable ```CUDACXX``` to point to the ```CUDA Compiler```(```C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1\bin\nvcc.exe``` for example).
-10. Set the environment variable ```CUDNN_ROOT``` to point to the ```cuDNN``` directory that contains the ```include```,  ```lib``` and ```bin``` directories (```C:\Downloads\cudnn-9.1-windows7-x64-v7\cuda``` for example).
-
-After you have installed all of the required dependencies, build the MXNet source code:
-
-1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet) (make sure you also download third parties submodules e.g. ```git clone --recurse-submodules```).
-2. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build```.
-3. In Visual Studio, open the solution file,```.sln```, and compile it.
-These commands produce a library called ```mxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
-
-&nbsp;
-Next, we install the ```graphviz``` library that we use for visualizing network graphs that you build on MXNet. We will also install [Jupyter Notebook](http://jupyter.readthedocs.io/) which is used for running MXNet tutorials and examples.
-- Install the ```graphviz``` by downloading the installer from the [Graphviz Download Page](https://graphviz.gitlab.io/_pages/Download/Download_windows.html).
-**Note** Make sure to add the `graphviz` executable path to the PATH environment variable. Refer [here for more details](http://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft)
-
-
-&nbsp;
-</div> <!-- End of pip -->
-</div> <!-- End of GPU -->
-</div> <!-- End of Python -->
-
-
-<!-- START - Windows R CPU Installation Instructions -->
-
-<div class="r">
-<div class="cpu">
-
-Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/windows/).
-You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary:
-
-```r
-cran <- getOption("repos")
-cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
-options(repos = cran)
-install.packages("mxnet")
-```
-
-</div> <!-- END - Windows R CPU -->
-
-<div class="gpu">
-
-You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary:
-
-```r
-  cran <- getOption("repos")
-  cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92"
-  options(repos = cran)
-  install.packages("mxnet")
-```
-Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
-
-</div> <!-- END of GPU -->
-</div> <!-- END - Windows R -->
-
-<div class="scala">
-<div class="cpu gpu">
-
-MXNet-Scala for Windows is not yet available.
-<br>
-</div> <!-- End of cpu gpu -->
-</div> <!-- End of scala -->
-
-<div class="julia perl">
-<div class="cpu gpu">
-
-Follow the installation instructions [in this guide](./windows_setup.md) to set up MXNet.
-
-</div> <!-- End of cpu gpu -->
-</div> <!-- End of julia perl -->
-
-<div class="cpp">
-<div class="cpu gpu">
-<p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
-<br/>
-</div> <!-- End of cpu gpu -->
-</div> <!-- End of C++ -->
-</div> <!-- End of Windows -->
-
-
-<!-- START - Cloud Python Installation Instructions -->
-
-<div class="cloud">
-
-AWS Marketplace distributes Deep Learning AMIs (Amazon Machine Image) with MXNet pre-installed. You can launch one of these Deep Learning AMIs by following instructions in the [AWS Deep Learning AMI Developer Guide](http://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html).
-
-You can also run distributed deep learning with *MXNet* on AWS using [Cloudformation Template](https://github.com/awslabs/deeplearning-cfn/blob/master/README.md).
-
-</div> <!-- END - Cloud Python Installation Instructions -->
-
-
-<!-- DEVICES -->
-<div class="devices">
-  <div class="raspberry-pi">
-
-MXNet supports the Debian based Raspbian ARM based operating system so you can run MXNet on Raspberry Pi Devices.
-
-These instructions will walk through how to build MXNet for the Raspberry Pi and install the Python bindings for the library.
-
-You can do a dockerized cross compilation build on your local machine or a native build on-device.
-
-The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free memory.
-
-**Cross compilation build (Experimental)**
-
-## Docker installation
-**Step 1**  Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
-
-*Note* - You can install Community Edition (CE)
-
-**Step 2** [Optional] Post installation steps to manage Docker as a non-root user.
-
-Follow the four steps in this [docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user) to allow managing docker containers without *sudo*.
-
-## Build
-
-The following command will build a container with dependencies and tools and then compile MXNet for
-ARMv7. The resulting artifact will be located in `build/mxnet-x.x.x-py2.py3-none-any.whl`, copy this
-file to your Raspberry Pi.
-
-```bash
-ci/build.py -p armv7
-```
-
-## Install
-
-Create a virtualenv and install the package we created previously.
-
-```bash
-virtualenv -p `which python3` mxnet_py3
-source mxnet_py3/bin/activate
-pip install mxnet-x.x.x-py2.py3-none-any.whl
-```
-
-
-**Native Build**
-
-Installing MXNet is a two-step process:
-
-1. Build the shared library from the MXNet C++ source code.
-2. Install the supported language-specific packages for MXNet.
-
-**Step 1** Build the Shared Library
-
-On Raspbian versions Wheezy and later, you need the following dependencies:
-
-- Git (to pull code from GitHub)
-
-- libblas (for linear algebraic operations)
-
-- libopencv (for computer vision operations; this is optional and can be skipped to save RAM and disk space)
-
-- A C++ compiler that supports C++11. The C++ compiler compiles and builds MXNet source code. Supported compilers include the following:
-
-- [G++ (4.8 or later)](https://gcc.gnu.org/gcc-4.8/). Make sure to use GCC 4 and not 5 or 6, as there
-  are known bugs with those compilers.
-
-Install these dependencies using the following commands in any directory:
-
-```bash
-    sudo apt-get update
-    sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 gcc-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev virtualenv
-```
-
-Clone the MXNet source code repository using the following `git` command in your home directory:
-```bash
-    git clone https://github.com/apache/incubator-mxnet.git --recursive
-    cd incubator-mxnet
-```
-
-Build:
-```bash
-    mkdir -p build && cd build
-    cmake \
-        -DUSE_SSE=OFF \
-        -DUSE_CUDA=OFF \
-        -DUSE_OPENCV=ON \
-        -DUSE_OPENMP=ON \
-        -DUSE_MKL_IF_AVAILABLE=OFF \
-        -DUSE_SIGNAL_HANDLER=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -GNinja ..
-    ninja -j1
-```
-Some compilation units require memory close to 1GB, so it's recommended that you enable swap as
-explained below and be cautious about increasing the number of build jobs (-j).
-
-Executing these commands starts the build process, which can take up to a couple of hours, and creates a file called `libmxnet.so` in the build directory.
-
-If you are getting build errors in which the compiler is being killed, it is likely that the
-compiler is running out of memory (especially if you are on a Raspberry Pi 1, 2 or Zero, which have
-less than 1GB of RAM). This can often be rectified by increasing the swapfile size on the Pi: edit
-the file /etc/dphys-swapfile, change the line CONF_SWAPSIZE=100 to CONF_SWAPSIZE=1024, and then run:
-```bash
-  sudo /etc/init.d/dphys-swapfile stop
-  sudo /etc/init.d/dphys-swapfile start
-  free -m # to verify the swapfile size has been increased
-```
-
-**Step 2** Install MXNet Python Bindings
-
-To install the Python bindings, run the following commands in the MXNet directory:
-
-```bash
-    cd python
-    pip install --upgrade pip
-    pip install -e .
-```
-
-Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the installed package.
-
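-As a quick sanity check that the editable install resolves to your source tree, you can, for example, import the package and print where Python loaded it from (a minimal sketch; the exact path depends on where you cloned the repository):
-
-```python
-import mxnet as mx
-
-# With an editable install, the module should resolve to the source
-# checkout rather than to site-packages.
-print(mx.__file__)     # e.g. .../incubator-mxnet/python/mxnet/__init__.py
-print(mx.__version__)
-```
-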
-Alternatively, you can create a wheel (`whl`) package installable with pip using the following command:
-```bash
-ci/docker/runtime_functions.sh build_wheel python/ $(realpath build)
-```
-
-
-You are now ready to run MXNet on your Raspberry Pi device. You can get started by following the tutorial on [Real-time Object Detection with MXNet On The Raspberry Pi](http://mxnet.io/tutorials/embedded/wine_detector.html).
-
-*Note - Because the complete MXNet library takes up a significant amount of the Raspberry Pi's limited RAM, when loading training data or large models into memory, you might have to turn off the GUI and terminate running processes to free RAM.*
-
-</div> <!-- End of raspberry pi -->
-
-
-<div class="nvidia-jetson">
-
-# Nvidia Jetson TX family
-
-MXNet supports Ubuntu AArch64 based operating systems, so you can run MXNet on NVIDIA Jetson devices.
-
-These instructions will walk through how to build MXNet for the Pascal-based [NVIDIA Jetson TX2](http://www.nvidia.com/object/embedded-systems-dev-kits-modules.html) and install the corresponding Python language bindings.
-
-For the purposes of this install guide we will assume that CUDA is already installed on your Jetson device.
-
-**Install MXNet**
-
-Installing MXNet is a two-step process:
-
-1. Build the shared library from the MXNet C++ source code.
-2. Install the supported language-specific packages for MXNet.
-
-**Step 1** Build the Shared Library
-
-You need the following additional dependencies:
-
-- Git (to pull code from GitHub)
-
-- libatlas (for linear algebraic operations)
-
-- libopencv (for computer vision operations)
-
-- python pip (to install relevant Python packages for our language bindings)
-
-Install these dependencies using the following commands in any directory:
-
-```bash
-    sudo apt-get update
-    sudo apt-get -y install git build-essential libatlas-base-dev libopencv-dev graphviz python-pip
-    sudo pip install pip --upgrade
-    sudo pip install setuptools numpy --upgrade
-    sudo pip install graphviz jupyter
-```
-
-Clone the MXNet source code repository using the following `git` command in your home directory:
-```bash
-    git clone https://github.com/apache/incubator-mxnet.git --recursive
-    cd incubator-mxnet
-```
-
-Copy the Jetson Makefile configuration so that MXNet is built with CUDA bindings and can leverage the GPU on the Jetson:
-```bash
-    cp make/crosscompile.jetson.mk config.mk
-```
-
-To ensure MXNet builds with Pascal's hardware-level low-precision acceleration, edit 3rdparty/mshadow/make/mshadow.mk and add the following after line 122:
-```bash
-MSHADOW_CFLAGS += -DMSHADOW_USE_PASCAL=1
-```
-
-Now you can build the complete MXNet library with the following command:
-```bash
-    make -j $(nproc)
-```
-
-Executing this command creates a file called `libmxnet.so` in the mxnet/lib directory.
-
-**Step 2** Install MXNet Python Bindings
-
-To install the Python bindings, run the following commands in the MXNet directory:
-
-```bash
-    cd python
-    pip install --upgrade pip
-    pip install -e .
-```
-
-Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the installed package.
-
-Add the MXNet folder to the Python path:
-
-```bash
-    cd ..
-    export MXNET_HOME=$(pwd)
-    echo "export PYTHONPATH=$MXNET_HOME/python:$PYTHONPATH" >> ~/.bashrc
-    source ~/.bashrc
-```
-
-You are now ready to run MXNet on your NVIDIA Jetson TX2 device.
-
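-As a quick smoke test of the CUDA-enabled build, you can run a small computation on the GPU (a minimal sketch, mirroring the validation example used elsewhere in this guide):
-
-```python
-import mxnet as mx
-
-# Allocate a small array on the GPU and compute with it; this fails
-# if the CUDA build, driver, or bindings are not set up correctly.
-a = mx.nd.ones((2, 3), mx.gpu())
-b = a * 2 + 1
-print(b.asnumpy())
-```
-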
-</div> <!-- End of jetson -->
-</div> <!-- End of devices -->
-
-
-<!-- This # tag restarts the page and allows reuse
- of the div classes for validation sections, etc -->
-
-
-# Validate MXNet Installation
-
-<div class="linux macos">
-<div class="python">
-<div class="cpu">
-
-<div class="pip build-from-source">
-
-Start the Python terminal.
-
-```bash
-$ python
-```
-</div>
-
-<div class="docker">
-
-Launch a Docker container with the `mxnet/python` image and run an example *MXNet* Python program in the terminal.
-
-```bash
-$ docker run -it mxnet/python bash # Use sudo if you skip Step 2 in the installation instruction
-
-# Start a python terminal
-root@4919c4f58cac:/# python
-```
-</div>
-
-<div class="virtualenv">
-
-Activate the virtualenv environment created for *MXNet*.
-
-```bash
-$ source ~/mxnet/bin/activate
-```
-
-After activating the environment, you should see a prompt like the one below.
-
-```bash
-(mxnet)$
-```
-
-Start the Python terminal.
-
-```bash
-$ python
-```
-
-</div>
-
-Run a short *MXNet* Python program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
-
-```python
->>> import mxnet as mx
->>> a = mx.nd.ones((2, 3))
->>> b = a * 2 + 1
->>> b.asnumpy()
-array([[ 3.,  3.,  3.],
-       [ 3.,  3.,  3.]], dtype=float32)
-```
-</div><!-- linux macos -->
-</div><!-- python -->
-</div><!-- cpu -->
-
-<!-- Validate Windows CPU pip install -->
-
-<div class="windows">
-<div class="python">
-<div class="cpu">
-<div class="pip">
-
-Run a short *MXNet* Python program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
-
-```python
->>> import mxnet as mx
->>> a = mx.nd.ones((2, 3))
->>> b = a * 2 + 1
->>> b.asnumpy()
-array([[ 3.,  3.,  3.],
-       [ 3.,  3.,  3.]], dtype=float32)
-```
-
-</div>
-</div>
-</div>
-</div>
-
-<!-- Mac OS GPU installation validation -->
-
-<div class="macos">
-<div class="python">
-<div class="gpu">
-
-<div class="pip virtualenv docker">
-</br>
-
-Will be available soon.
-
-</div>
-
-<div class="build-from-source">
-</br>
-
-From the MXNet root directory, run `python example/image-classification/train_mnist.py --network lenet --gpus 0` to test GPU training.
-
-</div>
-
-</div>
-</div>
-</div>
-
-<!-- Windows GPU installation validation -->
-
-<div class="windows">
-<div class="python">
-<div class="gpu">
-
-<div class="virtualenv docker">
-</br>
-
-Will be available soon.
-
-</div>
-
-<div class="pip build-from-source">
-</br>
-
-From the MXNet root directory, run `python example/image-classification/train_mnist.py --network lenet --gpus 0` to test GPU training.
-
-</div>
-
-</div><!-- windows -->
-</div><!-- python -->
-</div><!-- gpu -->
-
-<!-- Validation for GPU machines -->
-
-<div class="linux">
-<div class="python">
-<div class="gpu">
-
-<div class="pip build-from-source">
-
-Start the python terminal.
-
-```bash
-$ python
-```
-</div>
-
-<div class="docker">
-
-Launch an NVIDIA Docker container with the `mxnet/python:gpu` image and run an example *MXNet* Python program in the terminal.
-
-```bash
-$ nvidia-docker run -it mxnet/python:gpu bash # Use sudo if you skip Step 2 in the installation instruction
-
-# Start a python terminal
-root@4919c4f58cac:/# python
-```
-</div>
-
-<div class="virtualenv">
-
-Activate the virtualenv environment created for *MXNet*.
-
-```bash
-$ source ~/mxnet/bin/activate
-```
-
-After activating the environment, you should see a prompt like the one below.
-
-```bash
-(mxnet)$
-```
-
-Start the Python terminal.
-
-```bash
-$ python
-```
-
-</div>
-
-Run a short *MXNet* Python program to create a 2x3 matrix of ones *a* on a *GPU*, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3. We use *mx.gpu()* to set the *MXNet* context to the GPU.
-
-```python
->>> import mxnet as mx
->>> a = mx.nd.ones((2, 3), mx.gpu())
->>> b = a * 2 + 1
->>> b.asnumpy()
-array([[ 3.,  3.,  3.],
-       [ 3.,  3.,  3.]], dtype=float32)
-```
-</div><!-- linux -->
-</div><!-- python -->
-</div><!-- gpu -->
-
-
-
-
-
-
-<!-- Linux Clean up -->
-<div class="linux">
-<div class="python">
-<div class="cpu">
-
-<div class="pip build-from-source">
-
-Exit the Python terminal.
-
-```python
->>> exit()
-$
-```
-</div>
-
-<div class="virtualenv">
-
-Exit the Python terminal and deactivate the virtualenv *MXNet* environment.
-```python
->>> exit()
-(mxnet)$ deactivate
-$
-```
-
-</div>
-
-<div class="docker">
-
-Exit the Python terminal and the mxnet/python Docker container.
-```python
->>> exit()
-root@4919c4f58cac:/# exit
-```
-
-</div>
-
-</div>
-</div>
-</div>
-
-<!-- MacOS Clean up -->
-<div class="macos">
-<div class="python">
-<div class="cpu">
-
-<div class="pip build-from-source">
-
-Exit the Python terminal.
-
-```python
->>> exit()
-$
-```
-</div>
-
-<div class="virtualenv">
-
-Exit the Python terminal and deactivate the virtualenv *MXNet* environment.
-```python
->>> exit()
-(mxnet)$ deactivate
-$
-```
-
-</div>
-
-<div class="docker">
-
-Exit the Python terminal and then the Docker container.
-```python
->>> exit()
-root@4919c4f58cac:/# exit
-```
-
-</div>
-
-</div>
-</div>
-</div>
-
-<!-- Validation for cloud installation -->
-
-<div class="cloud">
-
-Log in to the cloud instance you launched with pre-installed *MXNet*, following the guide from the corresponding cloud provider.
-
-
-Start the Python terminal.
-
-```bash
-$ python
-```
-<!-- Example Python code for CPU -->
-
-<div class="cpu">
-
-Run a short *MXNet* Python program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
-
-```python
->>> import mxnet as mx
->>> a = mx.nd.ones((2, 3))
->>> b = a * 2 + 1
->>> b.asnumpy()
-array([[ 3.,  3.,  3.],
-       [ 3.,  3.,  3.]], dtype=float32)
-```
-
-Exit the Python terminal.
-
-```python
->>> exit()
-$
-```
-
-</div>
-
-<!-- Example Python code for CPU -->
-
-<div class="gpu">
-
-Run a short *MXNet* Python program to create a 2x3 matrix of ones *a* on a *GPU*, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3. We use *mx.gpu()* to set the *MXNet* context to the GPU.
-
-```python
->>> import mxnet as mx
->>> a = mx.nd.ones((2, 3), mx.gpu())
->>> b = a * 2 + 1
->>> b.asnumpy()
-array([[ 3.,  3.,  3.],
-       [ 3.,  3.,  3.]], dtype=float32)
-```
-
-</div>
-
-</div>
-
-<!-- Example R code for CPU -->
-
-<div class="linux macos windows">
-<div class="r">
-<div class="cpu">
-
-Run a short *MXNet* R program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
-
-```r
-library(mxnet)
-a <- mx.nd.ones(c(2,3), ctx = mx.cpu())
-b <- a * 2 + 1
-b
-```
-
-You should see the following output:
-
-```r
-     [,1] [,2] [,3]
-[1,]    3    3    3
-[2,]    3    3    3
-```
-
-</div>
-</div>
-</div>
-
-<!-- Example R code for GPU -->
-
-<div class="linux macos windows">
-<div class="r">
-<div class="gpu">
-
-Run a short *MXNet* R program to create a 2x3 matrix of ones *a* on a *GPU*, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3. We use *mx.gpu()* to set the *MXNet* context to the GPU.
-
-```r
-library(mxnet)
-a <- mx.nd.ones(c(2,3), ctx = mx.gpu())
-b <- a * 2 + 1
-b
-```
-
-You should see the following output:
-
-```r
-     [,1] [,2] [,3]
-[1,]    3    3    3
-[2,]    3    3    3
-```
-
-</div>
-</div>
-</div>
-
-
-
-<div class="linux">
-<div class="scala">
-
-<div class="cpu gpu">
-      Run the <a href="https://github.com/apache/incubator-mxnet/tree/master/scala-package/mxnet-demo">MXNet-Scala demo project</a> to validate your Maven package installation.
-</div>
-
-</div><!-- scala -->
-
-<div class="julia perl cpp">
-<div class="cpu gpu">
-
-Will be available soon.
-
-</div><!-- cpu gpu -->
-</div><!-- julia perl cpp -->
-</div><!-- linux -->
-
-<div class="macos">
-<div class="scala">
-<div class="cpu gpu">
-      Run the <a href="https://github.com/apache/incubator-mxnet/tree/master/scala-package/mxnet-demo">MXNet-Scala demo project</a> to validate your Maven package installation.
-</div><!-- cpu gpu-->
-</div><!-- scala -->
-<div class="julia perl cpp">
-<div class="cpu gpu">
-
-Will be available soon.
-
-</div><!-- cpu gpu -->
-</div><!-- julia perl cpp -->
-</div><!-- macos -->
-
-<!-- Windows MXNet Installation validation -->
-<div class="windows">
-<div class="python">
-<div class="cpu">
-
-<div class="build-from-source virtualenv docker">
-<br/>
-Will be available soon.
-</div>
-
-</div>
-</div>
-
-<div class="scala julia perl cpp">
-<div class="cpu gpu">
-
-Will be available soon.
-
-</div>
-</div>
-</div>
-<!-- End Windows Installation validation -->
-
-<br/>
 <!-- Download -->
+<hr>
 
 # Source Download
 
-<a href="download.html">Download</a> your required version of MXNet.
+<a href="download.html">Download</a> your required version of MXNet and <a href="build_from_source.html">build from source</a>.
diff --git a/docs/install/osx_setup.md b/docs/install/osx_setup.md
index b90dfd1e582..53039252888 100644
--- a/docs/install/osx_setup.md
+++ b/docs/install/osx_setup.md
@@ -102,11 +102,22 @@ If building with ```GPU``` support, add the following configuration to config.mk
 &nbsp;
 
 We have installed the MXNet core library. Next, we will install the MXNet interface package for the programming language of your choice:
+- [Python](#install-mxnet-for-python)
 - [R](#install-the-mxnet-package-for-r)
 - [Julia](#install-the-mxnet-package-for-julia)
 - [Scala](#install-the-mxnet-package-for-scala)
 - [Perl](#install-the-mxnet-package-for-perl)
 
+## Install MXNet for Python
+To install the MXNet Python binding, navigate to the root of the MXNet folder and run the following:
+
+```bash
+$ cd python
+$ pip install -e .
+```
+
+Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the installed package.
+
 ## Install the MXNet Package for R
 You have 2 options:
 1. Building MXNet with the Prebuilt Binary Package
diff --git a/docs/install/ubuntu_setup.md b/docs/install/ubuntu_setup.md
index 13280b58573..432310dd763 100644
--- a/docs/install/ubuntu_setup.md
+++ b/docs/install/ubuntu_setup.md
@@ -115,8 +115,8 @@ You can build MXNet from source, and then you have the option of installing lang
 
 ### Build the Shared Library
 
-#### Quick MXNet Installation
-You can quickly build MXNet with the following script found in the `/docs/install` folder:
+#### Quick MXNet Build
+You can quickly build MXNet from source with the following script found in the `/docs/install` folder:
 
 ```bash
 cd docs/install
@@ -127,6 +127,8 @@ Or you can go through a manual process described next.
 
 #### Manual MXNet Installation
 
+It is recommended that you review the general [build from source](build_from_source.html) instructions before continuing.
+
 On Ubuntu versions 16.04 or later, you need the following dependencies:
 
 **Step 1:** Install build tools and git.
@@ -135,14 +137,18 @@ On Ubuntu versions 16.04 or later, you need the following dependencies:
     sudo apt-get install -y build-essential git
 ```
 
-**Step 2:** Install OpenBLAS.
+**Step 2:** Install a Math Library.
+
+Details on the different math libraries are found in the build from source guide's [Math Library Selection](build_from_source.html#math-library-selection) section.
 
-*MXNet* uses [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) library for accelerated numerical computations on CPU machine. There are several flavors of BLAS libraries - [OpenBLAS](http://www.openblas.net/), [ATLAS](http://math-atlas.sourceforge.net/) and [MKL](https://software.intel.com/en-us/intel-mkl). In this step we install OpenBLAS. You can choose to install ATLAS or MKL.
+For OpenBLAS use:
 
 ```bash
     sudo apt-get install -y libopenblas-dev
 ```
 
+For other libraries, visit the [Math Library Selection](build_from_source.html#math-library-selection) section.
+
 **Step 3:** Install OpenCV.
 
 *MXNet* uses [OpenCV](http://opencv.org/) for efficient image loading and augmentation operations.
@@ -153,7 +159,7 @@ On Ubuntu versions 16.04 or later, you need the following dependencies:
 
 **Step 4:** Download MXNet sources and build MXNet core shared library.
 
-If building on CPU:
+If building on CPU and using OpenBLAS:
 
 ```bash
     git clone --recursive https://github.com/apache/incubator-mxnet.git
@@ -161,7 +167,7 @@ If building on CPU:
     make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas
 ```
 
-If building on GPU (make sure you have installed the [CUDA dependencies first](#cuda-dependencies)):
+If building on GPU and you want OpenCV and OpenBLAS (make sure you have installed the [CUDA dependencies first](#cuda-dependencies)):
 
 ```bash
     git clone --recursive https://github.com/apache/incubator-mxnet.git
@@ -169,38 +175,119 @@ If building on GPU (make sure you have installed the [CUDA dependencies first](#
     make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1
 ```
 
-*Note* - USE_OPENCV and USE_BLAS are make file flags to set compilation options to use OpenCV and BLAS library. You can explore and use more compilation options in `make/config.mk`.
+*Note* - USE_OPENCV and USE_BLAS are makefile flags that set compilation options to use OpenCV and the BLAS library. You can explore and use more compilation options in `make/config.mk` and also review common [usage examples](build_from_source.html#usage-examples).
 
-Executing these commands creates a library called ```libmxnet.so```.
+Building from source creates a library called ```libmxnet.so``` in the `lib` folder in your MXNet project root.
 
-Next, you may optionally install the ```graphviz``` library, which is used for visualizing the network graphs you build with MXNet. You may also install [Jupyter Notebook](http://jupyter.readthedocs.io/), which is used for running MXNet tutorials and examples.
+You may also want to add the MXNet shared library to your `LD_LIBRARY_PATH`:
 
 ```bash
-    sudo apt-get install -y python-pip
-    sudo pip install graphviz
-    sudo pip install jupyter
+export LD_LIBRARY_PATH=~/incubator-mxnet/lib
 ```
+
+After building the MXNet library, you may install language bindings.
+
 <hr>
 
 
 ## Installing Language Packages for MXNet
 
 After you have installed the MXNet core library, you may install MXNet interface packages for the programming language of your choice:
-- [Scala](#install-the-mxnet-package-for-scala)
-- [R](#install-the-mxnet-package-for-r)
+- [Python](#install-mxnet-for-python)
+- [C++](#install-the-mxnet-package-for-c&plus;&plus;)
+- [Clojure](#install-the-mxnet-package-for-clojure)
 - [Julia](#install-the-mxnet-package-for-julia)
 - [Perl](#install-the-mxnet-package-for-perl)
+- [R](#install-the-mxnet-package-for-r)
+- [Scala](#install-the-mxnet-package-for-scala)
 
+<hr>
 
-### Install the MXNet Package for Scala
+### Install MXNet for Python
 
-To use the MXNet-Scala package, you can acquire the Maven package as a dependency.
+To install the MXNet Python binding, navigate to the root of the MXNet folder and run the following:
 
-Further information is in the [MXNet-Scala Setup Instructions](scala_setup.html).
+```bash
+$ cd python
+$ pip install -e .
+```
 
-If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on IntelliJ tutorial](../tutorials/scala/mxnet_scala_on_intellij.html) instead.
+Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the installed package.
+
+#### Optional Python Packages
+
+You may optionally install the ```graphviz``` library, which is used for visualizing the network graphs you build with MXNet. You may also install [Jupyter Notebook](http://jupyter.readthedocs.io/), which is used for running MXNet tutorials and examples.
+
+```bash
+sudo pip install graphviz
+sudo pip install jupyter
+```
+<hr>
+
+
+### Install the MXNet Package for C++
+
+Refer to the [C++ Package setup guide](c_plus_plus.html).
+<hr>
+
+
+### Install the MXNet Package for Clojure
+
+Refer to the [Clojure setup guide](https://github.com/apache/incubator-mxnet/tree/master/contrib/clojure-package).
+<hr>
+
+
+### Install the MXNet Package for Julia
+
+The MXNet package for Julia is hosted in a separate repository, MXNet.jl, which is available on [GitHub](https://github.com/dmlc/MXNet.jl). To use the Julia binding with an existing libmxnet installation, set the ```MXNET_HOME``` environment variable by running the following command:
+
+```bash
+    export MXNET_HOME=/<path to>/libmxnet
+```
+
+The path to the existing libmxnet installation should be the root directory of libmxnet. In other words, you should be able to find the ```libmxnet.so``` file at ```$MXNET_HOME/lib```. For example, if the root directory of libmxnet is ```~/libmxnet```, you would run the following command:
+
+```bash
+    export MXNET_HOME=~/libmxnet
+```
+
+You might want to add this command to your ```~/.bashrc``` file. If you do, you can install the Julia package in the Julia console using the following command:
+
+```julia
+    Pkg.add("MXNet")
+```
+
+For more details about installing and using MXNet with Julia, see the [MXNet Julia documentation](http://dmlc.ml/MXNet.jl/latest/user-guide/install/).
+<hr>
+
+
+### Install the MXNet Package for Perl
+
+Before you build MXNet for Perl from source code, you must complete [building the shared library](#build-the-shared-library). After you build the shared library, run the following commands from the MXNet source root directory to build the MXNet Perl package:
+
+```bash
+    sudo apt-get install libmouse-perl pdl cpanminus swig libgraphviz-perl
+    cpanm -q -L "${HOME}/perl5" Function::Parameters Hash::Ordered PDL::CCS
+
+    MXNET_HOME=${PWD}
+    export LD_LIBRARY_PATH=${MXNET_HOME}/lib
+    export PERL5LIB=${HOME}/perl5/lib/perl5
+
+    cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/
+    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
+    make install
+
+    cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/
+    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
+    make install
+
+    cd ${MXNET_HOME}/perl-package/AI-MXNet/
+    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
+    make install
+```
 <hr>
 
+
 ### Install the MXNet Package for R
 
 Building *MXNet* from source is a 2 step process.
@@ -291,69 +378,19 @@ You should see the following output:
 [2,]    3    3    3
 > quit()
 ```
-
 <hr>
 
 
-### Install the MXNet Package for Julia
-
-The MXNet package for Julia is hosted in a separate repository, MXNet.jl, which is available on [GitHub](https://github.com/dmlc/MXNet.jl). To use the Julia binding with an existing libmxnet installation, set the ```MXNET_HOME``` environment variable by running the following command:
-
-```bash
-    export MXNET_HOME=/<path to>/libmxnet
-```
-
-The path to the existing libmxnet installation should be the root directory of libmxnet. In other words, you should be able to find the ```libmxnet.so``` file at ```$MXNET_HOME/lib```. For example, if the root directory of libmxnet is ```~/libmxnet```, you would run the following command:
-
-```bash
-    export MXNET_HOME=~/libmxnet
-```
-
-You might want to add this command to your ```~/.bashrc``` file. If you do, you can install the Julia package in the Julia console using the following command:
-
-```julia
-    Pkg.add("MXNet")
-```
-
-For more details about installing and using MXNet with Julia, see the [MXNet Julia documentation](http://dmlc.ml/MXNet.jl/latest/user-guide/install/).
-<hr>
-
-
-## Install the MXNet Package for Scala
+### Install the MXNet Package for Scala
 
 To use the MXNet-Scala package, you can acquire the Maven package as a dependency.
 
-Further information is in the [MXNet-Scala Setup Instructions](./scala_setup.md).
-
-If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on IntelliJ tutorial](../tutorials/scala/mxnet_scala_on_intellij.md) instead.
-
-
-### Install the MXNet Package for Perl
-
-Before you build MXNet for Perl from source code, you must complete [building the shared library](#build-the-shared-library). After you build the shared library, run the following commands from the MXNet source root directory to build the MXNet Perl package:
-
-```bash
-    sudo apt-get install libmouse-perl pdl cpanminus swig libgraphviz-perl
-    cpanm -q -L "${HOME}/perl5" Function::Parameters Hash::Ordered PDL::CCS
-
-    MXNET_HOME=${PWD}
-    export LD_LIBRARY_PATH=${MXNET_HOME}/lib
-    export PERL5LIB=${HOME}/perl5/lib/perl5
-
-    cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
-
-    cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
+Further information is in the [MXNet-Scala Setup Instructions](scala_setup.html).
 
-    cd ${MXNET_HOME}/perl-package/AI-MXNet/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make install
-```
+If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on IntelliJ tutorial](../tutorials/scala/mxnet_scala_on_intellij.html) instead.
 <hr>
 
+
 ## Contributions
 
 You are more than welcome to contribute easy installation scripts for other operating systems and programming languages. See the [community contributions page](../community/contribute.html) for further information.
diff --git a/docs/install/validate_mxnet.md b/docs/install/validate_mxnet.md
new file mode 100644
index 00000000000..a4cf5446f60
--- /dev/null
+++ b/docs/install/validate_mxnet.md
@@ -0,0 +1,185 @@
+# Validate Your MXNet Installation
+
+- [Python](#python)
+- [Python with GPU](#python-with-gpu)
+- [Verify GPU training](#verify-gpu-training)
+- [Virtualenv](#virtualenv)
+- [Docker with CPU](#docker-with-cpu)
+- [Docker with GPU](#docker-with-gpu)
+- [Cloud](#cloud)
+- [C++](#alternative-language-bindings)
+- [Clojure](#clojure)
+- [Julia](#julia)
+- [Perl](#perl)
+- [R](#r)
+- [Scala](#scala)
+
+
+## Python
+
+Start the Python terminal.
+
+```bash
+$ python
+```
+
+Run a short *MXNet* Python program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
+
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.ones((2, 3))
+>>> b = a * 2 + 1
+>>> b.asnumpy()
+array([[ 3.,  3.,  3.],
+       [ 3.,  3.,  3.]], dtype=float32)
+```
+
+
+## Python with GPU
+
+This is similar to the previous example, but this time we use *mx.gpu()* to set the *MXNet* context to the GPU.
+
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.ones((2, 3), mx.gpu())
+>>> b = a * 2 + 1
+>>> b.asnumpy()
+array([[ 3.,  3.,  3.],
+       [ 3.,  3.,  3.]], dtype=float32)
+```
+
+
+## Verify GPU Training
+
+From the MXNet root directory, run `python example/image-classification/train_mnist.py --network lenet --gpus 0` to test GPU training.
+
+
+## Virtualenv
+
+Activate the virtualenv environment created for *MXNet*.
+
+```bash
+$ source ~/mxnet/bin/activate
+```
+
+After activating the environment, you should see a prompt like the one below.
+
+```bash
+(mxnet)$
+```
+
+Start the Python terminal.
+
+```bash
+$ python
+```
+
+Run the previous Python example.
+
+
+## Docker with CPU
+
+Launch a Docker container with the `mxnet/python` image and run an example *MXNet* Python program in the terminal.
+
+```bash
+$ docker run -it mxnet/python bash # Use sudo if you skip Step 2 in the installation instruction
+
+# Start a python terminal
+root@4919c4f58cac:/# python
+```
+
+Run the previous Python example.
+
+
+## Docker with GPU
+
+Launch an NVIDIA Docker container with the `mxnet/python:gpu` image and run an example *MXNet* Python program in the terminal.
+
+```bash
+$ nvidia-docker run -it mxnet/python:gpu bash # Use sudo if you skip Step 2 in the installation instruction
+
+# Start a python terminal
+root@4919c4f58cac:/# python
+```
+
+Run the previous Python example, as well as the previous GPU examples.
+
+
+## Cloud
+
+Log in to the cloud instance you launched with pre-installed *MXNet*, following the guide from the corresponding cloud provider.
+
+Start the Python terminal.
+
+```bash
+$ python
+```
+
+Run the previous Python example, and for GPU instances run the previous GPU example.
+
+
+## Alternative Language Bindings
+
+### C++
+
+Please contribute an example!
+
+
+### Clojure
+
+Please contribute an example!
+
+
+### Julia
+
+Please contribute an example!
+
+
+### Perl
+
+Please contribute an example!
+
+
+### R
+
+Run a short *MXNet* R program to create a 2x3 matrix of ones, multiply each element in the matrix by 2, and then add 1. We expect the output to be a 2x3 matrix with all elements equal to 3.
+
+```r
+library(mxnet)
+a <- mx.nd.ones(c(2,3), ctx = mx.cpu())
+b <- a * 2 + 1
+b
+```
+
+You should see the following output:
+
+```r
+     [,1] [,2] [,3]
+[1,]    3    3    3
+[2,]    3    3    3
+```
+
+
+#### R with GPU
+
+This is similar to the previous example, but this time we use *mx.gpu()* to set the *MXNet* context to the GPU.
+
+```r
+library(mxnet)
+a <- mx.nd.ones(c(2,3), ctx = mx.gpu())
+b <- a * 2 + 1
+b
+```
+
+You should see the following output:
+
+```r
+     [,1] [,2] [,3]
+[1,]    3    3    3
+[2,]    3    3    3
+```
+
+
+### Scala
+
+Run the <a href="https://github.com/apache/incubator-mxnet/tree/master/scala-package/mxnet-demo">MXNet-Scala demo project</a> to validate your Maven package installation.
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md
index 40ddeb8182d..99ce7f63e85 100755
--- a/docs/install/windows_setup.md
+++ b/docs/install/windows_setup.md
@@ -91,7 +91,7 @@ Done! We have installed MXNet with Python interface. Run below commands to verif
 ```
 We actually did a small tensor computation using MXNet! You are all set with MXNet on your Windows machine.
 
-## Install MXNet Package for R
+## Install the MXNet Package for R
 MXNet for R is available for both CPUs and GPUs.
 
 ### Installing MXNet on a Computer with a CPU Processor
@@ -151,8 +151,8 @@ These dlls can be found in `prebuildbase_win10_x64_vc14/3rdparty`, `mxnet_x64_vc
     ├── dmlc
     ├── mxnet
     ├── mshadow
-    └── nnvm 
-    
+    └── nnvm
+
 ```
 6. Make sure that R executable is added to your ```PATH``` in the environment variables. Running the ```where R``` command at the command prompt should return the location.
 7. Also make sure that Rtools is installed and the executable is added to your ```PATH``` in the environment variables.
@@ -200,7 +200,7 @@ To install MXNet on a computer with a GPU processor, choose from two options:
 * Build the library from source code
 
 However, a few dependencies remain for both options.  You will need the following:
-* Install [Nvidia-drivers](http://www.nvidia.com/Download/index.aspx?lang=en-us) if not installed. Latest driver based on your system configuration is recommended. 
+* Install [Nvidia-drivers](http://www.nvidia.com/Download/index.aspx?lang=en-us) if not installed. The latest driver for your system configuration is recommended.
 
 * Install [Microsoft Visual Studio](https://visualstudio.microsoft.com/downloads/) (VS2015 or VS2017 is required by CUDA)
 
@@ -224,7 +224,7 @@ For GPU package:
 ```
 Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
 #### Building MXNet from Source Code(GPU)
-After you have installed above software, continue with the following steps to build MXNet-R: 
+After you have installed the above software, continue with the following steps to build MXNet-R:
 1. Clone the MXNet github repo.
 
 ```sh
@@ -261,8 +261,8 @@ These dlls can be found in `prebuildbase_win10_x64_vc14/3rdparty`, `mxnet_x64_vc
     ├── dmlc
     ├── mxnet
     ├── mshadow
-    └── nnvm 
-    
+    └── nnvm
+
 ```
 6. Make sure that R executable is added to your ```PATH``` in the environment variables. Running the ```where R``` command at the command prompt should return the location.
 7. Also make sure that Rtools is installed and the executable is added to your ```PATH``` in the environment variables.
diff --git a/docs/settings.ini b/docs/settings.ini
index f999b3efde2..b8e486e58e8 100644
--- a/docs/settings.ini
+++ b/docs/settings.ini
@@ -14,7 +14,7 @@ r_docs = 0
 scala_docs = 1
 
 [document_sets_v1.2.0]
-clojure_docs = 1
+clojure_docs = 0
 doxygen_docs = 1
 r_docs = 0
 scala_docs = 1
diff --git a/docs/tutorials/basic/index.md b/docs/tutorials/basic/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/basic/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/c++/index.md b/docs/tutorials/c++/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/c++/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/control_flow/ControlFlowTutorial.md b/docs/tutorials/control_flow/ControlFlowTutorial.md
new file mode 100644
index 00000000000..9e4c66f8521
--- /dev/null
+++ b/docs/tutorials/control_flow/ControlFlowTutorial.md
@@ -0,0 +1,388 @@
+# Hybridize Gluon models with control flows
+
+MXNet currently provides three control flow operators: `cond`, `foreach` and `while_loop`. Like other MXNet operators, they all have a version for NDArray and a version for Symbol. These two versions have exactly the same semantics. We can take advantage of this and use them in Gluon to hybridize models.
+
+In this tutorial, we use a few examples to demonstrate the use of control flow operators in Gluon and show how a model that requires control flow is hybridized.
+
+## Preparing to run the code
+
+
+```python
+import mxnet as mx
+from mxnet.gluon import HybridBlock
+```
+
+## foreach
+`foreach` is a for loop that iterates over the first dimension of the input data (it can be an array or a list of arrays). It is defined with the following signature:
+
+```python
+foreach(body, data, init_states, name) => (outputs, states)
+```
+
+It runs the Python function defined in `body` for every slice from the input arrays. The signature of the `body` function is defined as follows:
+
+```python
+body(data, states) => (outputs, states)
+```
+
+The inputs of the `body` function have two parts: `data` is a slice of an array (if there is only one input array in `foreach`) or a list of slices (if there is a list of input arrays); `states` are the arrays from the previous iteration. The outputs of the `body` function also have two parts: `outputs` is an array or a list of arrays; `states` holds the computation states of the current iteration. `outputs` from all iterations are concatenated as the outputs of `foreach`.
+
+The following pseudocode illustrates the execution of `foreach`.
+
+```python
+def foreach(body, data, init_states):
+    states = init_states
+    outs = []
+
+    for i in range(data.shape[0]):
+        s = data[i]
+        out, states = body(s, states)
+        outs.append(out)
+    outs = mx.nd.stack(*outs)
+    return outs, states
+```
+
+### Example 1: `foreach` works like map
+`foreach` can work like a map function of a functional language. In this case, the states of `foreach` can be an empty list, which means the computation doesn't carry state across iterations.
+
+In this example, we use `foreach` to increase the value of each element of an array by one.
+
+
+```python
+data = mx.nd.arange(5)
+print(data)
+```
+
+    
+    [ 0.  1.  2.  3.  4.]
+    <NDArray 5 @cpu(0)>
+
+
+
+```python
+def add1(data, _):
+    return data + 1, []
+
+class Map(HybridBlock):
+    def hybrid_forward(self, F, data):
+        out, _ = F.contrib.foreach(add1, data, [])
+        return out
+    
+map_layer = Map()
+out = map_layer(data)
+print(out)
+```
+
+    
+    [[ 1.]
+     [ 2.]
+     [ 3.]
+     [ 4.]
+     [ 5.]]
+    <NDArray 5x1 @cpu(0)>
+
+
+We can hybridize the block and run the computation again. It should generate the same result.
+
+
+```python
+map_layer.hybridize()
+out = map_layer(data)
+print(out)
+```
+
+    
+    [[ 1.]
+     [ 2.]
+     [ 3.]
+     [ 4.]
+     [ 5.]]
+    <NDArray 5x1 @cpu(0)>
+
+
+### Example 2: `foreach` works like scan
+`foreach` can work like a scan function in a functional language. In this case, the `outputs` returned by the Python function are an empty list.
+
+
+```python
+def sum(data, state):
+    return [], state + data
+
+class Scan(HybridBlock):
+    def hybrid_forward(self, F, data):
+        _, state = F.contrib.foreach(sum, data, F.zeros((1)))
+        return state
+scan_layer = Scan()
+state = scan_layer(data)
+print(data)
+print(state)
+```
+
+    
+    [ 0.  1.  2.  3.  4.]
+    <NDArray 5 @cpu(0)>
+    
+    [ 10.]
+    <NDArray 1 @cpu(0)>
+
+
+
+```python
+scan_layer.hybridize()
+state = scan_layer(data)
+print(state)
+```
+
+    
+    [ 10.]
+    <NDArray 1 @cpu(0)>
+
+
+### Example 3: `foreach` with both outputs and states
+This is probably the most common use case of `foreach`. We extend the previous scan example and return both outputs and states.
+
+
+```python
+def sum(data, state):
+    return state + data, state + data
+
+class ScanV2(HybridBlock):
+    def hybrid_forward(self, F, data):
+        out, state = F.contrib.foreach(sum, data, F.zeros((1)))
+        return out, state
+scan_layer = ScanV2()
+out, state = scan_layer(data)
+print(out)
+print(state)
+```
+
+    
+    [[  0.]
+     [  1.]
+     [  3.]
+     [  6.]
+     [ 10.]]
+    <NDArray 5x1 @cpu(0)>
+    
+    [ 10.]
+    <NDArray 1 @cpu(0)>
+
+
+
+```python
+scan_layer.hybridize()
+out, state = scan_layer(data)
+print(out)
+print(state)
+```
+
+    
+    [[  0.]
+     [  1.]
+     [  3.]
+     [  6.]
+     [ 10.]]
+    <NDArray 5x1 @cpu(0)>
+    
+    [ 10.]
+    <NDArray 1 @cpu(0)>
+
+
+### Example 4: use `foreach` to run an RNN on a variable-length sequence
+Previous examples illustrate `foreach` with simple use cases. Here we show an example of processing variable-length sequences with `foreach`. The same idea is used by `dynamic_rnn` in TensorFlow for processing variable-length sequences.
+
+
+```python
+class DynamicRNNLayer(HybridBlock):
+    def __init__(self, cell, prefix=None, params=None):
+        super(DynamicRNNLayer, self).__init__(prefix=prefix, params=params)
+        self.cell = cell
+    def hybrid_forward(self, F, inputs, begin_state, valid_length):
+        states = begin_state
+        zeros = []
+        for s in states:
+            zeros.append(F.zeros_like(s))
+        # the last state is the iteration number.
+        states.append(F.zeros((1)))
+        def loop_body(inputs, states):
+            cell_states = states[:-1]
+            # Get the iteration number from the states.
+            iter_no = states[-1]
+            out, new_states = self.cell(inputs, cell_states)
+            # Copy the old state if we have reached the end of a sequence.
+            for i, state in enumerate(cell_states):
+                new_states[i] = F.where(F.broadcast_greater(valid_length, iter_no),
+                                        new_states[i], state)
+            new_states.append(iter_no + 1)
+            return out, new_states
+
+        outputs, states = F.contrib.foreach(loop_body, inputs, states)
+        outputs = F.SequenceMask(outputs, sequence_length=valid_length,
+                                 use_sequence_length=True, axis=0)
+        # the last state is the iteration number. We don't need it.
+        return outputs, states[:-1]
+
+
+seq_len = 10
+batch_size = 2
+input_size = 5
+hidden_size = 6
+
+rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size))
+init_states = [mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) for i in range(2)]
+valid_length = mx.nd.round(mx.nd.random.uniform(low=1, high=10, shape=(batch_size))) 
+
+lstm = DynamicRNNLayer(mx.gluon.rnn.LSTMCell(hidden_size))
+lstm.initialize()
+res, states = lstm(rnn_data, [x for x in init_states], valid_length)
+
+lstm.hybridize()
+res, states = lstm(rnn_data, [x for x in init_states], valid_length)
+```
+
+## while_loop
+`while_loop` defines a while loop. It has the following signature:
+
+```python
+while_loop(cond, body, loop_vars, max_iterations, name) => (outputs, states)
+```
+
+Instead of running over the first dimension of an array, `while_loop` checks a condition function in every iteration and runs a `body` function for computation. The signature of the `body` function is defined as follows:
+
+```python
+body(state1, state2, ...) => (outputs, states)
+```
+
+The inputs of the `body` function in `while_loop` are a little different from those in `foreach`. It has a variable number of input arguments: each input argument is a loop variable, and the number of arguments is determined by the number of loop variables. The outputs of the `body` function also have two parts: `outputs` is an array or a list of arrays; `states` are loop variables and will be passed to the next iteration as inputs of `body`. Like `foreach`, both `outputs` and `states` can be an empty list. `outputs` from all iterations are concatenated as the outputs of `while_loop`.
+
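+By analogy with the pseudocode given for `foreach` above, the execution of `while_loop` can be sketched roughly as follows (a simplification that ignores the `max_iterations` padding described in the note in Example 5):
+
+```python
+def while_loop(cond, body, loop_vars, max_iterations):
+    outs = []
+    while cond(*loop_vars):
+        out, loop_vars = body(*loop_vars)
+        outs.append(out)
+    # outputs from all iterations are stacked, as in foreach
+    outs = mx.nd.stack(*outs)
+    return outs, loop_vars
+```
+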
+### Example 5: scan with while_loop
+`while_loop` is more general than `foreach`. We can also use it to iterate over an array and sum all of its values together. In this example, instead of summing over the entire array, we only sum over the first 4 elements.
+
+**Note**: the shape of the output arrays of the current implementation of `while_loop` is determined by `max_iterations`. As such, even though the while loop in this example runs 4 iterations, it still outputs an array of 5 elements. The last element in the output array is filled with an arbitrary value.
+
+
+```python
+class ScanV2(HybridBlock):
+    def hybrid_forward(self, F, data):
+        def sum(state, i):
+            s = state + data[i]
+            return s, [s, i + 1]
+
+        def sum_cond(state, i):
+            return i < 4
+
+        out, state = F.contrib.while_loop(sum_cond, sum,
+                                          [F.zeros((1)), F.zeros((1))], max_iterations=5)
+        return out, state
+scan_layer = ScanV2()
+out, state = scan_layer(data)
+print(out)
+print(state)
+```
+
+    
+    [[ 0.]
+     [ 1.]
+     [ 3.]
+     [ 6.]
+     [ 0.]]
+    <NDArray 5x1 @cpu(0)>
+    [
+    [ 6.]
+    <NDArray 1 @cpu(0)>, 
+    [ 4.]
+    <NDArray 1 @cpu(0)>]
+
+
+## cond
+`cond` defines an if condition. It has the following signature:
+
+```python
+cond(pred, then_func, else_func, name)
+```
+
+`cond` checks `pred`, which is a symbol or an NDArray with one element. If its value is true, it calls `then_func`. Otherwise, it calls `else_func`. The signatures of `then_func` and `else_func` are as follows:
+
+```python
+func() => [outputs]
+```
+
+`cond` requires that all outputs from `then_func` and `else_func` have the same number of Symbols/NDArrays, with the same shapes and data types.
+
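+Here is a minimal sketch of `cond` in the NDArray flavor before we move to a more realistic example; note that both branches return an array of the same shape and data type, as required:
+
+```python
+a = mx.nd.array([1.])
+b = mx.nd.array([4.])
+
+# pred is an NDArray with one element; then_func/else_func take no arguments
+out = mx.nd.contrib.cond(a < b,
+                         then_func=lambda: a + b,
+                         else_func=lambda: a - b)
+print(out)  # the then branch runs, so out holds [ 5.]
+```
+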
+### Example 6: skip RNN computation with cond
+Example 4 shows how to process a batch with sequences of different lengths. It performs computation for all steps but discards some of the computation results.
+
+In this example, we show how to skip computation after we have reached the end of a sequence, whose length is indicated by `length`. The code below only works for a batch with one sequence.
+
+
+```python
+class SkipRNNCell(HybridBlock):
+    def __init__(self, cell, prefix=None, params=None):
+        super(SkipRNNCell, self).__init__(prefix=prefix, params=params)
+        self.cell = cell
+    def hybrid_forward(self, F, i, length, data, states):
+        def run_rnn():
+            return self.cell(data, states)
+
+        def copy_states():
+            return F.zeros_like(data), states
+        out, state = F.contrib.cond(i < length, run_rnn, copy_states)
+        return out, state
+
+class RNNLayer(HybridBlock):
+    def __init__(self, cell, prefix=None, params=None):
+        super(RNNLayer, self).__init__(prefix=prefix, params=params)
+        self.cell = SkipRNNCell(cell)
+    def hybrid_forward(self, F, length, data, init_states):
+        def body(data, states):
+            i = states[0]
+            out, states = self.cell(i, length, data, states[1])
+            return out, [i + 1, states]
+        out, state = F.contrib.foreach(body, data, [F.zeros((1)), init_states])
+        return out, state
+
+
+seq_len = 5
+batch_size = 1
+input_size = 3
+hidden_size = 3
+
+rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size))
+init_states = [mx.nd.normal(loc=0, scale=1, shape=(batch_size, hidden_size)) for i in range(2)]
+
+cell = mx.gluon.rnn.LSTMCell(hidden_size)
+layer = RNNLayer(cell)
+layer.initialize()
+
+out, states = layer(mx.nd.array([3]), rnn_data, init_states)
+print(rnn_data)
+print(out)
+```
+
+    
+    [[[-1.25296438  0.387312   -0.41055229]]
+    
+     [[ 1.28453672  0.21001032 -0.08666432]]
+    
+     [[ 1.46422136 -1.30581355  0.9344402 ]]
+    
+     [[ 0.5380863  -0.16038011  0.84187603]]
+    
+     [[-1.00553632  3.13221502 -0.4358989 ]]]
+    <NDArray 5x1x3 @cpu(0)>
+    
+    [[[-0.02620504  0.1605694   0.29636264]]
+    
+     [[-0.00474182  0.08719197  0.17757624]]
+    
+     [[ 0.00631597  0.04674901  0.12468992]]
+    
+     [[ 0.          0.          0.        ]]
+    
+     [[ 0.          0.          0.        ]]]
+    <NDArray 5x1x3 @cpu(0)>
+
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/embedded/index.md b/docs/tutorials/embedded/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/embedded/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/gluon/gotchas_numpy_in_mxnet.md b/docs/tutorials/gluon/gotchas_numpy_in_mxnet.md
new file mode 100644
index 00000000000..c82c63edbc2
--- /dev/null
+++ b/docs/tutorials/gluon/gotchas_numpy_in_mxnet.md
@@ -0,0 +1,168 @@
+
+# Gotchas using NumPy in Apache MXNet
+
+The goal of this tutorial is to explain some common misconceptions about using [NumPy](http://www.numpy.org/) arrays in Apache MXNet. We are going to explain why you need to minimize or completely remove usage of NumPy from your Apache MXNet code. We are also going to show how to minimize the performance impact of NumPy when you do have to use it.
+
+## Asynchronous and non-blocking nature of Apache MXNet
+
+Instead of using NumPy arrays, Apache MXNet offers its own array implementation, named [NDArray](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html). The `NDArray API` was intentionally designed to be similar to `NumPy`, but there are differences.
+
+One key difference is in the way calculations are executed. Every `NDArray` manipulation in Apache MXNet is done in an asynchronous, non-blocking way. That means that when we write code like `c = a * b`, where both `a` and `b` are `NDArrays`, the function is pushed to the [Execution Engine](https://mxnet.incubator.apache.org/architecture/overview.html#execution-engine), which starts the calculation. The function returns immediately, and the user thread can continue execution, even though the calculation may not have been completed yet.
+
+The `Execution Engine` builds the computation graph, which may reorder or combine some calculations, but it honors dependency order: if there are other manipulations of `c` done later in the code, the `Execution Engine` will start doing them once the result of `c` is available. We don't need to write callbacks to start execution of subsequent code - the `Execution Engine` is going to do it for us.
+
+To get the result of the computation we only need to access the resulting variable, and the flow of the code will be blocked until the computation results are assigned to that variable. This behavior allows us to increase code performance while still supporting the imperative programming mode.
+
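+This asynchronous behavior is easy to observe by timing an operation, for example (a small sketch; the absolute numbers will vary from machine to machine):
+
+```python
+import time
+import mxnet as mx
+
+a = mx.nd.random.uniform(shape=(2000, 2000))
+
+start = time.time()
+b = mx.nd.dot(a, a)   # returns almost immediately; the work is queued
+print('dispatch took {:.4f} s'.format(time.time() - start))
+
+start = time.time()
+b.wait_to_read()      # blocks until the result is actually computed
+print('computation took {:.4f} s'.format(time.time() - start))
+```
+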
+Refer to the [intro tutorial to NDArray](https://mxnet.incubator.apache.org/tutorials/basic/ndarray.html) if you are new to Apache MXNet and would like to learn more about how to manipulate NDArrays.
+
+## Converting NDArray to NumPy Array blocks calculation
+
+Many people are familiar with NumPy and comfortable doing tensor manipulations with it. The `NDArray API` offers a convenient [.asnumpy() method](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.asnumpy) to cast an `nd.array` to an `np.array`. However, by doing this cast and using `np.array` for calculation, we cannot use all the goodness of the `Execution Engine`. All manipulations done on an `np.array` are blocking. Moreover, the cast to `np.array` itself is a blocking operation (the same as [.asscalar()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.asscalar), [.wait_to_read()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.wait_to_read) and [.waitall()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.waitall)).
+
+That means that if we have a long computation graph and, at some point, we want to cast the result to `np.array`, it may feel like the cast takes a lot of time. But what really takes this time is the `Execution Engine`, which finishes all the async calculations we have pushed into it to get the final result, which is then converted to `np.array`.
+
+Because of the blocking nature of the [.asnumpy() method](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.asnumpy), using it reduces execution performance, especially if the calculations are done on a GPU: Apache MXNet has to copy data from the GPU to the CPU to return an `np.array`.
+
+The best solution is to **make manipulations directly on NDArrays by methods provided in [NDArray API](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html)**.
+
+## NumPy operators vs. NDArray operators
+
+Despite the fact that the [NDArray API](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html) was specifically designed to be similar to `NumPy`, it is sometimes not easy to replace existing `NumPy` computations. The main reason is that not all operators that are available in `NumPy` are available in the `NDArray API`. The list of currently available operators is on the [NDArray class page](http://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#the-ndarray-class).
+
+If a required operator is missing from the `NDArray API`, there are a few things you can do.
+
+### Assemble a higher level operator from a few lower level operators
+
+There are situations when you can assemble a higher level operator from existing operators. An example of this is the [np.full_like()](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.full_like.html) operator. This operator doesn't exist in the `NDArray API`, but can easily be replaced with a combination of existing operators.
+
+
+```python
+from mxnet import nd
+import numpy as np
+
+# NumPy has full_like() operator 
+np_y = np.full_like(a=np.arange(6, dtype=int), fill_value=10)
+
+# NDArray doesn't have it, but we can replace it with
+# creating an array of ones and then multiplying by fill_value
+nd_y = nd.ones(shape=(6,)) * 10
+
+# To compare results we had to convert NDArray to NumPy
+# But this is okay for that particular case
+np.array_equal(np_y, nd_y.asnumpy())
+```
+
+    True <!--notebook-skip-line-->
+
+### Find a similar operator with a different name and/or signature
+
+Some operators have slightly different names, but are similar in terms of functionality. For example, [nd.ravel_multi_index()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.ravel_multi_index) is similar to [np.ravel()](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.ma.ravel.html#numpy.ma.ravel). In other cases, some operators have similar names but different signatures. For example, [np.split()](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.split.html#numpy.split) and [nd.split()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.split) are similar, but the former works with indices and the latter requires the number of splits to be provided.
+
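+For instance, a small sketch of the `split` difference mentioned above:
+
+```python
+import numpy as np
+from mxnet import nd
+
+# np.split takes the indices at which to cut the array ...
+np_parts = np.split(np.arange(6), [2, 4])
+
+# ... while nd.split takes the number of equal parts to produce
+nd_parts = nd.split(nd.arange(6), num_outputs=3, axis=0)
+
+print(np_parts)                          # [array([0, 1]), array([2, 3]), array([4, 5])]
+print([p.asnumpy() for p in nd_parts])
+```
+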
+One particular example of different input requirements is [nd.pad()](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.pad). The trick is that it can only work with 4-dimensional tensors. If your input has fewer dimensions, you need to expand the number of dimensions before using `nd.pad()`, as shown in the code block below:
+
+
+```python
+def pad_array(data, max_length):
+    # expand dimensions to 4, because nd.pad can work only with 4 dims
+    data_expanded = data.reshape(1, 1, 1, data.shape[0])
+    
+    # pad all 4 dimensions with constant value of 0
+    data_padded = nd.pad(data_expanded,
+                             mode='constant',
+                             pad_width=[0, 0, 0, 0, 0, 0, 0, max_length - data.shape[0]],
+                             constant_value=0)
+    
+    # remove temporary dimensions 
+    data_reshaped_back = data_padded.reshape(max_length)
+    return data_reshaped_back
+
+pad_array(nd.array([1, 2, 3]), max_length=10)
+```
+    
+    [ 1.  2.  3.  0.  0.  0.  0.  0.  0.  0.] <!--notebook-skip-line-->
+
+    <NDArray 10 @cpu(0)> <!--notebook-skip-line-->
+
+
+### Search for an operator on [Github](https://github.com/apache/incubator-mxnet/labels/Operator)
+
+The Apache MXNet community is responsive to requests, and everyone is welcome to contribute new operators. Keep in mind that there is always a lag between new operators being merged into the codebase and the release of the next stable version. For example, the [nd.diag()](https://github.com/apache/incubator-mxnet/pull/11643) operator was recently introduced to Apache MXNet, but at the moment of writing this tutorial it is not in any stable release. You can always get the latest implementations by installing the [master version](https://mxnet.incubator.apache.org/install/index.html?version=master#) of Apache MXNet.
+
+## How to minimize the impact of blocking calls
+
+There are cases when you have to use either the `.asnumpy()` or the `.asscalar()` method. As explained before, this forces Apache MXNet to block execution until the result can be retrieved. One common use case is printing a metric or the value of a loss function.
+
+You can minimize the impact of a blocking call by calling `.asnumpy()` or `.asscalar()` at the moment when you think the calculation of the value is already done. In the example below, we introduce the `LossBuffer` class. It is used to cache the previous value of the loss function. By doing so, we delay printing by one iteration, in the hope that the `Execution Engine` will have finished the previous iteration and the blocking time will be minimized.
+
+
+```python
+from __future__ import print_function
+
+import mxnet as mx
+from mxnet import gluon, nd, autograd
+from mxnet.ndarray import NDArray
+from mxnet.gluon import HybridBlock
+import numpy as np
+
+class LossBuffer(object):
+    """
+    Simple buffer for storing loss value
+    """
+    def __init__(self):
+        self._loss = None
+
+    def new_loss(self, loss):
+        ret = self._loss
+        self._loss = loss
+        return ret
+
+    @property
+    def loss(self):
+        return self._loss
+
+
+net = gluon.nn.Dense(10)
+ce = gluon.loss.SoftmaxCELoss()
+net.initialize()
+
+data = nd.random.uniform(shape=(1024, 100))
+label = nd.array(np.random.randint(0, 10, (1024,)), dtype='int32')
+train_dataset = gluon.data.ArrayDataset(data, label)
+train_data = gluon.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
+
+trainer = gluon.Trainer(net.collect_params(), optimizer='sgd')
+loss_buffer = LossBuffer()
+
+for data, label in train_data:
+    with autograd.record():
+        out = net(data)
+        # This call saves new loss and returns previous loss
+        prev_loss = loss_buffer.new_loss(ce(out, label))
+        
+    loss_buffer.loss.backward()
+    trainer.step(data.shape[0])
+    
+    if prev_loss is not None:
+        print("Loss: {}".format(np.mean(prev_loss.asnumpy())))
+```
+
+    Loss: 2.310760974884033 <!--notebook-skip-line-->
+
+    Loss: 2.334498643875122 <!--notebook-skip-line-->
+
+    Loss: 2.3244147300720215 <!--notebook-skip-line-->
+
+    Loss: 2.332686424255371 <!--notebook-skip-line-->
+
+    Loss: 2.321366310119629 <!--notebook-skip-line-->
+
+    Loss: 2.3236165046691895 <!--notebook-skip-line-->
+
+    Loss: 2.3178648948669434 <!--notebook-skip-line-->
+
+
+## Conclusion
+
+For performance reasons, it is better to use native `NDArray API` methods and avoid using NumPy altogether. In cases when you must use NumPy, you can call the convenient `.asnumpy()` method on an `NDArray` to get its NumPy representation. By doing so, you block the whole computational process and force data to be synced between CPU and GPU. If that is a necessary evil, try to minimize the blocking time by calling `.asnumpy()` when you expect the value to be already computed.
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
diff --git a/docs/tutorials/gluon/index.md b/docs/tutorials/gluon/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/gluon/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index ae0851425be..8a6ac4081c0 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -1,5 +1,24 @@
 # Tutorials
 
+```eval_rst
+.. toctree::
+   :hidden:
+
+   basic/index.md
+   c++/index.md
+   embedded/index.md
+   gluon/index.md
+   nlp/index.md
+   onnx/index.md
+   python/index.md
+   r/index.md
+   scala/index.md
+   sparse/index.md
+   speech_recognition/index.md
+   unsupervised_learning/index.md
+   vision/index.md
+```
+
 MXNet tutorials can be found in this section. A variety of language bindings are available for MXNet (including Python, Scala, C++ and R) and we have a different tutorial section for each language.
 
 Are you new to MXNet, and don't have a preference on language? We currently recommend starting with Python, and specifically the Gluon APIs (versus Module APIs) as they're more flexible and easier to debug.
@@ -38,8 +57,10 @@ Select API:&nbsp;
     * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
     * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
 * Practitioner Guides
+    * [Gotchas using NumPy](/tutorials/gluon/gotchas_numpy_in_mxnet.html)
     * [Multi-GPU training](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>
     * [Checkpointing and Model Serialization (a.k.a. saving and loading)](/tutorials/gluon/save_load_params.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/> ([Alternative](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html))
+    * [Distributed Training](https://github.com/apache/incubator-mxnet/tree/master/example/distributed_training)
     * [Inference using an ONNX model](/tutorials/onnx/inference_on_onnx_model.html)
     * [Fine-tuning an ONNX model on Gluon](/tutorials/onnx/fine_tuning_gluon.html)
     * [Visualizing Decisions of Convolutional Neural Networks](/tutorials/vision/cnn_visualization.html)
@@ -96,6 +117,7 @@ Select API:&nbsp;
     * [Fine-Tuning a pre-trained ImageNet model with a new dataset](/faq/finetune.html)
     * [Large-Scale Multi-Host Multi-GPU Image Classification](/tutorials/vision/large_scale_classification.html)
     * [Importing an ONNX model into MXNet](/tutorials/onnx/super_resolution.html)
+    * [Hybridize Gluon models with control flows](/tutorials/control_flow/ControlFlowTutorial.html)
 * API Guides
     * Core APIs
         * NDArray
diff --git a/docs/tutorials/nlp/index.md b/docs/tutorials/nlp/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/nlp/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/onnx/export_mxnet_to_onnx.md b/docs/tutorials/onnx/export_mxnet_to_onnx.md
new file mode 100644
index 00000000000..a9c03bed8b1
--- /dev/null
+++ b/docs/tutorials/onnx/export_mxnet_to_onnx.md
@@ -0,0 +1,134 @@
+
+# Exporting MXNet model to ONNX format
+
+[Open Neural Network Exchange (ONNX)](https://github.com/onnx/onnx) provides an open source format for AI models. It defines an extensible computation graph model, as well as definitions of built-in operators and standard data types.
+
+In this tutorial, we will show how you can save MXNet models to the ONNX format.
+
+MXNet-ONNX operator coverage and features are updated regularly. Visit the [ONNX operator coverage](https://cwiki.apache.org/confluence/display/MXNET/ONNX+Operator+Coverage) page for the latest information.
+
+We will learn how to use the MXNet-to-ONNX exporter on pre-trained models.
+
+## Prerequisites
+
+To run this tutorial, you will need the following Python modules installed:
+- [MXNet >= 1.3.0](http://mxnet.incubator.apache.org/install/index.html)
+- [onnx]( https://github.com/onnx/onnx#installation) v1.2.1 (follow the install guide)
+
+*Note:* The MXNet-ONNX importer and exporter follow version 7 of the ONNX operator set, which comes with ONNX v1.2.1.
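+
+If you are unsure which versions are installed, a quick check (assuming both packages import cleanly) is:
+
+```python
+import mxnet
+import onnx
+
+print(mxnet.__version__)  # expect >= 1.3.0 for this tutorial
+print(onnx.__version__)   # expect 1.2.1
+```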
+
+
+```python
+import mxnet as mx
+import numpy as np
+from mxnet.contrib import onnx as onnx_mxnet
+import logging
+logging.basicConfig(level=logging.INFO)
+```
+
+## Downloading a model from the MXNet model zoo
+
+We download the pre-trained ResNet-18 [ImageNet](http://www.image-net.org/) model from the [MXNet Model Zoo](http://data.mxnet.io/models/imagenet/).
+We will also download the synset file to match the labels.
+
+```python
+# Download pre-trained resnet model - json and params by running following code.
+path='http://data.mxnet.io/models/imagenet/'
+[mx.test_utils.download(path+'resnet/18-layers/resnet-18-0000.params'),
+ mx.test_utils.download(path+'resnet/18-layers/resnet-18-symbol.json'),
+ mx.test_utils.download(path+'synset.txt')]
+```
+
+Now we have downloaded the ResNet-18 symbol, params and synset files to disk.
+
+## MXNet to ONNX exporter API
+
+Let us describe MXNet's `export_model` API.
+
+```python
+help(onnx_mxnet.export_model)
+```
+
+```python
+Help on function export_model in module mxnet.contrib.onnx.mx2onnx.export_model:
+
+export_model(sym, params, input_shape, input_type=<type 'numpy.float32'>, onnx_file_path=u'model.onnx', verbose=False)
+    Exports the MXNet model file, passed as a parameter, into ONNX model.
+    Accepts both symbol,parameter objects as well as json and params filepaths as input.
+    Operator support and coverage - https://cwiki.apache.org/confluence/display/MXNET/ONNX
+    
+    Parameters
+    ----------
+    sym : str or symbol object
+        Path to the json file or Symbol object
+    params : str or symbol object
+        Path to the params file or params dictionary. (Including both arg_params and aux_params)
+    input_shape : List of tuple
+        Input shape of the model e.g [(1,3,224,224)]
+    input_type : data type
+        Input data type e.g. np.float32
+    onnx_file_path : str
+        Path where to save the generated onnx file
+    verbose : Boolean
+        If true will print logs of the model conversion
+    
+    Returns
+    -------
+    onnx_file_path : str
+        Onnx file path
+```
+
+The `export_model` API can accept the MXNet model in one of the following two ways.
+
+1. MXNet sym, params objects:
+    * This is useful if we are training a model. At the end of training, we just need to invoke the `export_model` function and provide the sym and params objects as inputs, along with other attributes, to save the model in ONNX format (a minimal sketch follows this list).
+2. MXNet's exported json and params files:
+    * This is useful if we have pre-trained models and we want to convert them to ONNX format.
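+
+As a quick illustration of the first option, here is a minimal sketch. It assumes the checkpoint files downloaded above, loading them into in-memory objects with `mx.model.load_checkpoint` and passing those objects instead of file paths; the output file name is arbitrary:
+
+```python
+# Sketch of option 1: pass in-memory sym and params objects.
+# Load the checkpoint downloaded earlier to get a Symbol and the param dicts.
+sym_obj, arg_params, aux_params = mx.model.load_checkpoint('resnet-18', 0)
+
+# export_model expects a single dict holding both arg and aux params
+params = {}
+params.update(arg_params)
+params.update(aux_params)
+
+onnx_mxnet.export_model(sym_obj, params, [(1, 3, 224, 224)], np.float32,
+                        'resnet18_from_objects.onnx')
+```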
+
+Since we have downloaded pre-trained model files, we will use the `export_model` API by passing the paths to the symbol and params files.
+
+## How to use MXNet to ONNX exporter API
+
+We will use the downloaded pre-trained model files (sym, params) and define input variables.
+
+```python
+# Downloaded input symbol and params files
+sym = './resnet-18-symbol.json'
+params = './resnet-18-0000.params'
+
+# Standard Imagenet input - 3 channels, 224*224
+input_shape = (1,3,224,224)
+
+# Path of the output file
+onnx_file = './mxnet_exported_resnet18.onnx'
+```
+
+We have defined the input parameters required for the `export_model` API. Now, we are ready to convert the MXNet model into ONNX format.
+
+```python
+# Invoke the export model API. It returns the path of the converted onnx model
+converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file)
+```
+
+This API returns the path of the converted model, which you can later use to import the model into other frameworks.
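+
+As a quick round-trip check, the exported file can also be loaded back with MXNet's own ONNX importer; a short sketch using the path returned above:
+
+```python
+# Re-import the exported model to confirm the file is usable
+sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model_path)
+```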
+
+## Check validity of ONNX model
+
+Now we can check the validity of the converted ONNX model by using the ONNX checker tool. The tool will validate the model by checking whether the content is a valid protobuf:
+
+```python
+from onnx import checker
+import onnx
+
+# Load onnx model
+model_proto = onnx.load(converted_model_path)
+
+# Check if converted ONNX protobuf is valid
+checker.check_graph(model_proto.graph)
+```
+
+If the converted protobuf doesn't conform to the ONNX proto specifications, the checker will throw errors, but in this case it passes successfully.
+
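+Besides the graph-level check above, the `onnx` package also offers a whole-model check, which additionally validates model-level metadata such as the declared opset imports; a brief sketch:
+
+```python
+# Validates the entire ModelProto, not just the graph
+checker.check_model(model_proto)
+```
+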
+These checks confirm that the exported model protobuf is valid. Now the model is ready to be imported into other frameworks for inference!
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/onnx/index.md b/docs/tutorials/onnx/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/onnx/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/python/index.md b/docs/tutorials/python/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/python/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/r/index.md b/docs/tutorials/r/index.md
index 4692e7adce7..fbc8911f2a6 100644
--- a/docs/tutorials/r/index.md
+++ b/docs/tutorials/r/index.md
@@ -4,18 +4,9 @@ These tutorials introduce a few fundamental concepts in deep learning and how to
 
 ```eval_rst
 .. toctree::
-   :maxdepth: 1
+   :glob:
 
-   ndarray
-   symbol
-   fiveMinutesNeuralNetwork
-   classifyRealImageWithPretrainedModel
-   mnistCompetition
-   CatsDogsFinetune
-   CharRnnModel
-   CallbackFunction
-   CustomIterator
-   CustomLossFunction
+   *
 ```
 
 <br>
diff --git a/docs/tutorials/sparse/csr.md b/docs/tutorials/sparse/csr.md
index c2842ac16bd..0aede1ab431 100644
--- a/docs/tutorials/sparse/csr.md
+++ b/docs/tutorials/sparse/csr.md
@@ -512,9 +512,7 @@ Note that in the file the column indices are expected to be sorted in ascending
 
 ### GPU Support
 
-By default, `CSRNDArray` operators are executed on CPU. In MXNet, GPU support for `CSRNDArray` is experimental with only a few sparse operators such as [dot](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.dot).
-
-To create a `CSRNDArray` on a GPU, we need to explicitly specify the context:
+By default, `CSRNDArray` operators are executed on CPU. To create a `CSRNDArray` on a GPU, we need to explicitly specify the context:
 
 **Note** If a GPU is not available, an error will be reported in the following section. In order to execute it on a CPU, set `gpu_device` to `mx.cpu()`.
 
diff --git a/docs/tutorials/sparse/index.md b/docs/tutorials/sparse/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/sparse/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md
index c4cab75df54..27cc0d3d903 100644
--- a/docs/tutorials/sparse/row_sparse.md
+++ b/docs/tutorials/sparse/row_sparse.md
@@ -541,12 +541,7 @@ Note that only [mxnet.optimizer.SGD](https://mxnet.incubator.apache.org/api/pyth
 
 ### GPU Support
 
-By default, RowSparseNDArray operators are executed on CPU. In MXNet, GPU support for RowSparseNDArray is limited
-to a few sparse operators such as [sgd_update](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.sgd_update),
-[dot](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.dot) and
-[Embedding](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.Embedding).
-
-To create a RowSparseNDArray on gpu, we need to explicitly specify the context:
+By default, RowSparseNDArray operators are executed on CPU. To create a RowSparseNDArray on gpu, we need to explicitly specify the context:
 
 **Note** If a GPU is not available, an error will be reported in the following section. In order to execute it on a cpu, set gpu_device to mx.cpu().
 
diff --git a/docs/tutorials/sparse/train.md b/docs/tutorials/sparse/train.md
index 7472fcd14ca..fde4c0e6552 100644
--- a/docs/tutorials/sparse/train.md
+++ b/docs/tutorials/sparse/train.md
@@ -314,7 +314,7 @@ assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" %
 
 ### Training the model with multiple machines or multiple devices
 
-To train a sparse model with multiple machines, you need to call `prepare` before `forward`, or `save_checkpoint`.
+Distributed training with `row_sparse` weights and gradients is supported in MXNet, which significantly reduces the communication cost for large models. To train a sparse model with multiple machines, you need to call `prepare` before `forward`, or `save_checkpoint`.
 Please refer to the example in [mxnet/example/sparse/linear_classification](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification)
 for more details.
 
diff --git a/docs/tutorials/speech_recognition/index.md b/docs/tutorials/speech_recognition/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/speech_recognition/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/unsupervised_learning/index.md b/docs/tutorials/unsupervised_learning/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/unsupervised_learning/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/docs/tutorials/vision/index.md b/docs/tutorials/vision/index.md
new file mode 100644
index 00000000000..87d72894424
--- /dev/null
+++ b/docs/tutorials/vision/index.md
@@ -0,0 +1,8 @@
+# Tutorials
+
+```eval_rst
+.. toctree::
+   :glob:
+
+   *
+```
diff --git a/example/distributed_training/README.md b/example/distributed_training/README.md
new file mode 100644
index 00000000000..b0b0447725b
--- /dev/null
+++ b/example/distributed_training/README.md
@@ -0,0 +1,255 @@
+# Distributed Training using Gluon
+
+Deep learning models are usually trained using GPUs because GPUs can do a lot more computations in parallel than CPUs. But even with modern GPUs, it could take several days to train big models. Training can be done faster by using multiple GPUs, as described in [this](https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html) tutorial. However, only a certain number of GPUs can be attached to one host (typically 8 or 16). To make training even faster, we can use multiple GPUs attached to multiple hosts.
+
+In this tutorial, we will show how to train a model faster using multi-host distributed training.
+
+![Multiple GPUs connected to multiple hosts](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/distributed_training/distributed_training.png)
+
+We will use data parallelism to distribute the training, which involves splitting the training data across the GPUs attached to multiple hosts. Since the hosts work on different subsets of the training data in parallel, training completes a lot faster.
+
+In this tutorial, we will train a ResNet-18 network on the CIFAR-10 dataset using two hosts, each having four GPUs.
+
+## Distributed Training Architecture:
+
+Multi-host distributed training involves working with three different types of processes - worker, parameter server and scheduler.
+
+![Distributed training architecture](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/distributed_training/dist_train_arch.png)
+
+### Parameter Server:
+The parameters of the model need to be shared with all hosts since multiple hosts are working together to train one model. To make this sharing efficient, the parameters are split across multiple hosts. A parameter server on each host stores a subset of the parameters. In the figure above, parameters are split evenly between the two hosts. At the end of every iteration, each host communicates with every other host to update all parameters of the model.
+
+### Worker:
+Each host has a worker process which, in each iteration, fetches a batch of data, runs the forward and backward passes on all GPUs in the host, computes the parameter updates and sends those updates to the parameter servers on each host. Since we have multiple workers training the model, each worker only needs to process 1/N of the training data, where N is the number of workers.
+
+### Scheduler:
+The scheduler is responsible for scheduling the workers and parameter servers. There is only one scheduler in the entire cluster.
+
+## Moving to distributed training:
+
+[cifar10_dist.py](cifar10_dist.py) contains code that trains a ResNet-18 network using distributed training. In this section, we'll walk through the parts of that file that are unique to distributed training.
+
+### Step 1: Use a distributed key-value store:
+
+As mentioned above, in distributed training, parameters are split into N parts and distributed across N hosts. This is done automatically by the [distributed key-value store](https://mxnet.incubator.apache.org/tutorials/python/kvstore.html). The user only needs to create the distributed kv store and ask the `Trainer` to use it.
+
+```python
+store = mxnet.kv.create('dist')
+```
+
+It is the job of the trainer to take the gradients computed in the backward pass and update the parameters of the model. We'll tell the trainer to store and update the parameters in the distributed kv store we just created, instead of doing it in GPU or CPU memory. For example:
+
+```python
+trainer = gluon.Trainer(net.collect_params(),
+                        'adam', {'learning_rate': .001},
+                        kvstore=store)
+```
+
+### Step 2: Split the training data:
+
+In distributed training (using data parallelism), the training data is split into equal parts across all workers, and each worker uses its subset of the training data for training. For example, if we had two machines, each running a worker, and each worker managing four GPUs, we would split the data as shown below. Note that we don't split the data by the number of GPUs, but by the number of workers.
+
+![Splitting data](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/distributed_training/split_data.png)
+
+Each worker can find out the total number of workers in the cluster and its own rank, which is an integer between 0 and N-1, where N is the number of workers.
+
+```python
+store = kv.create('dist')
+print("Total number of workers: %d" % store.num_workers)
+print("This worker's rank: %d" % store.rank)
+```
+
+```
+Total number of workers: 2
+This worker's rank: 0
+```
+
+Knowing the number of workers and a particular worker's rank, it is easy to split the dataset into partitions and pick one partition to train on, depending on the rank of the worker. Here is a sampler that does exactly that.
+
+```python
+class SplitSampler(gluon.data.sampler.Sampler):
+    """ Split the dataset into `num_parts` parts and sample from the part with index `part_index`
+    Parameters
+    ----------
+    length: int
+      Number of examples in the dataset
+    num_parts: int
+      Partition the data into multiple parts
+    part_index: int
+      The index of the part to read from
+    """
+    def __init__(self, length, num_parts=1, part_index=0):
+        # Compute the length of each partition
+        self.part_len = length // num_parts
+        # Compute the start index for this partition
+        self.start = self.part_len * part_index
+        # Compute the end index for this partition
+        self.end = self.start + self.part_len
+
+    def __iter__(self):
+        # Extract examples between `start` and `end`, shuffle and return them.
+        indices = list(range(self.start, self.end))
+        random.shuffle(indices)
+        return iter(indices)
+
+    def __len__(self):
+        return self.part_len
+```
+
+We can then create a `DataLoader` using the `SplitSampler` as shown below:
+
+```python
+# Load the training data
+train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=True, transform=transform),
+                                   batch_size,
+                                   sampler=SplitSampler(50000, store.num_workers, store.rank))
+```
+
+### Step 3: Training with multiple GPUs
+
+Note that we didn't split the dataset by the number of GPUs. We split it by the number of workers, which usually translates to the number of machines. It is the worker's responsibility to split its partition across the multiple GPUs it might have and run the training in parallel on those GPUs.
+
+To train with multiple GPUs, we first need to specify the list of GPUs we want to use for training:
+
+```python
+ctx = [mx.gpu(i) for i in range(gpus_per_machine)]
+```
+
+We can then train a batch as shown below:
+
+```python
+# Train a batch using multiple GPUs
+def train_batch(batch, ctx, net, trainer):
+
+    # Split and load data into multiple GPUs
+    data = batch[0]
+    data = gluon.utils.split_and_load(data, ctx)
+    
+    # Split and load label into multiple GPUs
+    label = batch[1]
+    label = gluon.utils.split_and_load(label, ctx)
+
+    # Run the forward and backward pass
+    forward_backward(net, data, label)
+
+    # Update the parameters
+    this_batch_size = batch[0].shape[0]
+    trainer.step(this_batch_size)
+```
+
+Here is the code that runs the forward (computing loss) and backward (computing gradients) pass on multiple GPUs:
+
+```python
+# We'll use cross entropy loss since we are doing multiclass classification
+loss = gluon.loss.SoftmaxCrossEntropyLoss()
+
+# Run one forward and backward pass on multiple GPUs
+def forward_backward(net, data, label):
+
+    # Ask autograd to remember the forward pass
+    with autograd.record():
+        # Compute the loss on all GPUs
+        losses = [loss(net(X), Y) for X, Y in zip(data, label)]
+
+    # Run the backward pass (calculate gradients) on all GPUs
+    for l in losses:
+        l.backward()
+```
+
+Given `train_batch`, training an epoch is simple:
+
+```python
+for batch in train_data:
+    # Train the batch using multiple GPUs
+    train_batch(batch, ctx, net, trainer)
+```
+
+### Final Step: Launching the distributed training
+
+Note that there are several processes that need to be launched on multiple machines to do distributed training. One worker and one parameter server need to be launched on each host. The scheduler needs to be launched on one of the hosts. While this can be done manually, MXNet provides the [`launch.py`](https://github.com/apache/incubator-mxnet/blob/master/tools/launch.py) tool to make it easy.
+
+For example, the following command launches distributed training on two machines:
+
+```
+python ~/mxnet/tools/launch.py -n 2 -s 2 -H hosts \
+    --sync-dst-dir /home/ubuntu/cifar10_dist \
+    --launcher ssh \
+    "python /home/ubuntu/cifar10_dist/cifar10_dist.py"
+```
+
+- `-n 2` specifies the number of workers that must be launched.
+- `-s 2` specifies the number of parameter servers that must be launched.
+- `--sync-dst-dir` specifies a destination location where the contents of the current directory will be rsync'd.
+- `--launcher ssh` tells `launch.py` to use ssh to log in to each machine in the cluster and launch processes.
+- `"python /home/ubuntu/cifar10_dist/cifar10_dist.py"` is the command that will be executed in each of the launched processes.
+- Finally, `-H hosts` specifies the list of hosts in the cluster to be used for distributed training.
+
+Let's take a look at the `hosts` file.
+
+```
+~/dist$ cat hosts 
+d1
+d2
+```
+
+'d1' and 'd2' are the hostnames of the hosts we want to use for distributed training. `launch.py` should be able to ssh into these hosts given just the hostname on the command line. For example:
+
+```
+~/dist$ ssh d1
+Welcome to Ubuntu 16.04.3 LTS (GNU/Linux 4.4.0-1049-aws x86_64)
+
+ * Documentation:  https://help.ubuntu.com
+ * Management:     https://landscape.canonical.com
+ * Support:        https://ubuntu.com/advantage
+
+  Get cloud support with Ubuntu Advantage Cloud Guest:
+    http://www.ubuntu.com/business/services/cloud
+
+0 packages can be updated.
+0 updates are security updates.
+
+
+Last login: Wed Jan 31 18:06:45 2018 from 72.21.198.67
+```
+
+Note that no authentication information was provided to log in to the host. This can be done in multiple ways. One easy way is to specify the ssh certificates in `~/.ssh/config`. For example:
+
+```
+~$ cat ~/.ssh/config 
+Host d1
+    HostName ec2-34-201-108-233.compute-1.amazonaws.com
+    port 22
+    user ubuntu
+    IdentityFile /home/ubuntu/my_key.pem
+    IdentitiesOnly yes
+
+Host d2
+    HostName ec2-34-238-232-97.compute-1.amazonaws.com
+    port 22
+    user ubuntu
+    IdentityFile /home/ubuntu/my_key.pem
+    IdentitiesOnly yes
+```
+
+A better way is to use ssh agent forwarding. Check [this](https://aws.amazon.com/blogs/security/securely-connect-to-linux-instances-running-in-a-private-amazon-vpc/) article for more details.
+
+Here is a sample output from running distributed training:
+
+```
+$ python ~/mxnet/tools/launch.py -n 2 -s 2 -H hosts --sync-dst-dir /home/ubuntu/cifar10_dist --launcher ssh "python /home/ubuntu/cifar10_dist/cifar10_dist.py"
+2018-06-03 05:30:05,609 INFO rsync /home/ubuntu/cifar10_dist/ -> a1:/home/ubuntu/cifar10_dist
+2018-06-03 05:30:05,879 INFO rsync /home/ubuntu/cifar10_dist/ -> a2:/home/ubuntu/cifar10_dist
+Epoch 0: Test_acc 0.467400
+Epoch 0: Test_acc 0.466800
+Epoch 1: Test_acc 0.568500
+Epoch 1: Test_acc 0.571300
+Epoch 2: Test_acc 0.586300
+Epoch 2: Test_acc 0.594000
+Epoch 3: Test_acc 0.659200
+Epoch 3: Test_acc 0.653300
+Epoch 4: Test_acc 0.681200
+Epoch 4: Test_acc 0.687900
+```
+
+Note that the output from all hosts is merged and printed to the console.
+
diff --git a/example/distributed_training/cifar10_dist.py b/example/distributed_training/cifar10_dist.py
new file mode 100644
index 00000000000..506afbbe081
--- /dev/null
+++ b/example/distributed_training/cifar10_dist.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import random, sys
+
+import mxnet as mx
+from mxnet import autograd, gluon, kv, nd
+from mxnet.gluon.model_zoo import vision
+
+import numpy as np
+
+# Create a distributed key-value store
+store = kv.create('dist')
+
+# Classify the images into one of the 10 classes
+num_outputs = 10
+
+# 64 images in a batch
+batch_size_per_gpu = 64
+# How many epochs to run the training
+epochs = 5
+
+# How many GPUs per machine
+gpus_per_machine = 4
+# Effective batch size across all GPUs
+batch_size = batch_size_per_gpu * gpus_per_machine
+
+# Create the context (a list of all GPUs to be used for training)
+ctx = [mx.gpu(i) for i in range(gpus_per_machine)]
+
+# Convert to float 32
+# Having channel as the first dimension makes computation more efficient. Hence the (2,0,1) transpose.
+# Dividing by 255 normalizes the input between 0 and 1
+def transform(data, label):
+    return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)
+
+class SplitSampler(gluon.data.sampler.Sampler):
+    """ Split the dataset into `num_parts` parts and sample from the part with index `part_index`
+
+    Parameters
+    ----------
+    length: int
+      Number of examples in the dataset
+    num_parts: int
+      Partition the data into multiple parts
+    part_index: int
+      The index of the part to read from
+    """
+    def __init__(self, length, num_parts=1, part_index=0):
+        # Compute the length of each partition
+        self.part_len = length // num_parts
+        # Compute the start index for this partition
+        self.start = self.part_len * part_index
+        # Compute the end index for this partition
+        self.end = self.start + self.part_len
+
+    def __iter__(self):
+        # Extract examples between `start` and `end`, shuffle and return them.
+        indices = list(range(self.start, self.end))
+        random.shuffle(indices)
+        return iter(indices)
+
+    def __len__(self):
+        return self.part_len
+
+# Load the training data
+train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=True, transform=transform),
+                                   batch_size,
+                                   sampler=SplitSampler(50000, store.num_workers, store.rank))
+
+# Load the test data
+test_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=False, transform=transform),
+                                  batch_size, shuffle=False)
+
+# Use ResNet from model zoo
+net = vision.resnet18_v1()
+
+# Initialize the parameters with Xavier initializer
+net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
+
+# SoftmaxCrossEntropy is the most common choice of loss function for multiclass classification
+softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
+
+# Use Adam optimizer. Ask trainer to use the distributed kv store.
+trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': .001}, kvstore=store)
+
+# Evaluate accuracy of the given network using the given data
+def evaluate_accuracy(data_iterator, net):
+
+    acc = mx.metric.Accuracy()
+
+    # Iterate through data and label
+    for i, (data, label) in enumerate(data_iterator):
+
+        # Get the data and label into the GPU
+        data = data.as_in_context(ctx[0])
+        label = label.as_in_context(ctx[0])
+
+        # Get network's output which is a probability distribution
+        # Apply argmax on the probability distribution to get network's classification.
+        output = net(data)
+        predictions = nd.argmax(output, axis=1)
+
+        # Give network's prediction and the correct label to update the metric
+        acc.update(preds=predictions, labels=label)
+
+    # Return the accuracy
+    return acc.get()[1]
+
+# We'll use cross entropy loss since we are doing multiclass classification
+loss = gluon.loss.SoftmaxCrossEntropyLoss()
+
+# Run one forward and backward pass on multiple GPUs
+def forward_backward(net, data, label):
+
+    # Ask autograd to remember the forward pass
+    with autograd.record():
+        # Compute the loss on all GPUs
+        losses = [loss(net(X), Y) for X, Y in zip(data, label)]
+
+    # Run the backward pass (calculate gradients) on all GPUs
+    for l in losses:
+        l.backward()
+
+# Train a batch using multiple GPUs
+def train_batch(batch, ctx, net, trainer):
+
+    # Split and load data into multiple GPUs
+    data = batch[0]
+    data = gluon.utils.split_and_load(data, ctx)
+
+    # Split and load label into multiple GPUs
+    label = batch[1]
+    label = gluon.utils.split_and_load(label, ctx)
+
+    # Run the forward and backward pass
+    forward_backward(net, data, label)
+
+    # Update the parameters
+    this_batch_size = batch[0].shape[0]
+    trainer.step(this_batch_size)
+
+# Run as many epochs as required
+for epoch in range(epochs):
+
+    # Iterate through batches and run training using multiple GPUs
+    batch_num = 1
+    for batch in train_data:
+
+        # Train the batch using multiple GPUs
+        train_batch(batch, ctx, net, trainer)
+
+        batch_num += 1
+
+    # Print test accuracy after every epoch
+    test_accuracy = evaluate_accuracy(test_data, net)
+    print("Epoch %d: Test_acc %f" % (epoch, test_accuracy))
+    sys.stdout.flush()
+
diff --git a/example/gan/CGAN_mnist_R/CGAN_mnist_setup.R b/example/gan/CGAN_mnist_R/CGAN_mnist_setup.R
deleted file mode 100644
index ad57bc54123..00000000000
--- a/example/gan/CGAN_mnist_R/CGAN_mnist_setup.R
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-require("imager")
-require("dplyr")
-require("readr")
-require("mxnet")
-
-source("iterators.R")
-
-######################################################
-### Data import and preperation
-### First download MNIST train data at Kaggle: 
-###   https://www.kaggle.com/c/digit-recognizer/data
-######################################################
-train <- read_csv('data/train.csv')
-train<- data.matrix(train)
-
-train_data <- train[,-1]
-train_data <- t(train_data/255*2-1)
-train_label <- as.integer(train[,1])
-
-dim(train_data) <- c(28, 28, 1, ncol(train_data))
-
-##################################################
-#### Model parameters
-##################################################
-random_dim<- 96
-gen_features<- 96
-dis_features<- 32
-image_depth = 1
-fix_gamma<- T
-no_bias<- T
-eps<- 1e-5 + 1e-12
-batch_size<- 64
-
-
-##################################################
-#### Generator Symbol
-##################################################
-data = mx.symbol.Variable('data')
-
-gen_rand<- mx.symbol.normal(loc=0, scale=1, shape=c(1, 1, random_dim, batch_size), name="gen_rand")
-gen_concat<- mx.symbol.Concat(data = list(data, gen_rand), num.args = 2, name="gen_concat")
-
-g1 = mx.symbol.Deconvolution(gen_concat, name='g1', kernel=c(4,4), num_filter=gen_features*4, no_bias=T)
-gbn1 = mx.symbol.BatchNorm(g1, name='gbn1', fix_gamma=fix_gamma, eps=eps)
-gact1 = mx.symbol.Activation(gbn1, name='gact1', act_type='relu')
-
-g2 = mx.symbol.Deconvolution(gact1, name='g2', kernel=c(3,3), stride=c(2,2), pad=c(1,1), num_filter=gen_features*2, no_bias=no_bias)
-gbn2 = mx.symbol.BatchNorm(g2, name='gbn2', fix_gamma=fix_gamma, eps=eps)
-gact2 = mx.symbol.Activation(gbn2, name='gact2', act_type='relu')
-
-g3 = mx.symbol.Deconvolution(gact2, name='g3', kernel=c(4,4), stride=c(2,2), pad=c(1,1), num_filter=gen_features, no_bias=no_bias)
-gbn3 = mx.symbol.BatchNorm(g3, name='gbn3', fix_gamma=fix_gamma, eps=eps)
-gact3 = mx.symbol.Activation(gbn3, name='gact3', act_type='relu')
-
-g4 = mx.symbol.Deconvolution(gact3, name='g4', kernel=c(4,4), stride=c(2,2), pad=c(1,1), num_filter=image_depth, no_bias=no_bias)
-G_sym = mx.symbol.Activation(g4, name='G_sym', act_type='tanh')
-
-
-##################################################
-#### Discriminator Symbol
-##################################################
-data = mx.symbol.Variable('data')
-dis_digit = mx.symbol.Variable('digit')
-label = mx.symbol.Variable('label')
-
-dis_digit<- mx.symbol.Reshape(data=dis_digit, shape=c(1,1,10,batch_size), name="digit_reshape")
-dis_digit<- mx.symbol.broadcast_to(data=dis_digit, shape=c(28,28,10, batch_size), name="digit_broadcast")
-
-data_concat <- mx.symbol.Concat(list(data, dis_digit), num.args = 2, dim = 1, name='dflat_concat')
-
-d1 = mx.symbol.Convolution(data=data_concat, name='d1', kernel=c(3,3), stride=c(1,1), pad=c(0,0), num_filter=24, no_bias=no_bias)
-dbn1 = mx.symbol.BatchNorm(d1, name='dbn1', fix_gamma=fix_gamma, eps=eps)
-dact1 = mx.symbol.LeakyReLU(dbn1, name='dact1', act_type='elu', slope=0.25)
-pool1 <- mx.symbol.Pooling(data=dact1, name="pool1", pool_type="max", kernel=c(2,2), stride=c(2,2), pad=c(0,0))
-
-d2 = mx.symbol.Convolution(pool1, name='d2', kernel=c(3,3), stride=c(2,2), pad=c(0,0), num_filter=32, no_bias=no_bias)
-dbn2 = mx.symbol.BatchNorm(d2, name='dbn2', fix_gamma=fix_gamma, eps=eps)
-dact2 = mx.symbol.LeakyReLU(dbn2, name='dact2', act_type='elu', slope=0.25)
-
-d3 = mx.symbol.Convolution(dact2, name='d3', kernel=c(3,3), stride=c(1,1), pad=c(0,0), num_filter=64, no_bias=no_bias)
-dbn3 = mx.symbol.BatchNorm(d3, name='dbn3', fix_gamma=fix_gamma, eps=eps)
-dact3 = mx.symbol.LeakyReLU(dbn3, name='dact3', act_type='elu', slope=0.25)
-
-d4 = mx.symbol.Convolution(dact2, name='d3', kernel=c(4,4), stride=c(1,1), pad=c(0,0), num_filter=64, no_bias=no_bias)
-dbn4 = mx.symbol.BatchNorm(d4, name='dbn4', fix_gamma=fix_gamma, eps=eps)
-dact4 = mx.symbol.LeakyReLU(dbn4, name='dact4', act_type='elu', slope=0.25)
-
-# pool4 <- mx.symbol.Pooling(data=dact3, name="pool4", pool_type="avg", kernel=c(4,4), stride=c(1,1), pad=c(0,0))
-
-dflat = mx.symbol.Flatten(dact4, name="dflat")
-
-dfc <- mx.symbol.FullyConnected(data=dflat, name="dfc", num_hidden=1, no_bias=F)
-D_sym = mx.symbol.LogisticRegressionOutput(data=dfc, label=label, name='D_sym')
-
-
-########################
-### Graph
-########################
-input_shape_G<- c(1, 1, 10, batch_size)
-input_shape_D<- c(28, 28, 1, batch_size)
-
-graph.viz(G_sym, type = "graph", direction = "LR")
-graph.viz(D_sym, type = "graph", direction = "LR")
-
diff --git a/example/gan/CGAN_mnist_R/CGAN_train.R b/example/gan/CGAN_mnist_R/CGAN_train.R
index 9c7649f3e26..7d3225483c0 100644
--- a/example/gan/CGAN_mnist_R/CGAN_train.R
+++ b/example/gan/CGAN_mnist_R/CGAN_train.R
@@ -15,81 +15,187 @@
 # specific language governing permissions and limitations
 # under the License.
 
-#####################################################
+require("imager")
+require("dplyr")
+require("readr")
+require("mxnet")
+
+source("iterators.R")
+
+### Data import and preparation
+# First download MNIST train data at Kaggle:
+# https://www.kaggle.com/c/digit-recognizer/data
+
+train <- read_csv("data/train.csv")
+train <- data.matrix(train)
+
+train_data <- train[, -1]
+train_data <- t(train_data/255 * 2 - 1)
+train_label <- as.integer(train[, 1])
+
+dim(train_data) <- c(28, 28, 1, ncol(train_data))
+
+### Model parameters
+random_dim <- 96
+gen_features <- 96
+dis_features <- 32
+image_depth <- 1
+fix_gamma <- T
+no_bias <- T
+eps <- 1e-05 + 1e-12
+batch_size <- 64
+
+
+### Generator Symbol
+data <- mx.symbol.Variable("data")
+
+gen_rand <- mx.symbol.normal(loc = 0, scale = 1, shape = c(1, 1, random_dim, batch_size), 
+  name = "gen_rand")
+gen_concat <- mx.symbol.concat(data = list(data, gen_rand), num.args = 2, name = "gen_concat")
+
+g1 <- mx.symbol.Deconvolution(gen_concat, name = "g1", kernel = c(4, 4), num_filter = gen_features * 
+  4, no_bias = T)
+gbn1 <- mx.symbol.BatchNorm(g1, name = "gbn1", fix_gamma = fix_gamma, eps = eps)
+gact1 <- mx.symbol.Activation(gbn1, name = "gact1", act_type = "relu")
+
+g2 <- mx.symbol.Deconvolution(gact1, name = "g2", kernel = c(3, 3), stride = c(2, 
+  2), pad = c(1, 1), num_filter = gen_features * 2, no_bias = no_bias)
+gbn2 <- mx.symbol.BatchNorm(g2, name = "gbn2", fix_gamma = fix_gamma, eps = eps)
+gact2 <- mx.symbol.Activation(gbn2, name = "gact2", act_type = "relu")
+
+g3 <- mx.symbol.Deconvolution(gact2, name = "g3", kernel = c(4, 4), stride = c(2, 
+  2), pad = c(1, 1), num_filter = gen_features, no_bias = no_bias)
+gbn3 <- mx.symbol.BatchNorm(g3, name = "gbn3", fix_gamma = fix_gamma, eps = eps)
+gact3 <- mx.symbol.Activation(gbn3, name = "gact3", act_type = "relu")
+
+g4 <- mx.symbol.Deconvolution(gact3, name = "g4", kernel = c(4, 4), stride = c(2, 
+  2), pad = c(1, 1), num_filter = image_depth, no_bias = no_bias)
+G_sym <- mx.symbol.Activation(g4, name = "G_sym", act_type = "tanh")
+
+
+### Discriminator Symbol
+data <- mx.symbol.Variable("data")
+dis_digit <- mx.symbol.Variable("digit")
+label <- mx.symbol.Variable("label")
+
+dis_digit <- mx.symbol.Reshape(data = dis_digit, shape = c(1, 1, 10, batch_size), 
+  name = "digit_reshape")
+dis_digit <- mx.symbol.broadcast_to(data = dis_digit, shape = c(28, 28, 10, batch_size), 
+  name = "digit_broadcast")
+
+data_concat <- mx.symbol.concat(list(data, dis_digit), num.args = 2, dim = 1, name = "dflat_concat")
+
+d1 <- mx.symbol.Convolution(data = data_concat, name = "d1", kernel = c(3, 3), stride = c(1, 
+  1), pad = c(0, 0), num_filter = 24, no_bias = no_bias)
+dbn1 <- mx.symbol.BatchNorm(d1, name = "dbn1", fix_gamma = fix_gamma, eps = eps)
+dact1 <- mx.symbol.LeakyReLU(dbn1, name = "dact1", act_type = "elu", slope = 0.25)
+pool1 <- mx.symbol.Pooling(data = dact1, name = "pool1", pool_type = "max", kernel = c(2, 
+  2), stride = c(2, 2), pad = c(0, 0))
+
+d2 <- mx.symbol.Convolution(pool1, name = "d2", kernel = c(3, 3), stride = c(2, 2), 
+  pad = c(0, 0), num_filter = 32, no_bias = no_bias)
+dbn2 <- mx.symbol.BatchNorm(d2, name = "dbn2", fix_gamma = fix_gamma, eps = eps)
+dact2 <- mx.symbol.LeakyReLU(dbn2, name = "dact2", act_type = "elu", slope = 0.25)
+
+d3 <- mx.symbol.Convolution(dact2, name = "d3", kernel = c(3, 3), stride = c(1, 1), 
+  pad = c(0, 0), num_filter = 64, no_bias = no_bias)
+dbn3 <- mx.symbol.BatchNorm(d3, name = "dbn3", fix_gamma = fix_gamma, eps = eps)
+dact3 <- mx.symbol.LeakyReLU(dbn3, name = "dact3", act_type = "elu", slope = 0.25)
+
+d4 <- mx.symbol.Convolution(dact3, name = "d4", kernel = c(4, 4), stride = c(1, 1), 
+  pad = c(0, 0), num_filter = 64, no_bias = no_bias)
+dbn4 <- mx.symbol.BatchNorm(d4, name = "dbn4", fix_gamma = fix_gamma, eps = eps)
+dact4 <- mx.symbol.LeakyReLU(dbn4, name = "dact4", act_type = "elu", slope = 0.25)
+
+# pool4 <- mx.symbol.Pooling(data=dact3, name='pool4', pool_type='avg',
+# kernel=c(4,4), stride=c(1,1), pad=c(0,0))
+
+dflat <- mx.symbol.Flatten(dact4, name = "dflat")
+
+dfc <- mx.symbol.FullyConnected(data = dflat, name = "dfc", num_hidden = 1, no_bias = F)
+D_sym <- mx.symbol.LogisticRegressionOutput(data = dfc, label = label, name = "D_sym")
+
+
+### Graph
+input_shape_G <- c(1, 1, 10, batch_size)
+input_shape_D <- c(28, 28, 1, batch_size)
+
+graph.viz(G_sym, type = "graph", direction = "LR")
+graph.viz(D_sym, type = "graph", direction = "LR")
+
+
 ### Training module for GAN
-#####################################################
 
-devices<- mx.cpu()
+# Change this to mx.gpu() when running on a GPU machine.
+devices <- mx.cpu()
 
-data_shape_G<- c(1, 1, 10, batch_size)
-data_shape_D<- c(28, 28, 1, batch_size)
-digit_shape_D<- c(10, batch_size)
+data_shape_G <- c(1, 1, 10, batch_size)
+data_shape_D <- c(28, 28, 1, batch_size)
+digit_shape_D <- c(10, batch_size)
 
 mx.metric.binacc <- mx.metric.custom("binacc", function(label, pred) {
-  res <- mean(label==round(pred))
+  res <- mean(label == round(pred))
   return(res)
 })
 
 mx.metric.logloss <- mx.metric.custom("logloss", function(label, pred) {
-  res <- mean(label*log(pred)+(1-label)*log(1-pred))
+  res <- mean(label * log(pred) + (1 - label) * log(1 - pred))
   return(res)
 })
 
-##############################################
 ### Define iterators
-iter_G<- G_iterator(batch_size = batch_size)
-iter_D<- D_iterator(batch_size = batch_size)
+iter_G <- G_iterator(batch_size = batch_size)
+iter_D <- D_iterator(batch_size = batch_size)
 
-exec_G<- mx.simple.bind(symbol = G_sym, data=data_shape_G, ctx = devices, grad.req = "write")
-exec_D<- mx.simple.bind(symbol = D_sym, data=data_shape_D, digit=digit_shape_D, ctx = devices, grad.req = "write")
+exec_G <- mx.simple.bind(symbol = G_sym, data = data_shape_G, ctx = devices, grad.req = "write")
+exec_D <- mx.simple.bind(symbol = D_sym, data = data_shape_D, digit = digit_shape_D, 
+  ctx = devices, grad.req = "write")
 
 ### initialize parameters - To Do - personalise each layer
-initializer<- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
 
-arg_param_ini_G<- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(G_sym, data=data_shape_G)$arg.shapes, ctx = mx.cpu())
-aux_param_ini_G<- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(G_sym, data=data_shape_G)$aux.shapes, ctx = mx.cpu())
+arg_param_ini_G <- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(G_sym, 
+  data = data_shape_G)$arg.shapes, ctx = devices)
+aux_param_ini_G <- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(G_sym, 
+  data = data_shape_G)$aux.shapes, ctx = devices)
 
-arg_param_ini_D<- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(D_sym, data=data_shape_D, digit=digit_shape_D)$arg.shapes, ctx = mx.cpu())
-aux_param_ini_D<- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(D_sym, data=data_shape_D, digit=digit_shape_D)$aux.shapes, ctx = mx.cpu())
+arg_param_ini_D <- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(D_sym, 
+  data = data_shape_D, digit = digit_shape_D)$arg.shapes, ctx = devices)
 
-mx.exec.update.arg.arrays(exec_G, arg_param_ini_G, match.name=TRUE)
-mx.exec.update.aux.arrays(exec_G, aux_param_ini_G, match.name=TRUE)
+aux_param_ini_D <- mx.init.create(initializer = initializer, shape.array = mx.symbol.infer.shape(D_sym, 
+  data = data_shape_D, digit = digit_shape_D)$aux.shapes, ctx = devices)
 
-mx.exec.update.arg.arrays(exec_D, arg_param_ini_D, match.name=TRUE)
-mx.exec.update.aux.arrays(exec_D, aux_param_ini_D, match.name=TRUE)
+mx.exec.update.arg.arrays(exec_G, arg_param_ini_G, match.name = TRUE)
+mx.exec.update.aux.arrays(exec_G, aux_param_ini_G, match.name = TRUE)
+
+mx.exec.update.arg.arrays(exec_D, arg_param_ini_D, match.name = TRUE)
+mx.exec.update.aux.arrays(exec_D, aux_param_ini_D, match.name = TRUE)
 
 input_names_G <- mxnet:::mx.model.check.arguments(G_sym)
 input_names_D <- mxnet:::mx.model.check.arguments(D_sym)
 
 
-###################################################
-#initialize optimizers
-optimizer_G<-mx.opt.create(name = "adadelta",
-                           rho=0.92, 
-                           epsilon = 1e-6, 
-                           wd=0, 
-                           rescale.grad=1/batch_size, 
-                           clip_gradient=1)
+### initialize optimizers
+optimizer_G <- mx.opt.create(name = "adadelta", rho = 0.92, epsilon = 1e-06, wd = 0, 
+  rescale.grad = 1/batch_size, clip_gradient = 1)
+
+updater_G <- mx.opt.get.updater(optimizer = optimizer_G, weights = exec_G$ref.arg.arrays, 
+  ctx = devices)
 
-updater_G<- mx.opt.get.updater(optimizer = optimizer_G, weights = exec_G$ref.arg.arrays)
+optimizer_D <- mx.opt.create(name = "adadelta", rho = 0.92, epsilon = 1e-06, wd = 0, 
+  rescale.grad = 1/batch_size, clip_gradient = 1)
 
-optimizer_D<-mx.opt.create(name = "adadelta",
-                           rho=0.92, 
-                           epsilon = 1e-6, 
-                           wd=0, 
-                           rescale.grad=1/batch_size, 
-                           clip_gradient=1)
-updater_D<- mx.opt.get.updater(optimizer = optimizer_D, weights = exec_D$ref.arg.arrays)
+updater_D <- mx.opt.get.updater(optimizer = optimizer_D, weights = exec_D$ref.arg.arrays, 
+  ctx = devices)
 
-####################################
-#initialize metric
-metric_G<- mx.metric.binacc
-metric_G_value<- metric_G$init()
+### initialize metric
+metric_G <- mx.metric.binacc
+metric_G_value <- metric_G$init()
 
-metric_D<- mx.metric.binacc
-metric_D_value<- metric_D$init()
+metric_D <- mx.metric.binacc
+metric_D_value <- metric_D$init()
 
-iteration<- 1
+iteration <- 1
 iter_G$reset()
 iter_D$reset()
 
@@ -102,71 +208,81 @@ for (iteration in 1:2400) {
   ### Random input to Generator to produce fake sample
   G_values <- iter_G$value()
   G_data <- G_values[input_names_G]
-  mx.exec.update.arg.arrays(exec_G, arg.arrays = G_data, match.name=TRUE)
-  mx.exec.forward(exec_G, is.train=T)
+  mx.exec.update.arg.arrays(exec_G, arg.arrays = G_data, match.name = TRUE)
+  mx.exec.forward(exec_G, is.train = T)
   
-  ### Feed Discriminator with Concatenated Generator images and real images
-  ### Random input to Generator
+  ### Feed Discriminator with concatenated Generator images and real images
+  ### Random input to Generator
   D_data_fake <- exec_G$ref.outputs$G_sym_output
-  D_digit_fake <- G_values$data %>% mx.nd.Reshape(shape=c(-1, batch_size))
+  D_digit_fake <- G_values$data %>% mx.nd.Reshape(shape = c(-1, batch_size))
   
   D_values <- iter_D$value()
   D_data_real <- D_values$data
   D_digit_real <- D_values$digit
   
   ### Train loop on fake
-  mx.exec.update.arg.arrays(exec_D, arg.arrays = list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(0, batch_size))), match.name=TRUE)
-  mx.exec.forward(exec_D, is.train=T)
+  mx.exec.update.arg.arrays(exec_D, arg.arrays = list(data = D_data_fake, digit = D_digit_fake, 
+    label = mx.nd.array(rep(0, batch_size))), match.name = TRUE)
+  mx.exec.forward(exec_D, is.train = T)
   mx.exec.backward(exec_D)
-  update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE)
+  update_args_D <- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
+  mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null = TRUE)
   
-  metric_D_value <- metric_D$update(label = mx.nd.array(rep(0, batch_size)), exec_D$ref.outputs[["D_sym_output"]], metric_D_value)
+  metric_D_value <- metric_D$update(label = as.array(mx.nd.array(rep(0, batch_size))), 
+    pred = as.array(exec_D$ref.outputs[["D_sym_output"]]), metric_D_value)
   
   ### Train loop on real
-  mx.exec.update.arg.arrays(exec_D, arg.arrays = list(data=D_data_real, digit=D_digit_real, label=mx.nd.array(rep(1, batch_size))), match.name=TRUE)
-  mx.exec.forward(exec_D, is.train=T)
+  mx.exec.update.arg.arrays(exec_D, arg.arrays = list(data = D_data_real, digit = D_digit_real, 
+    label = mx.nd.array(rep(1, batch_size))), match.name = TRUE)
+  mx.exec.forward(exec_D, is.train = T)
   mx.exec.backward(exec_D)
-  update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE)
+  update_args_D <- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
+  mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null = TRUE)
   
-  metric_D_value <- metric_D$update(mx.nd.array(rep(1, batch_size)), exec_D$ref.outputs[["D_sym_output"]], metric_D_value)
+  metric_D_value <- metric_D$update(label = as.array(mx.nd.array(rep(1, batch_size))), 
+    pred = as.array(exec_D$ref.outputs[["D_sym_output"]]), metric_D_value)
   
   ### Update Generator weights - use a seperate executor for writing data gradients
-  exec_D_back<- mxnet:::mx.symbol.bind(symbol = D_sym, arg.arrays = exec_D$arg.arrays, aux.arrays = exec_D$aux.arrays, grad.reqs = rep("write", length(exec_D$arg.arrays)), ctx = devices)
-  mx.exec.update.arg.arrays(exec_D_back, arg.arrays = list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(1, batch_size))), match.name=TRUE)
-  mx.exec.forward(exec_D_back, is.train=T)
+  exec_D_back <- mxnet:::mx.symbol.bind(symbol = D_sym, arg.arrays = exec_D$arg.arrays, 
+    aux.arrays = exec_D$aux.arrays, grad.reqs = rep("write", length(exec_D$arg.arrays)), 
+    ctx = devices)
+  mx.exec.update.arg.arrays(exec_D_back, arg.arrays = list(data = D_data_fake, 
+    digit = D_digit_fake, label = mx.nd.array(rep(1, batch_size))), match.name = TRUE)
+  mx.exec.forward(exec_D_back, is.train = T)
   mx.exec.backward(exec_D_back)
-  D_grads<- exec_D_back$ref.grad.arrays$data
-  mx.exec.backward(exec_G, out_grads=D_grads)
+  D_grads <- exec_D_back$ref.grad.arrays$data
+  mx.exec.backward(exec_G, out_grads = D_grads)
   
-  update_args_G<- updater_G(weight = exec_G$ref.arg.arrays, grad = exec_G$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec_G, update_args_G, skip.null=TRUE)
+  update_args_G <- updater_G(weight = exec_G$ref.arg.arrays, grad = exec_G$ref.grad.arrays)
+  mx.exec.update.arg.arrays(exec_G, update_args_G, skip.null = TRUE)
   
-  ### Update metrics
-  #metric_G_value <- metric_G$update(values[[label_name]], exec_G$ref.outputs[[output_name]], metric_G_value)
+  ### Update metrics
+  # metric_G_value <- metric_G$update(values[[label_name]],
+  #   exec_G$ref.outputs[[output_name]], metric_G_value)
   
-  if (iteration %% 25==0){
+  if (iteration%%25 == 0) {
     D_metric_result <- metric_D$get(metric_D_value)
-    cat(paste0("[", iteration, "] ", D_metric_result$name, ": ", D_metric_result$value, "\n"))
+    cat(paste0("[", iteration, "] ", D_metric_result$name, ": ", D_metric_result$value, 
+      "\n"))
   }
   
-  if (iteration==1 | iteration %% 100==0){
+  if (iteration == 1 | iteration%%100 == 0) {
     
-    metric_D_value<- metric_D$init()
+    metric_D_value <- metric_D$init()
     
-    par(mfrow=c(3,3), mar=c(0.1,0.1,0.1,0.1))
+    par(mfrow = c(3, 3), mar = c(0.1, 0.1, 0.1, 0.1))
     for (i in 1:9) {
-      img <- as.array(exec_G$ref.outputs$G_sym_output)[,,,i]
-      plot(as.cimg(img), axes=F)
+      img <- as.array(exec_G$ref.outputs$G_sym_output)[, , , i]
+      plot(as.cimg(img), axes = F)
     }
-
+    
     print(as.numeric(as.array(G_values$digit)))
     print(as.numeric(as.array(D_values$label)))
     
   }
 }
 
+if (!dir.exists(file.path(".", "models"))) dir.create(file.path(".", "models"))
 mx.symbol.save(D_sym, filename = "models/D_sym_model_v1.json")
 mx.nd.save(exec_D$arg.arrays, filename = "models/D_arg_params_v1.params")
 mx.nd.save(exec_D$aux.arrays, filename = "models/D_aux_params_v1.params")
@@ -177,23 +293,23 @@ mx.nd.save(exec_G$aux.arrays, filename = "models/G_aux_params_v1.params")
 
 
 ### Inference
-G_sym<- mx.symbol.load("models/G_sym_model_v1.json")
-G_arg_params<- mx.nd.load("models/G_arg_params_v1.params")
-G_aux_params<- mx.nd.load("models/G_aux_params_v1.params")
+G_sym <- mx.symbol.load("models/G_sym_model_v1.json")
+G_arg_params <- mx.nd.load("models/G_arg_params_v1.params")
+G_aux_params <- mx.nd.load("models/G_aux_params_v1.params")
 
-digit<- mx.nd.array(rep(9, times=batch_size))
-data<- mx.nd.one.hot(indices = digit, depth = 10)
-data<- mx.nd.reshape(data = data, shape = c(1,1,-1, batch_size))
+digit <- mx.nd.array(rep(9, times = batch_size))
+data <- mx.nd.one.hot(indices = digit, depth = 10)
+data <- mx.nd.reshape(data = data, shape = c(1, 1, -1, batch_size))
 
-exec_G<- mx.simple.bind(symbol = G_sym, data=data_shape_G, ctx = devices, grad.req = "null")
-mx.exec.update.arg.arrays(exec_G, G_arg_params, match.name=TRUE)
-mx.exec.update.arg.arrays(exec_G, list(data=data), match.name=TRUE)
-mx.exec.update.aux.arrays(exec_G, G_aux_params, match.name=TRUE)
+exec_G <- mx.simple.bind(symbol = G_sym, data = data_shape_G, ctx = devices, grad.req = "null")
+mx.exec.update.arg.arrays(exec_G, G_arg_params, match.name = TRUE)
+mx.exec.update.arg.arrays(exec_G, list(data = data), match.name = TRUE)
+mx.exec.update.aux.arrays(exec_G, G_aux_params, match.name = TRUE)
 
-mx.exec.forward(exec_G, is.train=F)
+mx.exec.forward(exec_G, is.train = F)
 
-par(mfrow=c(3,3), mar=c(0.1,0.1,0.1,0.1))
+par(mfrow = c(3, 3), mar = c(0.1, 0.1, 0.1, 0.1))
 for (i in 1:9) {
-  img <- as.array(exec_G$ref.outputs$G_sym_output)[,,,i]
-  plot(as.cimg(img), axes=F)
+  img <- as.array(exec_G$ref.outputs$G_sym_output)[, , , i]
+  plot(as.cimg(img), axes = F)
 }
diff --git a/example/gan/CGAN_mnist_R/iterators.R b/example/gan/CGAN_mnist_R/iterators.R
index 6069296c24f..dffe468ad2c 100644
--- a/example/gan/CGAN_mnist_R/iterators.R
+++ b/example/gan/CGAN_mnist_R/iterators.R
@@ -16,64 +16,66 @@
 # under the License.
 
 
-G_iterator<- function(batch_size){
+G_iterator <- function(batch_size) {
   
-  batch<- 0
-  batch_per_epoch<-5
+  batch <- 0
+  batch_per_epoch <- 5
   
-  reset<- function(){
-    batch<<- 0
+  reset <- function() {
+    batch <<- 0
   }
   
-  iter.next<- function(){
-    batch<<- batch+1
-    if (batch>batch_per_epoch) {
+  iter.next <- function() {
+    batch <<- batch + 1
+    if (batch > batch_per_epoch) {
       return(FALSE)
     } else {
       return(TRUE)
     }
   }
   
-  value<- function(){
-    set.seed(123+batch)
-    digit<- mx.nd.array(sample(0:9, size = batch_size, replace = T))
-    data<- mx.nd.one.hot(indices = digit, depth = 10)
-    data<- mx.nd.reshape(data = data, shape = c(1,1,-1, batch_size))
-    return(list(data=data, digit=digit))
+  value <- function() {
+    set.seed(123 + batch)
+    digit <- mx.nd.array(sample(0:9, size = batch_size, replace = T))
+    data <- mx.nd.one.hot(indices = digit, depth = 10)
+    data <- mx.nd.reshape(data = data, shape = c(1, 1, -1, batch_size))
+    return(list(data = data, digit = digit))
   }
   
-  return(list(reset=reset, iter.next=iter.next, value=value, batch_size=batch_size, batch=batch))
+  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, 
+    batch = batch))
 }
 
-D_iterator<- function(batch_size){
+D_iterator <- function(batch_size) {
   
-  batch<- 0
-  batch_per_epoch<-5
+  batch <- 0
+  batch_per_epoch <- 5
   
-  reset<- function(){
-    batch<<- 0
+  reset <- function() {
+    batch <<- 0
   }
   
-  iter.next<- function(){
-    batch<<- batch+1
-    if (batch>batch_per_epoch) {
+  iter.next <- function() {
+    batch <<- batch + 1
+    if (batch > batch_per_epoch) {
       return(FALSE)
     } else {
       return(TRUE)
     }
   }
   
-  value<- function(){
-    set.seed(123+batch)
-    idx<- sample(length(train_label), size = batch_size, replace = T)
-    data<- train_data[,,,idx, drop=F]
-    label<- mx.nd.array(train_label[idx])
-    digit<- mx.nd.one.hot(indices = label, depth = 10)
+  value <- function() {
+    set.seed(123 + batch)
+    idx <- sample(length(train_label), size = batch_size, replace = T)
+    data <- train_data[, , , idx, drop = F]
+    label <- mx.nd.array(train_label[idx])
+    digit <- mx.nd.one.hot(indices = label, depth = 10)
     
-    return(list(data=mx.nd.array(data), digit=digit, label=label))
+    return(list(data = mx.nd.array(data), digit = digit, label = label))
   }
   
-  return(list(reset=reset, iter.next=iter.next, value=value, batch_size=batch_size, batch=batch))
+  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, 
+    batch = batch))
 }
 
 
diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py
index b3b13053add..5775f30bd88 100755
--- a/example/image-classification/common/fit.py
+++ b/example/image-classification/common/fit.py
@@ -49,7 +49,8 @@ def _get_lr_scheduler(args, kv):
     steps = [epoch_size * (x - begin_epoch)
              for x in step_epochs if x - begin_epoch > 0]
     if steps:
-        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
+        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
+                                                         base_lr=args.lr))
     else:
         return (lr, None)
 
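The hunk above threads the user-specified base learning rate into the scheduler, which otherwise falls back to its default of 0.01. A minimal sketch of the resulting behavior (epoch size and step epochs are hypothetical values for illustration):

import mxnet as mx

# hypothetical setup, for illustration only
base_lr = 0.1
epoch_size = 1000                          # batches per epoch
steps = [epoch_size * e for e in [30, 60]]

# base_lr is now passed explicitly, so decay starts from 0.1 instead of 0.01
sched = mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=0.1, base_lr=base_lr)
print(sched(1))      # 0.1   (before the first step boundary)
print(sched(30001))  # 0.01  (first boundary crossed, lr multiplied by 0.1)
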
diff --git a/example/speech_recognition/README.md b/example/speech_recognition/README.md
index 00d16660240..f95fddf2103 100644
--- a/example/speech_recognition/README.md
+++ b/example/speech_recognition/README.md
@@ -19,9 +19,9 @@ With rich functionalities and convenience explained above, you can build your ow
 ## **Environments**
 - MXNet version: 0.9.5+
 - GPU memory size: 2.4GB+
-- Install tensorboard for logging
+- Install mxboard for logging
 <pre>
-<code>pip install tensorboard</code>
+<code>pip install mxboard</code>
 </pre>  
 
 - [SoundFile](https://pypi.python.org/pypi/SoundFile/0.8.1) for audio preprocessing (If encounter errors about libsndfile, follow [this tutorial](http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libsndfile.html).)
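For reference, a minimal mxboard logging sketch in the spirit of how train.py uses it below (the log directory and scalar values are placeholders):

from mxboard import SummaryWriter

sw = SummaryWriter(logdir='mxlog/libri_sample')
for epoch in range(3):
    sw.add_scalar('loss epoch', 1.0 / (epoch + 1), epoch)  # dummy value per epoch
sw.close()
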
diff --git a/example/speech_recognition/deepspeech.cfg b/example/speech_recognition/deepspeech.cfg
index ec3af045958..69894ae7d64 100644
--- a/example/speech_recognition/deepspeech.cfg
+++ b/example/speech_recognition/deepspeech.cfg
@@ -26,14 +26,15 @@ prefix = deep_bucket
 # when mode is load or predict, model will be loaded from the file name with model_file under checkpoints
 model_file = deep_bucketn_epoch0n_batch-0018
 batch_size = 12
-#batch_size=4
+# use batch_size = 4 with a single GPU
+#batch_size = 4
 # log will be saved by the log_filename
 log_filename = deep_bucket.log
 # checkpoint set n to save checkpoints after n epoch
 save_checkpoint_every_n_epoch = 1
 save_checkpoint_every_n_batch = 3000
 is_bi_graphemes = True
-tensorboard_log_dir = tblog/deep_bucket
+mxboard_log_dir = mxlog/deep_bucket
 # if random_seed is -1 then it gets random seed from timestamp
 mx_random_seed = -1
 random_seed = -1
diff --git a/example/speech_recognition/default.cfg b/example/speech_recognition/default.cfg
index e4beb83d32d..b0869a9dad2 100644
--- a/example/speech_recognition/default.cfg
+++ b/example/speech_recognition/default.cfg
@@ -31,7 +31,7 @@ log_filename = test.log
 save_checkpoint_every_n_epoch = 20
 save_checkpoint_every_n_batch = 1000
 is_bi_graphemes = False
-tensorboard_log_dir = tblog/libri_sample
+mxboard_log_dir = mxlog/libri_sample
 # if random_seed is -1 then it gets random seed from timestamp
 mx_random_seed = 1234
 random_seed = 1234
diff --git a/example/speech_recognition/singleton.py b/example/speech_recognition/singleton.py
index 1d68edfb3ca..01717e4df06 100644
--- a/example/speech_recognition/singleton.py
+++ b/example/speech_recognition/singleton.py
@@ -19,9 +19,9 @@
 import logging as log
 
 class Singleton:
-    def __init__(self, decrated):
-        log.debug("Singleton Init %s" % decrated)
-        self._decorated = decrated
+    def __init__(self, decorated):
+        log.debug("Singleton Init %s" % decorated)
+        self._decorated = decorated
 
     def getInstance(self):
         try:
@@ -30,25 +30,11 @@ def getInstance(self):
             self._instance = self._decorated()
             return self._instance
 
-    def __new__(class_, *args, **kwargs):
+    def __new__(cls, *args, **kwargs):
         print("__new__")
-        class_.instances[class_] = super(Singleton, class_).__new__(class_, *args, **kwargs)
-        return class_.instances[class_]
+        cls._instance = super(Singleton, cls).__new__(cls, *args, **kwargs)
+        return cls._instance
 
     def __call__(self):
         raise TypeError("Singletons must be accessed through 'getInstance()'")
 
-
-class SingletonInstane:
-  __instance = None
-
-  @classmethod
-  def __getInstance(cls):
-    return cls.__instance
-
-  @classmethod
-  def instance(cls, *args, **kargs):
-    cls.__instance = cls(*args, **kargs)
-    cls.instance = cls.__getInstance
-    return cls.__instance
-
diff --git a/example/speech_recognition/stt_metric.py b/example/speech_recognition/stt_metric.py
index fc1916b40c3..ec74fc063dc 100644
--- a/example/speech_recognition/stt_metric.py
+++ b/example/speech_recognition/stt_metric.py
@@ -47,6 +47,7 @@ def __init__(self, batch_size, num_gpu, is_epoch_end=False, is_logging=True):
         self.total_ctc_loss = 0.
         self.batch_loss = 0.
         self.is_logging = is_logging
+
     def update(self, labels, preds):
         check_label_shapes(labels, preds)
         if self.is_logging:
@@ -83,10 +84,15 @@ def update(self, labels, preds):
                     if self.is_logging:
                         log.info("loss: %f " % loss)
         self.total_ctc_loss += self.batch_loss
+
     def get_batch_loss(self):
         return self.batch_loss
+
     def get_name_value(self):
-        total_cer = float(self.total_l_dist) / float(self.total_n_label)
+        try:
+            total_cer = float(self.total_l_dist) / float(self.total_n_label)
+        except ZeroDivisionError:
+            total_cer = float('inf')
 
         return total_cer, self.total_n_label, self.total_l_dist, self.total_ctc_loss
 
@@ -244,4 +250,3 @@ def char_match_2way(label, pred):
     val = val1_max if val1_max > val2_max else val2_max
     val_matched = val1_max_matched if val1_max > val2_max else val2_max_matched
     return val, val_matched, n_whole_label
-
diff --git a/example/speech_recognition/train.py b/example/speech_recognition/train.py
index 0d04e4e47a5..b1ae50b0755 100644
--- a/example/speech_recognition/train.py
+++ b/example/speech_recognition/train.py
@@ -16,15 +16,14 @@
 # under the License.
 
 import sys
-
+import json
 sys.path.insert(0, "../../python")
 import os.path
+#mxboard setting
+from mxboard import SummaryWriter
 import mxnet as mx
 from config_util import get_checkpoint_path, parse_contexts
 from stt_metric import STTMetric
-#tensorboard setting
-from tensorboard import SummaryWriter
-import json
 from stt_bucketing_module import STTBucketingModule
 
 
@@ -65,7 +64,7 @@ def do_training(args, module, data_train, data_val, begin_epoch=0):
     contexts = parse_contexts(args)
     num_gpu = len(contexts)
     eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True)
-    # tensorboard setting
+    # mxboard setting
     loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False)
 
     optimizer = args.config.get('optimizer', 'optimizer')
@@ -131,9 +130,9 @@ def reset_optimizer(force_init=False):
         data_train.reset()
         data_train.is_first_epoch = True
 
-    #tensorboard setting
-    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
-    summary_writer = SummaryWriter(tblog_dir)
+    #mxboard setting
+    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
+    summary_writer = SummaryWriter(mxlog_dir)
 
     while True:
 
@@ -144,7 +143,7 @@ def reset_optimizer(force_init=False):
         for nbatch, data_batch in enumerate(data_train):
             module.forward_backward(data_batch)
             module.update()
-            # tensorboard setting
+            # mxboard setting
             if (nbatch + 1) % show_every == 0:
                 module.update_metric(loss_metric, data_batch.label)
             #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
@@ -160,7 +159,7 @@ def reset_optimizer(force_init=False):
             module.forward(data_batch, is_train=True)
             module.update_metric(eval_metric, data_batch.label)
 
-        # tensorboard setting
+        # mxboard setting
         val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
         log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
         curr_acc = val_cer
@@ -170,7 +169,7 @@ def reset_optimizer(force_init=False):
         data_train.reset()
         data_train.is_first_epoch = False
 
-        # tensorboard setting
+        # mxboard setting
         train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
         summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
         summary_writer.add_scalar('CER train', train_cer, n_epoch)
diff --git a/include/mxnet/c_api_test.h b/include/mxnet/c_api_test.h
new file mode 100644
index 00000000000..fe6fc7fe9cc
--- /dev/null
+++ b/include/mxnet/c_api_test.h
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file c_api_test.h
+ * \brief C API of mxnet for ease of testing backend in Python
+ */
+#ifndef MXNET_C_API_TEST_H_
+#define MXNET_C_API_TEST_H_
+
+/*! \brief Inhibit C++ name-mangling for MXNet functions. */
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include <mxnet/c_api.h>
+
+/*!
+ * \brief This API partitions a graph only by the operator names
+ * provided by users. This will attach a DefaultSubgraphProperty
+ * to the input graph for partitioning. This function should be
+ * used for testing purposes only.
+ */
+MXNET_DLL int MXPartitionGraphByOpNames(SymbolHandle sym_handle,
+                                        const char* prop_name,
+                                        const mx_uint num_ops,
+                                        const char** op_names,
+                                        SymbolHandle* ret_sym_handle);
+
+/*!
+ * \brief Given a subgraph property name, use the provided op names
+ * as the op_names attribute for that subgraph property, instead of
+ * the predefined ones. This is intended for testing only.
+ */
+MXNET_DLL int MXSetSubgraphPropertyOpNames(const char* prop_name,
+                                           const mx_uint num_ops,
+                                           const char** op_names);
+
+/*!
+ * \brief Given a subgraph property name, delete the op name set
+ * in the SubgraphPropertyOpNameSet.
+ */
+MXNET_DLL int MXRemoveSubgraphPropertyOpNames(const char* prop_name);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // MXNET_C_API_TEST_H_
diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h
index dc48bfb83fa..11e64edfcd5 100644
--- a/include/mxnet/engine.h
+++ b/include/mxnet/engine.h
@@ -41,8 +41,26 @@ class Engine;
 
 /*! \brief namespace of engine internal types. */
 namespace engine {
-/*! \brief Internal representation of variable. */
-struct Var;
+/*! \brief base class of engine variables.*/
+struct Var {
+  virtual size_t version() {
+    return version_;
+  }
+  virtual ~Var() = default;
+  /*!
+   * \brief cast variable to derived type T
+   * \tparam T the type we want to cast into.
+   * \return A casted variable.
+   */
+  template <typename T>
+  inline T* Cast();
+  /*!
+   * \brief version number of the var. Every time the object it is associated with
+   * is modified, the version number is incremented by 1.
+   */
+  size_t version_{0};
+};  // struct Var
+
 /*! \brief Internal representation of operator.  */
 struct Opr;
 /*! \brief Variable pointer type, usually hold by user used to specify dependencies. */
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index bae3ea90d5e..6141a4da78e 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -340,6 +340,10 @@ class NDArray {
   inline size_t byte_offset() const {
     return byte_offset_;
   }
+  /*! \brief return var version of the NDArray*/
+  inline size_t version() const {
+    return var()->version();
+  }
   /*!
    * \brief save the content into binary stream
    * \param strm the output stream
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 2bfcdd62eda..89e1c9e087b 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -574,7 +574,7 @@ def _get_op_name_prefix(op_name):
     return ""
 
 
-# pylint: enable=too-many-locals, invalid-name
+# pylint: enable=invalid-name
 def _init_op_module(root_namespace, module_name, make_op_func):
     """
     Registers op functions created by `make_op_func` under
diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index af7fedb33cb..0960776251c 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -308,6 +308,126 @@ def convert_tanh(node, **kwargs):
     )
     return [node]
 
+@mx_op.register("cos")
+def convert_cos(node, **kwargs):
+    """Map MXNet's cos operator attributes to onnx's Cos operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Cos',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
+@mx_op.register("sin")
+def convert_sin(node, **kwargs):
+    """Map MXNet's sin operator attributes to onnx's Sin operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Sin',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
+@mx_op.register("tan")
+def convert_tan(node, **kwargs):
+    """Map MXNet's tan operator attributes to onnx's tan operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Tan',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
+@mx_op.register("arccos")
+def convert_acos(node, **kwargs):
+    """Map MXNet's acos operator attributes to onnx's acos operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Acos',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
+@mx_op.register("arcsin")
+def convert_asin(node, **kwargs):
+    """Map MXNet's asin operator attributes to onnx's asin operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Asin',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
+@mx_op.register("arctan")
+def convert_atan(node, **kwargs):
+    """Map MXNet's atan operator attributes to onnx's atan operator
+    and return the created node.
+    """
+    helper, _, _ = import_onnx_modules()
+    name = node["name"]
+    inputs = node["inputs"]
+    input_node_idx = kwargs["index_lookup"][inputs[0][0]]
+    proc_nodes = kwargs["proc_nodes"]
+    input_node = proc_nodes[input_node_idx].name
+
+    node = helper.make_node(
+        'Atan',
+        [input_node],
+        [name],
+        name=name
+    )
+    return [node]
+
 #Basic neural network functions
 @mx_op.register("sigmoid")
 def convert_sigmoid(node, **kwargs):
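Each converter above follows the same recipe: resolve the input node through index_lookup and emit the equivalently named ONNX node. A hedged end-to-end sketch of exporting a symbol that exercises these converters (checkpoint names are illustrative; export_model is assumed to take (sym, params, input_shape, input_type, onnx_file_path) as defined in this tree):

import numpy as np
import mxnet as mx
from mxnet.contrib import onnx as onnx_mxnet

data = mx.sym.var('data')
net = mx.sym.sin(mx.sym.cos(data))

# write a dummy checkpoint so there is a params file to export
mod = mx.mod.Module(net, data_names=['data'], label_names=None)
mod.bind(data_shapes=[('data', (1, 3))])
mod.init_params()
mod.save_checkpoint('trig', 0)  # writes trig-symbol.json, trig-0000.params

onnx_mxnet.export_model('trig-symbol.json', 'trig-0000.params',
                        [(1, 3)], np.float32, 'trig.onnx')
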
diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_model.py b/python/mxnet/contrib/onnx/mx2onnx/export_model.py
index 0dbfdc1d7b9..33292bf664a 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/export_model.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/export_model.py
@@ -18,7 +18,7 @@
 # coding: utf-8
 #pylint: disable-msg=too-many-arguments
 
-"""export function"""
+"""Exports an MXNet model to the ONNX model format"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/python/mxnet/contrib/onnx/onnx2mx/import_model.py b/python/mxnet/contrib/onnx/onnx2mx/import_model.py
index 4e4d7863755..e190c3bdadc 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/import_model.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/import_model.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-"""import function"""
+"""Functions for importing ONNX models to MXNet and for checking metadata"""
 # pylint: disable=no-member
 
 from .import_onnx import GraphProto
@@ -72,6 +72,7 @@ def get_model_metadata(model_file):
             'output_tensor_data' : <list of tuples representing the shape of the output
                                     of the model>
         }
+
     """
     graph = GraphProto()
     try:
diff --git a/python/mxnet/executor_manager.py b/python/mxnet/executor_manager.py
index 825aa76e43c..9a53562204b 100644
--- a/python/mxnet/executor_manager.py
+++ b/python/mxnet/executor_manager.py
@@ -127,7 +127,7 @@ def _bind_exec(sym, ctx, input_shapes, param_names, need_grad=False,
     assert(arg_types is not None)
 
     arg_arrays = []
-    grad_arrays = {} if need_grad != False else None
+    grad_arrays = {} if need_grad is not False else None
 
     arg_names = sym.list_arguments()
 
diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py
index 96ecc21c81b..5f20d20c02a 100644
--- a/python/mxnet/gluon/nn/conv_layers.py
+++ b/python/mxnet/gluon/nn/conv_layers.py
@@ -153,6 +153,8 @@ def __repr__(self):
             s += ', groups={num_group}'
         if self.bias is None:
             s += ', bias=False'
+        if self.act:
+            s += ', {}'.format(self.act)
         s += ')'
         shape = self.weight.shape
         return s.format(name=self.__class__.__name__,
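With this change a fused activation shows up in the layer's repr. A quick sketch (the exact string may differ slightly between versions):

from mxnet.gluon import nn

layer = nn.Conv2D(channels=8, kernel_size=3, activation='relu')
print(layer)
# e.g. Conv2D(None -> 8, kernel_size=(3, 3), stride=(1, 1), Activation(relu))
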
diff --git a/python/mxnet/gluon/rnn/rnn_cell.py b/python/mxnet/gluon/rnn/rnn_cell.py
index 21cc8043154..557837c3fa5 100644
--- a/python/mxnet/gluon/rnn/rnn_cell.py
+++ b/python/mxnet/gluon/rnn/rnn_cell.py
@@ -252,14 +252,12 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N
     #pylint: disable=no-self-use
     def _get_activation(self, F, inputs, activation, **kwargs):
         """Get activation function. Convert if is string"""
-        if activation == 'tanh':
-            return F.tanh(inputs, **kwargs)
-        elif activation == 'sigmoid':
-            return F.sigmoid(inputs, **kwargs)
-        elif activation == 'relu':
-            return F.relu(inputs, **kwargs)
-        elif activation == 'softsign':
-            return F.softsign(inputs, **kwargs)
+        func = {'tanh': F.tanh,
+                'relu': F.relu,
+                'sigmoid': F.sigmoid,
+                'softsign': F.softsign}.get(activation)
+        if func:
+            return func(inputs, **kwargs)
         elif isinstance(activation, string_types):
             return F.Activation(inputs, act_type=activation, **kwargs)
         elif isinstance(activation, LeakyReLU):
diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py
index f04479d2371..d5a14a6859a 100644
--- a/python/mxnet/gluon/utils.py
+++ b/python/mxnet/gluon/utils.py
@@ -115,8 +115,23 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True):
     return [i.as_in_context(ctx) for i, ctx in zip(slices, ctx_list)]
 
 
-def clip_global_norm(arrays, max_norm):
+def clip_global_norm(arrays, max_norm, check_isfinite=True):
     """Rescales NDArrays so that the sum of their 2-norm is smaller than `max_norm`.
+
+    Parameters
+    ----------
+    arrays : list of NDArray
+    max_norm : float
+    check_isfinite : bool, default True
+         If True, check that the total_norm is finite (not nan or inf). This
+         requires a blocking .asscalar() call.
+
+    Returns
+    -------
+    NDArray or float
+      Total norm. Return type is NDArray of shape (1,) if check_isfinite is
+      False. Otherwise a float is returned.
+
     """
     def _norm(array):
         if array.stype == 'default':
@@ -126,15 +141,20 @@ def _norm(array):
     assert len(arrays) > 0
     ctx = arrays[0].context
     total_norm = ndarray.add_n(*[_norm(arr).as_in_context(ctx) for arr in arrays])
-    total_norm = ndarray.sqrt(total_norm).asscalar()
-    if not np.isfinite(total_norm):
-        warnings.warn(UserWarning('nan or inf is detected. Clipping results will be undefined.'),
-                      stacklevel=2)
+    total_norm = ndarray.sqrt(total_norm)
+    if check_isfinite:
+        if not np.isfinite(total_norm.asscalar()):
+            warnings.warn(
+                UserWarning('nan or inf is detected. '
+                            'Clipping results will be undefined.'), stacklevel=2)
     scale = max_norm / (total_norm + 1e-8)
-    if scale < 1.0:
-        for arr in arrays:
-            arr *= scale
-    return total_norm
+    scale = ndarray.min(ndarray.concat(scale, ndarray.ones(1, ctx=ctx), dim=0))
+    for arr in arrays:
+        arr *= scale.as_in_context(arr.context)
+    if check_isfinite:
+        return total_norm.asscalar()
+    else:
+        return total_norm
 
 
 def _indent(s_, numSpaces):
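The rewrite applies the clipping scale through NDArray ops in all cases, so callers can opt out of the blocking finiteness check. A small usage sketch based on the new docstring:

import mxnet as mx
from mxnet.gluon.utils import clip_global_norm

grads = [mx.nd.ones((2, 2)) * 10, mx.nd.ones((3,)) * 10]

total = clip_global_norm(grads, max_norm=1.0)      # float; blocks to check for nan/inf
total_nd = clip_global_norm(grads, max_norm=1.0,
                            check_isfinite=False)  # NDArray of shape (1,); non-blocking
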
diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index 63a44ab4643..caaa4006302 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -308,8 +308,7 @@ def _random_crop_proposal(self, label, height, width):
                 h -= 1
                 w = int(round(h * ratio))
                 area = w * h
-            if (area < min_area or area > max_area or w > width or h > height \
-                or w <= 0 or h <= 0):
+            if not (min_area <= area <= max_area and 0 < w <= width and 0 < h <= height):
                 continue
 
             y = random.randint(0, max(0, height - h))
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index 24f5309d136..791de4bebdd 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -72,12 +72,12 @@ def imread(filename, *args, **kwargs):
 
     Set `flag` parameter to 0 to get grayscale output
 
-    >>> mx.img.imdecode("flower.jpg", flag=0)
+    >>> mx.img.imread("flower.jpg", flag=0)
     <NDArray 224x224x1 @cpu(0)>
 
     Set `to_rgb` parameter to 0 to get output in OpenCV format (BGR)
 
-    >>> mx.img.imdecode(str_image, to_rgb=0)
+    >>> mx.img.imread("flower.jpg", to_rgb=0)
     <NDArray 224x224x3 @cpu(0)>
     """
     return _internal._cvimread(filename, *args, **kwargs)
diff --git a/python/mxnet/lr_scheduler.py b/python/mxnet/lr_scheduler.py
index 963560d1785..436085620a2 100644
--- a/python/mxnet/lr_scheduler.py
+++ b/python/mxnet/lr_scheduler.py
@@ -17,6 +17,7 @@
 
 """Scheduling learning rate."""
 import logging
+from math import cos, pi
 
 class LRScheduler(object):
     """Base class of a learning rate scheduler.
@@ -28,9 +29,41 @@ class LRScheduler(object):
     ----------
     base_lr : float, optional
         The initial learning rate.
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """
-    def __init__(self, base_lr=0.01):
+    def __init__(self, base_lr=0.01,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
         self.base_lr = base_lr
+        assert isinstance(warmup_steps, int)
+        self.warmup_steps = warmup_steps
+
+        self.warmup_final_lr = base_lr
+        self.warmup_begin_lr = warmup_begin_lr
+        if self.warmup_begin_lr > self.warmup_final_lr:
+            raise ValueError("Base lr has to be higher than warmup_begin_lr")
+        if self.warmup_steps < 0:
+            raise ValueError("Warmup steps has to be positive or 0")
+        if warmup_mode not in ['linear', 'constant']:
+            raise ValueError("Supports only linear and constant modes of warmup")
+        self.warmup_mode = warmup_mode
+
+    def get_warmup_lr(self, num_update):
+        assert num_update < self.warmup_steps
+        if self.warmup_mode == 'linear':
+            increase = (self.warmup_final_lr - self.warmup_begin_lr) \
+                       * float(num_update) / float(self.warmup_steps)
+            return self.warmup_begin_lr + increase
+        elif self.warmup_mode == 'constant':
+            return self.warmup_begin_lr
+        else:
+            raise ValueError("Invalid warmup mode %s"%self.warmup_mode)
 
     def __call__(self, num_update):
         """Return a new learning rate.
@@ -66,8 +99,9 @@ class FactorScheduler(LRScheduler):
     stop_factor_lr : float, optional
         Stop updating the learning rate if it is less than this value.
     """
-    def __init__(self, step, factor=1, stop_factor_lr=1e-8):
-        super(FactorScheduler, self).__init__()
+    def __init__(self, step, factor=1, stop_factor_lr=1e-8, base_lr=0.01,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(FactorScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
         if step < 1:
             raise ValueError("Schedule step must be greater or equal than 1 round")
         if factor > 1.0:
@@ -78,6 +112,9 @@ def __init__(self, step, factor=1, stop_factor_lr=1e-8):
         self.count = 0
 
     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+
         # NOTE: use while rather than if  (for continuing training via load_epoch)
         while num_update > self.count + self.step:
             self.count += self.step
@@ -108,9 +145,19 @@ class MultiFactorScheduler(LRScheduler):
         The list of steps to schedule a change
     factor: float
         The factor to change the learning rate.
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """
-    def __init__(self, step, factor=1):
-        super(MultiFactorScheduler, self).__init__()
+    def __init__(self, step, factor=1, base_lr=0.01, warmup_steps=0, warmup_begin_lr=0,
+                 warmup_mode='linear'):
+        super(MultiFactorScheduler, self).__init__(base_lr, warmup_steps,
+                                                   warmup_begin_lr, warmup_mode)
         assert isinstance(step, list) and len(step) >= 1
         for i, _step in enumerate(step):
             if i != 0 and step[i] <= step[i-1]:
@@ -125,6 +172,9 @@ def __init__(self, step, factor=1):
         self.count = 0
 
     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+
         # NOTE: use while rather than if  (for continuing training via load_epoch)
         while self.cur_step_ind <= len(self.step)-1:
             if num_update > self.step[self.cur_step_ind]:
@@ -138,33 +188,94 @@ def __call__(self, num_update):
         return self.base_lr
 
 class PolyScheduler(LRScheduler):
-    """ Reduce the learning rate by given a list of steps.
+    """ Reduce the learning rate according to a polynomial of given power.
 
-    Calculate the new learning rate by::
+    Calculate the new learning rate, after warmup if any, by::
 
-       base_lr * (1-nup/max_nup)^pwr
+       final_lr + (start_lr - final_lr) * (1-nup/max_nup)^pwr
        if nup < max_nup, 0 otherwise.
 
     Parameters
     ----------
-       max_update: maximum number of updates before the decay reaches 0.
-       base_lr:    base learning rate
-       pwr:   power of the decay term as a funtion of the current number of updates.
-
+        max_update: int
+            maximum number of updates before the decay reaches final learning rate.
+        base_lr: float
+            base learning rate to start from
+        pwr:   int
+            power of the decay term as a function of the current number of updates.
+        final_lr:   float
+            final learning rate after all steps
+        warmup_steps: int
+            number of warmup steps used before this scheduler starts decay
+        warmup_begin_lr: float
+            if using warmup, the learning rate from which it starts warming up
+        warmup_mode: string
+            warmup can be done in two modes.
+            'linear' mode gradually increases lr with each step in equal increments
+            'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """
 
-    def __init__(self, max_update, base_lr=0.01, pwr=2):
-        super(PolyScheduler, self).__init__(base_lr)
+    def __init__(self, max_update, base_lr=0.01, pwr=2, final_lr=0,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(PolyScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
         assert isinstance(max_update, int)
         if max_update < 1:
             raise ValueError("maximum number of updates must be strictly positive")
+        self.power = pwr
         self.base_lr_orig = self.base_lr
         self.max_update = max_update
-        self.power = pwr
-        self.base_lr = self.base_lr_orig
+        self.final_lr = final_lr
+        self.max_steps = self.max_update - self.warmup_steps
+
+    def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+        if num_update <= self.max_update:
+            self.base_lr = self.final_lr + (self.base_lr_orig - self.final_lr) * \
+                pow(1 - float(num_update - self.warmup_steps) / float(self.max_steps), self.power)
+        return self.base_lr
+
+class CosineScheduler(LRScheduler):
+    """ Reduce the learning rate according to a cosine function
+
+    Calculate the new learning rate by::
+
+       final_lr + (start_lr - final_lr) * (1+cos(pi * nup/max_nup))/2
+       if nup < max_nup, 0 otherwise.
+
+    Parameters
+    ----------
+        max_update: int
+            maximum number of updates before the decay reaches 0
+        base_lr: float
+            base learning rate
+        final_lr: float
+            final learning rate after all steps
+        warmup_steps: int
+            number of warmup steps used before this scheduler starts decay
+        warmup_begin_lr: float
+            if using warmup, the learning rate from which it starts warming up
+        warmup_mode: string
+            warmup can be done in two modes.
+            'linear' mode gradually increases lr with each step in equal increments
+            'constant' mode keeps lr at warmup_begin_lr for warmup_steps
+    """
+
+    def __init__(self, max_update, base_lr=0.01, final_lr=0,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(CosineScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
+        assert isinstance(max_update, int)
+        if max_update < 1:
+            raise ValueError("maximum number of updates must be strictly positive")
+        self.base_lr_orig = base_lr
+        self.max_update = max_update
+        self.final_lr = final_lr
+        self.max_steps = self.max_update - self.warmup_steps
 
     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
         if num_update <= self.max_update:
-            self.base_lr = self.base_lr_orig * pow(1.0 - float(num_update) / float(self.max_update),
-                                                   self.power)
+            self.base_lr = self.final_lr + (self.base_lr_orig - self.final_lr) * \
+                (1 + cos(pi * (num_update - self.warmup_steps) / self.max_steps)) / 2
         return self.base_lr
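A worked sketch of the new schedulers; the printed values follow directly from the formulas above:

from mxnet.lr_scheduler import PolyScheduler, CosineScheduler

poly = PolyScheduler(max_update=100, base_lr=0.1, pwr=2, final_lr=0.01)
print(poly(0))    # 0.1
print(poly(50))   # 0.01 + 0.09 * (1 - 50/100)**2 = 0.0325
print(poly(100))  # 0.01

# five linear warmup steps from 0.0 to 0.1, then cosine decay down to 0.01
cosine = CosineScheduler(max_update=100, base_lr=0.1, final_lr=0.01,
                         warmup_steps=5, warmup_begin_lr=0.0)
print(cosine(0))    # 0.0   (warmup begins)
print(cosine(5))    # 0.1   (warmup done, decay begins)
print(cosine(100))  # 0.01  (cosine term reaches -1)
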
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 3a50553a615..2666f8bbcd4 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -132,7 +132,7 @@ def _update_params_on_kvstore_nccl(param_arrays, grad_arrays, kvstore, param_nam
     size = len(valid_grad_arrays)
     start = 0
     # Use aggregation by default only with NCCL
-    default_batch = 16
+    default_batch = '16'
     batch = int(os.getenv('MXNET_UPDATE_AGGREGATION_SIZE', default_batch))
     while start < size:
         end = start + batch if start + batch < size else size
@@ -378,7 +378,6 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                 _multiple_callbacks(eval_end_callback, eval_end_params)
             eval_data.reset()
     # end of all epochs
-    return
 
 
 def save_checkpoint(prefix, epoch, symbol, arg_params, aux_params):
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index 46b21a90d4c..d6d619f30ca 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -2475,7 +2475,7 @@ def moveaxis(tensor, source, destination):
 
 
 # pylint: disable= no-member, protected-access, too-many-arguments, redefined-outer-name
-def arange(start, stop=None, step=1.0, repeat=1, ctx=None, dtype=mx_real_t):
+def arange(start, stop=None, step=1.0, repeat=1, infer_range=False, ctx=None, dtype=mx_real_t):
     """Returns evenly spaced values within a given interval.
 
     Values are generated within the half-open interval [`start`, `stop`). In other
@@ -2519,7 +2519,7 @@ def arange(start, stop=None, step=1.0, repeat=1, ctx=None, dtype=mx_real_t):
     if ctx is None:
         ctx = current_context()
     return _internal._arange(start=start, stop=stop, step=step, repeat=repeat,
-                             dtype=dtype, ctx=str(ctx))
+                             infer_range=infer_range, dtype=dtype, ctx=str(ctx))
 # pylint: enable= no-member, protected-access, too-many-arguments
 
 
diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py
index 48d5c01fb43..3b19a772411 100644
--- a/python/mxnet/ndarray/register.py
+++ b/python/mxnet/ndarray/register.py
@@ -26,6 +26,7 @@
 from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null # pylint: disable=unused-import
 
 
+# pylint: disable=too-many-locals
 def _generate_ndarray_function_code(handle, name, func_name, signature_only=False):
     """Generate function for ndarray op by handle and function name."""
     real_name = ctypes.c_char_p()
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 5f6cbd6b6e1..4864ce99163 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -1285,6 +1285,7 @@ def _get_ndarray_inputs(arg_key, args, arg_names, allow_missing):
             raise TypeError('Only accept list of NDArrays or dict of str to NDArray')
         return c_array(NDArrayHandle, arg_handles), arg_arrays
 
+    # pylint: disable=too-many-locals
     def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
                     group2ctx=None, shared_arg_names=None, shared_exec=None,
                     shared_buffer=None, **kwargs):
@@ -2886,7 +2887,7 @@ def full(shape, val, dtype=None, **kwargs):
     return _internal._full(shape=shape, dtype=dtype, value=float(val), **kwargs)
 
 # pylint: disable=redefined-outer-name
-def arange(start, stop=None, step=1.0, repeat=1, name=None, dtype=None):
+def arange(start, stop=None, step=1.0, repeat=1, infer_range=False, name=None, dtype=None):
     """Returns evenly spaced values within a given interval.
 
     Parameters
@@ -2911,7 +2912,7 @@ def arange(start, stop=None, step=1.0, repeat=1, name=None, dtype=None):
     if dtype is None:
         dtype = _numpy.float32
     return _internal._arange(start=start, stop=stop, step=step, repeat=repeat,
-                             name=name, dtype=dtype)
+                             infer_range=infer_range, name=name, dtype=dtype)
 
 def histogram(a, bins=10, range=None, **kwargs):
     """Compute the histogram of the input data.
diff --git a/python/mxnet/util.py b/python/mxnet/util.py
index 57bc2bf7638..62c05d25282 100644
--- a/python/mxnet/util.py
+++ b/python/mxnet/util.py
@@ -27,4 +27,4 @@ def makedirs(d):
         from distutils.dir_util import mkpath
         mkpath(d)
     else:
-        os.makedirs(d, exist_ok=True)
+        os.makedirs(d, exist_ok=True)  # pylint: disable=unexpected-keyword-arg
diff --git a/python/setup.py b/python/setup.py
index add5e6681fe..91563539822 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -20,6 +20,8 @@
 from __future__ import absolute_import
 import os
 import sys
+
+from setuptools import find_packages
 # need to use distutils.core for correct placement of cython dll
 kwargs = {}
 if "--inplace" in sys.argv:
@@ -29,7 +31,6 @@
     from setuptools import setup
     from setuptools.extension import Extension
     kwargs = {'install_requires': ['numpy<=1.15.0,>=1.8.2', 'requests<2.19.0,>=2.18.4', 'graphviz<0.9.0,>=0.8.1'], 'zip_safe': False}
-from setuptools import find_packages
 
 with_cython = False
 if '--with-cython' in sys.argv:
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
index 548c30b73a1..8b5e1e01095 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
@@ -407,11 +407,10 @@ object NDArray extends NDArrayBase {
    * @param dType The data type of the `NDArray`. The default datatype is `DType.Float32`.
    * @return NDArray of evenly spaced values in the specified range.
    */
-  def arange(start: Float, stop: Option[Float] = None, step: Float = 1.0f,
-    repeat: Int = 1, ctx: Context = Context.defaultCtx,
-    dType: DType = Base.MX_REAL_TYPE): NDArray = {
-    val params = Map("start" -> start, "step" -> step,
-      "repeat" -> repeat, "ctx" -> ctx.toString, "dtype" -> dType.toString())
+  def arange(start: Float, stop: Option[Float], step: Float,
+             repeat: Int, ctx: Context, dType: DType): NDArray = {
+    val params = Map("start" -> start, "step" -> step, "repeat" -> repeat,
+      "infer_range" -> false, "ctx" -> ctx.toString, "dtype" -> dType.toString())
     val fParams = if (stop == None) params else params ++ Map("stop" -> stop.get)
     NDArray.genericNDArrayFunctionInvoke("_arange", Seq(), fParams)(0)
   }
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayCollector.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayCollector.scala
index 3952b73cfb0..0b7f9af705f 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayCollector.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayCollector.scala
@@ -133,6 +133,10 @@ class NDArrayCollector private(private val autoDispose: Boolean = true,
    * If the return type of scope is <em>NDArray</em> or <em>NDArrayFuncReturn</em>,
    * it is smart enough NOT to collect or dispose the returned NDArray. <br />
    * However in other cases, it is users' responsibility NOT to leak allocated NDArrays outside.
+   * <br />
+   * We might switch to a try-with-resources statement (via AutoCloseable in Java 1.7+)
+   * and deprecate this method later; thus it is marked as Experimental.
+   *
    * @param codeBlock code block to be executed within the scope.
    * @tparam T return type of the function <em>codeBlock</em>.
    * @return The result of function <em>codeBlock</em>.
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
index 194d3681523..e3e1a320358 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
@@ -955,9 +955,28 @@ object Symbol extends SymbolBase {
    * @return Symbol The created Symbol.
    */
   def arange(start: Float, stop: Option[Float] = None, step: Float = 1.0f,
-    repeat: Int = 1, name: String = null, dType: DType = Base.MX_REAL_TYPE): Symbol = {
-    val params = Map("start" -> start, "step" -> step,
-      "repeat" -> repeat, "dtype" -> dType.toString())
+             repeat: Int = 1, name: String = null, dType: DType = Base.MX_REAL_TYPE): Symbol = {
+    arange(start, stop, step, repeat, infer_range = false, name, dType)
+  }
+
+  /**
+   * Returns evenly spaced values within a given interval.
+   * The stop value can be inferred from the output shape,
+   * which must be known from the rest of the net.
+   * @param start Start of interval. The default start value is 0.
+   * @param stop End of interval.
+   * @param step Spacing between values. The default step size is 1.
+   * @param repeat Number of times to repeat each element. The default repeat count is 1.
+   * @param infer_range Infer the stop value from output shape
+   * @param name Name of the resulting Symbol.
+   * @param dType The data type of the `Symbol`. The default datatype is `DType.Float32`.
+   * @return Symbol of evenly spaced values in the specified range.
+   */
+  def arange(start: Float, stop: Option[Float], step: Float,
+             repeat: Int, infer_range: Boolean, name: String,
+             dType: DType): Symbol = {
+    val params = Map("start" -> start, "step" -> step, "repeat" -> repeat,
+      "infer_range" -> infer_range, "dtype" -> dType.toString())
     val fParams = if (stop == None) params else params ++ Map("stop" -> stop.get)
     createSymbolGeneral("_arange", name, null, Array.empty[Symbol], fParams)
   }
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala b/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala
index 147d651fb04..d63194d48bc 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/annotation/Experimental.scala
@@ -21,7 +21,7 @@ import java.lang.annotation.{ElementType, Retention, Target, _}
 
 /**
  * Experimental: there is a comparatively high chance that
-  * the API will undergo some kind of changes
+  * the API will be changed or removed.
   */
 @Retention(RetentionPolicy.RUNTIME)
 @Target(Array(ElementType.TYPE, ElementType.FIELD, ElementType.METHOD, ElementType.PARAMETER,
diff --git a/tests/travis/r_vignettes.R b/src/c_api/.clang-tidy
similarity index 84%
rename from tests/travis/r_vignettes.R
rename to src/c_api/.clang-tidy
index 1b03b8bba4e..2af4b0d7f52 100644
--- a/tests/travis/r_vignettes.R
+++ b/src/c_api/.clang-tidy
@@ -15,7 +15,5 @@
 # specific language governing permissions and limitations
 # under the License.
 
-fnames <- list.files("R-package/vignettes/", pattern="*.Rmd")
-sapply(fnames, function(x){
-	knitr::purl(paste0("R-package/vignettes/", x))
-	})
\ No newline at end of file
+# Disable most clang-tidy checks in the c_api folder.
+Checks: -*,readability-non-const-parameter
diff --git a/src/c_api/c_api_test.cc b/src/c_api/c_api_test.cc
new file mode 100644
index 00000000000..623faa71adc
--- /dev/null
+++ b/src/c_api/c_api_test.cc
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file c_api_test.cc
+ * \brief C API of mxnet for ease of testing backend in Python
+ */
+#include <mxnet/c_api_test.h>
+#include <nnvm/pass.h>
+#include "./c_api_common.h"
+#include "../operator/subgraph/subgraph_property.h"
+
+int MXPartitionGraphByOpNames(SymbolHandle sym_handle,
+                              const char* prop_name,
+                              const mx_uint num_ops,
+                              const char** op_names,
+                              SymbolHandle* ret_sym_handle) {
+  nnvm::Symbol* s = new nnvm::Symbol();
+  API_BEGIN();
+  std::unordered_set<std::string> op_name_set;
+  for (size_t i = 0; i < num_ops; ++i) {
+    op_name_set.emplace(op_names[i]);
+  }
+  nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(sym_handle);
+  *s = sym->Copy();
+  nnvm::Graph g;
+  g.outputs = s->outputs;
+  if (!op_name_set.empty()) {
+    mxnet::op::SubgraphPropertyPtr property
+        = mxnet::op::SubgraphPropertyRegistry::Get()->CreateSubgraphProperty(prop_name);
+    property->SetAttr("op_names", op_name_set);
+    g.attrs["subgraph_property"] = std::make_shared<nnvm::any>(std::move(property));
+  }
+  g = nnvm::ApplyPass(std::move(g), "PartitionGraph");
+  s->outputs = g.outputs;
+  *ret_sym_handle = s;
+  API_END_HANDLE_ERROR(delete s);
+}
+
+int MXSetSubgraphPropertyOpNames(const char* prop_name,
+                                 const mx_uint num_ops,
+                                 const char** op_names) {
+  API_BEGIN();
+  std::unordered_set<std::string> op_name_set;
+  for (size_t i = 0; i < num_ops; ++i) {
+    op_name_set.emplace(op_names[i]);
+  }
+  (*mxnet::op::SubgraphPropertyOpNameSet::Get())[prop_name] = op_name_set;
+  API_END();
+}
+
+int MXRemoveSubgraphPropertyOpNames(const char* prop_name) {
+  API_BEGIN();
+  mxnet::op::SubgraphPropertyOpNameSet::Get()->erase(prop_name);
+  API_END();
+}
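A hedged ctypes sketch of driving this test API from Python; the property name 'default' and the chosen ops are assumptions for illustration, and the call requires a build that compiles this file:

import ctypes
import mxnet as mx
from mxnet.base import _LIB, SymbolHandle, c_str, c_str_array, check_call, mx_uint

data = mx.sym.var('data')
net = mx.sym.exp(mx.sym.sin(data))

op_names = ['sin', 'exp']
out = SymbolHandle()
check_call(_LIB.MXPartitionGraphByOpNames(
    net.handle, c_str('default'), mx_uint(len(op_names)),
    c_str_array(op_names), ctypes.byref(out)))
partitioned = mx.sym.Symbol(out)  # matched ops grouped into subgraph nodes
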
diff --git a/src/engine/engine_impl.h b/src/engine/engine_impl.h
index b3ec34dc857..f15141f4e7a 100644
--- a/src/engine/engine_impl.h
+++ b/src/engine/engine_impl.h
@@ -33,20 +33,6 @@
 namespace mxnet {
 namespace engine {
 
-/*! \brief base class of engine variables, used for type checking */
-struct Var {
-#if ENGINE_DEBUG
-  virtual ~Var() = default;
-#endif  // ENGINE_DEBUG
-  /*!
-   * \brief cast variable to derived type T
-   * \tparam T the type we want to cast into.
-   * \return A casted variable.
-   */
-  template <typename T>
-  inline T* Cast();
-};  // struct Var
-
 /*! \brief base class of engine operators, used for type checking */
 struct Opr {
 #if ENGINE_DEBUG
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index 8196af2de2f..daff5306694 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -28,10 +28,24 @@
 #include "./engine_impl.h"
 #include "../profiler/profiler.h"
 #include "./openmp.h"
+#include "../common/object_pool.h"
 
 namespace mxnet {
 namespace engine {
 
+/*!
+ * \brief var used in Naive Engine for tracking the version
+ * of the objects it is associated with.
+ */
+class NaiveVar final
+    : public Var, public common::ObjectPoolAllocatable<NaiveVar> {
+ public:
+  inline static NaiveVar* CastFromBase(Var* ptr) {
+    return ptr->Cast<NaiveVar>();
+  }
+};  // class NaiveVar
+
+
 // implement naive engine
 class NaiveEngine final : public Engine {
  public:
@@ -71,8 +85,7 @@ class NaiveEngine final : public Engine {
 
   // new variables
   VarHandle NewVariable() override {
-    size_t v = ++counter_;
-    return reinterpret_cast<VarHandle>(v);
+    return NaiveVar::New();
   }
 
   OprHandle NewOperator(AsyncFn fn,
@@ -146,6 +159,10 @@ class NaiveEngine final : public Engine {
       opr->opr_profile.reset(new profiler::ProfileOperator(opr->opr_name, attrs.release()));
       opr->opr_profile->start(exec_ctx.dev_type, exec_ctx.dev_id);
     }
+    // increment mutable var version
+    for (auto var : mutable_vars) {
+      ++var->version_;
+    }
     if (exec_ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
       size_t dev_id = static_cast<size_t>(exec_ctx.dev_id);
@@ -171,8 +188,12 @@ class NaiveEngine final : public Engine {
   }
 
   void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) override {
-    this->PushSync(delete_fn, exec_ctx, {}, {var},
-                   FnProperty::kNormal, 0, "DeleteVariable");
+    NaiveVar* naive_var = NaiveVar::CastFromBase(var);
+    this->PushAsync([delete_fn, naive_var](RunContext ctx, CallbackOnComplete on_complete) mutable {
+        delete_fn(ctx);
+        NaiveVar::Delete(naive_var);
+        on_complete();
+      }, exec_ctx, {}, {var}, FnProperty::kDeleteVar, 0, "DeleteVariable");
   }
 
   void WaitForVar(VarHandle var) override {
@@ -192,8 +213,6 @@ class NaiveEngine final : public Engine {
   }
   // whether action is completed
   bool req_completed_;
-  // counter
-  std::atomic<size_t> counter_{0};
   /*! \brief whether it is during shutdown phase*/
   std::atomic<bool> shutdown_phase_{false};
   // CPU stream
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index e70cc197c0c..3a7587fef13 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -130,6 +130,9 @@ inline bool ThreadedVar::CompleteWriteDependency(Dispatcher dispatcher) {
     assert(pending_write_ != nullptr);
     CHECK_EQ(num_pending_reads_, kWriteTriggered);
 
+    // increment version number
+    ++version_;
+
     // really delete
     if (to_delete_) {
       VersionedVarBlock *head = pending_write_->next;
@@ -164,7 +167,7 @@ inline bool ThreadedVar::CompleteWriteDependency(Dispatcher dispatcher) {
   }
   // This is outside of lock scope
   // Be very carful, pending_write_ and num_pending_reads_
-  // can change now, do not reply ont the two variables.
+  // can change now, do not rely on these two variables.
   // The linked list \in [old_pending_write, end_of_read_chain)
   // is already detached from this Var.
   // So it is safe to modify these
@@ -196,6 +199,11 @@ inline bool ThreadedVar::ready_to_read() {
   return this->is_ready_to_read();
 }
 
+inline size_t ThreadedVar::version() {
+  std::lock_guard<std::mutex> lock{mutex_};
+  return this->version_;
+}
+
 // implementation of threaded engine
 ThreadedVar* ThreadedEngine::NewVariable() {
   return ThreadedVar::New(VersionedVarBlock::New());
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 428f0d8c554..a2c1a2b943a 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -162,6 +162,7 @@ class ThreadedVar final
   inline void SetToDelete();
   /*! \return whether this variable is ready to read. */
   inline bool ready_to_read();
+  inline size_t version() override;
   /*!
    * \brief Cast a Var pointer to ThreadedVar pointer
    * \param ptr pointer from base.
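
The version() accessor added here takes the same mutex that guards the var's dependency bookkeeping, so readers never observe a half-updated counter. A self-contained sketch of the pattern, assuming nothing beyond the standard library:

    #include <cstddef>
    #include <mutex>

    class VersionedVar {
     public:
      void CompleteWrite() {                // mirrors CompleteWriteDependency
        std::lock_guard<std::mutex> lock{mutex_};
        ++version_;
      }
      size_t version() {                    // consistent read under the same lock
        std::lock_guard<std::mutex> lock{mutex_};
        return version_;
      }

     private:
      std::mutex mutex_;
      size_t version_ = 0;
    };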
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index c011c1d9ce0..0e415ef5112 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -159,6 +159,9 @@ class StatefulComputeExExecutor : public OpExecutor {
     op_ctx.run_ctx = rctx;
 #if MXNET_USE_MKLDNN == 1
     InvalidateOutputs(out_array, req);
+    CreateDefaultInputs(in_array, &in_array_fallback);
+    fcompute_(state_, op_ctx, in_array_fallback, req, out_array);
+    return;
 #endif
     fcompute_(state_, op_ctx, in_array, req, out_array);
   }
@@ -226,6 +229,13 @@ class FComputeExExecutor : public OpExecutor {
     op_ctx.run_ctx = rctx;
 #if MXNET_USE_MKLDNN == 1
     InvalidateOutputs(out_array, req);
+    // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented
+    const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
+    if (!is_mkldnn.get(attrs_.op, false)) {
+      CreateDefaultInputs(in_array, &in_array_fallback);
+      fcompute_(attrs_, op_ctx, in_array_fallback, req, out_array);
+      return;
+    }
 #endif
     fcompute_(attrs_, op_ctx, in_array, req, out_array);
   }
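
The executor hunks above gate the MKLDNN execution path on a per-operator boolean attribute: operators that have not been ported get their inputs reordered to the default layout and fall back to the plain compute function. A hedged sketch of that attribute-gated dispatch, with a plain map standing in for nnvm's Op::GetAttr<bool>("TIsMKLDNN"):

    #include <string>
    #include <unordered_map>

    // Hypothetical registry; the real lookup goes through nnvm op attributes.
    static const std::unordered_map<std::string, bool> kIsMKLDNN = {
      {"Activation", true}, {"Convolution", true}};

    bool NeedsDefaultInputFallback(const std::string& op_name) {
      auto it = kIsMKLDNN.find(op_name);
      const bool is_mkldnn = it != kIsMKLDNN.end() && it->second;
      return !is_mkldnn;  // unported ops run on default-layout inputs
    }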
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index cd1db0ac194..52f7c790c77 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -86,6 +86,10 @@ class OpExecutor {
   virtual OpStatePtr state() const {
     return OpStatePtr();
   }
+
+  // TODO(alexzai): (MXNET-856) Remove instance member after subgraph feature added
+ protected:
+  std::vector<NDArray> in_array_fallback;
 };
 
 /*!
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 32b14b8e963..265554ab391 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -33,6 +33,7 @@
 #include "../profiler/profiler.h"
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
+#include "../operator/subgraph/subgraph_property.h"
 
 namespace mxnet {
 namespace exec {
@@ -42,6 +43,7 @@ using namespace mxnet::common;
 GraphExecutor::GraphExecutor() {
   log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false);
   need_grad_ = false;
+  subgraph_property_ = dmlc::GetEnv("MXNET_SUBGRAPH_BACKEND", std::string());
 }
 
 GraphExecutor::~GraphExecutor() {
@@ -1428,6 +1430,146 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start,
     iter->c_str());
   return ret;
 }
+
+// Infer shapes, dtypes, stypes, contexts for the forward graph
+static nnvm::Graph InferForwardAttrs(nnvm::Graph g,
+                                     nnvm::ShapeVector arg_shapes,
+                                     nnvm::DTypeVector arg_dtypes,
+                                     StorageTypeVector arg_stypes,
+                                     const Context& default_ctx,
+                                     const std::map<std::string, Context>& ctx_map,
+                                     const std::vector<Context>& in_arg_ctxes,
+                                     const std::vector<Context>& aux_state_ctxes) {
+  const auto& indexed_graph = g.indexed_graph();
+  const auto num_forward_inputs = indexed_graph.input_nodes().size();
+  g = AssignContext(g, default_ctx, ctx_map, in_arg_ctxes, {},
+                   aux_state_ctxes, {}, num_forward_inputs, g.outputs.size());
+  g = InferShape(std::move(g), std::move(arg_shapes), "__shape__");
+  if (g.GetAttr<size_t>("shape_num_unknown_nodes") != 0U) {
+    HandleInferShapeError(num_forward_inputs, indexed_graph,
+                          g.GetAttr<nnvm::ShapeVector>("shape"));
+  }
+  g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__");
+  if (g.GetAttr<size_t>("dtype_num_unknown_nodes") != 0U) {
+    HandleInferTypeError(num_forward_inputs, indexed_graph,
+                         g.GetAttr<nnvm::DTypeVector>("dtype"));
+  }
+  g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__");
+  if (g.GetAttr<size_t>("storage_type_num_unknown_nodes") != 0U) {
+    HandleInferStorageTypeError(num_forward_inputs, indexed_graph,
+                                g.GetAttr<StorageTypeVector>("storage_type"));
+  }
+  return g;
+}
+
+// Given input attr arrays, partition the graph using the backend name equal to prop_name.
+// This is a common function for bind and simple_bind flows.
+static nnvm::Symbol PartitionGraph(const nnvm::Symbol& src,
+                                   const std::string& prop_name,
+                                   const nnvm::ShapeVector& arg_shapes,
+                                   const nnvm::DTypeVector& arg_dtypes,
+                                   const StorageTypeVector& arg_stypes,
+                                   const Context& default_ctx,
+                                   const std::map<std::string, Context>& ctx_map,
+                                   const std::vector<Context>& in_arg_ctxes,
+                                   const std::vector<Context>& aux_state_ctxes) {
+  auto subgraph_prop = op::SubgraphPropertyRegistry::Get()->CreateSubgraphProperty(prop_name);
+  nnvm::Symbol ret = src.Copy();
+  nnvm::Graph g;
+  g.outputs = ret.outputs;
+  g = InferForwardAttrs(g, arg_shapes, arg_dtypes, arg_stypes, default_ctx,
+                        ctx_map, in_arg_ctxes, aux_state_ctxes);
+  subgraph_prop->SetAttr("graph", g);
+  auto it = op::SubgraphPropertyOpNameSet::Get()->find(prop_name);
+  // assign an op name set to the subgraph property if one has been provided by users
+  if (it != op::SubgraphPropertyOpNameSet::Get()->end()) {
+    LOG(INFO) << "SubgraphPropertyOpNameSet for subgraph property " << prop_name
+              << " has been assigned a value. Please make sure it is initialized"
+                 " only for the testing purpose.";
+    subgraph_prop->SetAttr("op_names", it->second);
+  }
+  g.attrs["subgraph_property"] = std::make_shared<nnvm::any>(std::move(subgraph_prop));
+  g = ApplyPass(std::move(g), "PartitionGraph");
+  ret.outputs = g.outputs;
+  return ret;
+}
+
+// Given input attr dicts, partition the graph using the backend name equal to prop_name.
+// This is for simple_bind flow.
+static nnvm::Symbol PartitionGraph(const nnvm::Symbol& src,
+                                   const std::string& prop_name,
+                                   const std::unordered_map<std::string, TShape>& arg_shape_map,
+                                   const std::unordered_map<std::string, int>& arg_dtype_map,
+                                   const std::unordered_map<std::string, int>& arg_stype_map,
+                                   const Context& default_ctx,
+                                   const std::map<std::string, Context>& ctx_map,
+                                   const std::vector<Context>& in_arg_ctxes,
+                                   const std::vector<Context>& aux_state_ctxes) {
+  const std::vector<std::string> input_names = src.ListInputNames(Symbol::kAll);
+  nnvm::ShapeVector arg_shapes(input_names.size(), TShape());
+  nnvm::DTypeVector arg_dtypes(input_names.size(), -1);
+  StorageTypeVector arg_stypes(input_names.size(), kUndefinedStorage);
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    auto it1 = arg_shape_map.find(input_names[i]);
+    if (arg_shape_map.end() != it1) {
+      arg_shapes[i] = it1->second;
+    }
+    auto it2 = arg_dtype_map.find(input_names[i]);
+    if (arg_dtype_map.end() != it2) {
+      arg_dtypes[i] = it2->second;
+    }
+    auto it3 = arg_stype_map.find(input_names[i]);
+    if (arg_stype_map.end() != it3) {
+      arg_stypes[i] = it3->second;
+    }
+  }
+  return PartitionGraph(src, prop_name, arg_shapes, arg_dtypes, arg_stypes,
+                        default_ctx, ctx_map, in_arg_ctxes, aux_state_ctxes);
+}
+
+// Given input ndarrays, partition the graph using the backend name equal to prop_name.
+// This is for bind flow.
+static nnvm::Symbol PartitionGraph(const nnvm::Symbol& src,
+                                   const std::string& prop_name,
+                                   const std::vector<NDArray> &in_args,
+                                   const std::vector<NDArray> &aux_states,
+                                   const Context& default_ctx,
+                                   const std::map<std::string, Context>& ctx_map) {
+  const std::vector<std::string> input_names = src.ListInputNames(Symbol::kAll);
+  const std::vector<std::string> arg_names = src.ListInputNames(nnvm::Symbol::kReadOnlyArgs);
+  const std::vector<std::string> aux_names = src.ListInputNames(nnvm::Symbol::kAuxiliaryStates);
+  CHECK_EQ(arg_names.size(), in_args.size());
+  CHECK_EQ(aux_names.size(), aux_states.size());
+  nnvm::ShapeVector arg_shapes;  // all input shapes
+  arg_shapes.reserve(input_names.size());
+  nnvm::DTypeVector arg_dtypes;  // all input dtypes
+  arg_dtypes.reserve(input_names.size());
+  StorageTypeVector arg_stypes;  // all input stypes
+  arg_stypes.reserve(input_names.size());
+  std::vector<Context> in_arg_ctxes(in_args.size());
+  std::vector<Context> aux_state_ctxes(aux_states.size());
+
+  size_t i1 = 0, i2 = 0;
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    if (i2 < aux_names.size() && aux_names[i2] == input_names[i]) {
+      arg_shapes.push_back(aux_states[i2].shape());
+      arg_dtypes.push_back(aux_states[i2].dtype());
+      arg_stypes.push_back(aux_states[i2].storage_type());
+      aux_state_ctxes[i2] = aux_states[i2].ctx();
+      ++i2;
+    } else {
+      CHECK(i1 < arg_names.size());
+      CHECK_EQ(arg_names[i1], input_names[i]);
+      arg_shapes.push_back(in_args[i1].shape());
+      arg_dtypes.push_back(in_args[i1].dtype());
+      arg_stypes.push_back(in_args[i1].storage_type());
+      in_arg_ctxes[i1] = in_args[i1].ctx();
+      ++i1;
+    }
+  }
+  return PartitionGraph(src, prop_name, arg_shapes, arg_dtypes, arg_stypes,
+                        default_ctx, ctx_map, in_arg_ctxes, aux_state_ctxes);
+}
 }  // namespace exec
 
 Executor *Executor::SimpleBind(nnvm::Symbol symbol,
@@ -1447,6 +1589,11 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
                                std::unordered_map<std::string, NDArray>* shared_buffer,
                                Executor* shared_exec) {
   auto exec = new exec::GraphExecutor();
+  if (!exec->subgraph_property().empty()) {
+    symbol = exec::PartitionGraph(symbol, exec->subgraph_property(), arg_shape_map, arg_dtype_map,
+                                  arg_stype_map, default_ctx, group2ctx, in_arg_ctxes,
+                                  aux_state_ctxes);
+  }
   exec->Init(symbol, default_ctx, group2ctx,
              in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes,
              arg_shape_map, arg_dtype_map, arg_stype_map,
@@ -1465,6 +1612,10 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
                          const std::vector<NDArray> &aux_states,
                          Executor* shared_exec) {
   auto exec = new exec::GraphExecutor();
+  if (!exec->subgraph_property().empty()) {
+    symbol = exec::PartitionGraph(symbol, exec->subgraph_property(), in_args, aux_states,
+                                  default_ctx, group2ctx);
+  }
   exec->Init(symbol, default_ctx, group2ctx,
              in_args, arg_grad_store, grad_req_type, aux_states,
              reinterpret_cast<Executor*>(shared_exec));
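
Both Bind and SimpleBind now consult GraphExecutor::subgraph_property(), populated from the MXNET_SUBGRAPH_BACKEND environment variable, and partition the symbol before Init. A minimal sketch of that env-var gate, using std::getenv in place of dmlc::GetEnv:

    #include <cstdlib>
    #include <string>

    std::string SubgraphBackend() {
      const char* v = std::getenv("MXNET_SUBGRAPH_BACKEND");
      return v == nullptr ? std::string() : std::string(v);
    }

    // In the executor factory:
    //   if (!SubgraphBackend().empty()) { /* run PartitionGraph first */ }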
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index 7b936c30025..b94bb437778 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -117,6 +117,8 @@ class GraphExecutor : public Executor {
                     std::vector<NDArray>* arg_grads,
                     std::vector<NDArray>* aux_states) override;
 
+  const std::string& subgraph_property() const { return subgraph_property_; }
+
  protected:
   friend class mxnet::Imperative;
   // Information about operational node
@@ -256,6 +258,8 @@ class GraphExecutor : public Executor {
   std::unordered_set<std::string> cached_seg_opr_names_;
   // verbose logging
   bool log_verbose_ = false;
+  // subgraph property name
+  std::string subgraph_property_;
 };
 
 }  // namespace exec
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 34cab3037ce..61370a5bfaf 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -459,6 +459,7 @@ class CommDevice : public Comm {
   void Init(int key, const NDArrayStorageType stype, const TShape& shape,
             int dtype = mshadow::kFloat32) override {
     sorted_key_attrs_.emplace_back(key, shape, dtype);
+    inited_ = false;
   }
 
   void InitBuffersAndComm(const std::vector<NDArray>& src) {
@@ -701,8 +702,10 @@ class CommDevice : public Comm {
       }
       // Delayed allocation - as the dense merged buffer might not be used at all if push()
       // only sees sparse arrays
-      bool delay_alloc = true;
-      buf.merged = NDArray(shape, ctx, delay_alloc, type);
+      if (buf.merged.is_none()) {
+        bool delay_alloc = true;
+        buf.merged = NDArray(shape, ctx, delay_alloc, type);
+      }
       ctx_info[ctx.dev_id].second += shape.Size();
     }
     inited_ = true;
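
The comm.h change makes buffer setup idempotent: Init marks the comm as uninitialized again, and the merged buffer is only allocated if it does not already exist, so re-initialization cannot clobber a live buffer. A generic sketch of the guard, assuming a toy buffer type:

    #include <cstddef>
    #include <memory>
    #include <vector>

    struct MergeBuf {
      std::shared_ptr<std::vector<float>> merged;  // lazily allocated
    };

    void EnsureMerged(MergeBuf* buf, size_t size) {
      if (buf->merged == nullptr) {                // allocate exactly once
        buf->merged = std::make_shared<std::vector<float>>(size, 0.0f);
      }
    }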
diff --git a/src/operator/bilinear_sampler-inl.h b/src/operator/bilinear_sampler-inl.h
index 657aebafdb7..e0b4db7b367 100644
--- a/src/operator/bilinear_sampler-inl.h
+++ b/src/operator/bilinear_sampler-inl.h
@@ -44,7 +44,10 @@ enum BilinearSamplerOpOutputs {kOut, kTmp};
 }
 
 struct BilinearSamplerParam : public dmlc::Parameter<BilinearSamplerParam> {
+  dmlc::optional<bool> cudnn_off;
   DMLC_DECLARE_PARAMETER(BilinearSamplerParam) {
+    DMLC_DECLARE_FIELD(cudnn_off).set_default(dmlc::optional<bool>())
+        .describe("whether to turn cudnn off");
   }
 };
 
diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu
index 0ab628da700..e1f205258a2 100644
--- a/src/operator/bilinear_sampler.cu
+++ b/src/operator/bilinear_sampler.cu
@@ -212,7 +212,11 @@ Operator* CreateOp<gpu>(BilinearSamplerParam param, int dtype) {
   Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new CuDNNBilinearSamplerOp<DType>(param);
+    if (param.cudnn_off.has_value() && param.cudnn_off.value()) {
+      op = new BilinearSamplerOp<gpu, DType>(param);
+    } else {
+      op = new CuDNNBilinearSamplerOp<DType>(param);
+    }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
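
The new cudnn_off parameter is deliberately tri-state: unset (use cudnn when available), true (force the native GPU kernel), or false. A sketch of that dispatch, with std::optional standing in for dmlc::optional:

    #include <optional>

    bool UseCudnn(const std::optional<bool>& cudnn_off) {
      // Only an explicit cudnn_off=true disables cudnn; unset keeps the default.
      return !(cudnn_off.has_value() && cudnn_off.value());
    }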
diff --git a/src/operator/contrib/ctc_loss-inl.h b/src/operator/contrib/ctc_loss-inl.h
index 72209ae286c..9380be47451 100644
--- a/src/operator/contrib/ctc_loss-inl.h
+++ b/src/operator/contrib/ctc_loss-inl.h
@@ -409,7 +409,8 @@ class CTCLossOp : public Operator {
 
     // since the input is activation before softmax and cudnn ctc takes softmax
     // apply softmax to inputs first.
-    mxnet_op::Softmax<mxnet_op::softmax_fwd>(s, data.dptr_, prob.dptr_, data.shape_, 2, 1.0);
+    mxnet_op::Softmax<mxnet_op::softmax_fwd, false>(
+      s, data.dptr_, prob.dptr_, data.shape_, 2, 1.0);
 
     CUDNN_CALL(cudnnCTCLoss(s->dnn_handle_,
                             prob_desc_,
@@ -426,8 +427,8 @@ class CTCLossOp : public Operator {
                             workspace_bytes));
 
     if (req_grad) {
-      mxnet_op::SoftmaxGrad<mshadow_op::mul, mxnet_op::softmax_bwd, kWriteTo>(s,
-          prob.dptr_, grad.dptr_, grad.dptr_, data.shape_, 2, 1.0);
+      mxnet_op::SoftmaxGrad<mshadow_op::mul, mxnet_op::softmax_bwd, kWriteTo, false>(
+        s, prob.dptr_, grad.dptr_, grad.dptr_, data.shape_, 2, 1.0);
       Assign(grad, mxnet::kWriteInplace, grad * alphabet_size);
     }
   }
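
These ctc_loss call sites pin the new compile-time `negate` template parameter (introduced in softmax-inl.h further down) to false, preserving the old behavior. Because the flag is a template argument, the sign flip is resolved at compile time rather than branching per element; a tiny sketch of the idiom:

    #include <cstddef>

    template <bool negate>
    inline float ReadInput(const float* in, size_t i) {
      return negate ? -in[i] : in[i];   // folded away at compile time
    }

    // ReadInput<false>(data, j) compiles to a plain load, exactly like the
    // Softmax<OP, false> instantiation above.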
diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc
index d6b6703ddd5..ba7f5c0ad8b 100644
--- a/src/operator/control_flow.cc
+++ b/src/operator/control_flow.cc
@@ -314,7 +314,6 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs,
 
   // For the shape of output data.
   size_t len = in_shape->at(0)[0];
-  CHECK_GT(len, 0);
   for (int i = 0; i < params.num_out_data; i++) {
    // If the output shape isn't inferred, we don't need to propagate the info.
     const auto& g_out_shape = subg_out_shape[i];
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 277ca8e3013..ba44ebd4ed4 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -31,6 +31,8 @@
 #include "./mkldnn/mkldnn_base-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
 #endif  // MXNET_USE_MKLDNN
+#include "../operator_common.h"
+#include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
@@ -101,6 +103,7 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
 }
 #endif
 
+#if MXNET_USE_MKLDNN == 1
 inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
                                          const int dev_mask,
                                          DispatchMode* dispatch_mode,
@@ -108,20 +111,9 @@ inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
                                          std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    *dispatch_mode = DispatchMode::kFComputeFallback;
-    return ret;
-  }
-#endif
-  return ret;
+  return MKLDNNStorageType(attrs, dev_mask, SupportMKLDNNAct(param),
+                           dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
@@ -129,46 +121,17 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           DispatchMode* dispatch_mode,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
-  bool ret = false;
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
   if (param.act_type != activation::kReLU) {
     CHECK_EQ(in_attrs->size(), 3U);
-    ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
-                                                         dispatch_mode,
-                                                         in_attrs, out_attrs);
   } else {
     // for ReLU activation, the backward pass only needs ograd and output
     CHECK_EQ(in_attrs->size(), 2U);
-    ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
-                                                         dispatch_mode,
-                                                         in_attrs, out_attrs);
-  }
-#else
-  if (param.act_type == activation::kSoftSign) {
-    CHECK_EQ(in_attrs->size(), 3U);
-    ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
-                                                         dispatch_mode,
-                                                         in_attrs, out_attrs);
-  } else {
-    CHECK_EQ(in_attrs->size(), 2U);
-    ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
-                                                         dispatch_mode,
-                                                         in_attrs, out_attrs);
   }
-#endif
-  CHECK_EQ(out_attrs->size(), 1U);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    *dispatch_mode = DispatchMode::kFComputeFallback;
-    return ret;
-  }
-#endif
-  return ret;
+  return MKLDNNStorageType(attrs, dev_mask, SupportMKLDNNAct(param),
+                           dispatch_mode, in_attrs, out_attrs);
 }
+#endif
 
 MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
@@ -183,13 +146,16 @@ The following activation functions are supported:
 
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<ActivationParam>)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", ActivationStorageType)
+#endif
 .set_attr<nnvm::FListOutputNames>("FListOutputNames",
     [](const NodeAttrs& attrs) {
     return std::vector<std::string>{"output"};
 })
 .set_attr<FCompute>("FCompute<cpu>", ActivationCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", ActivationComputeExCPU)
 #endif
 .set_attr<nnvm::FGradient>("FGradient", ActivationGrad{"_backward_Activation"})
@@ -204,7 +170,9 @@ NNVM_REGISTER_OP(_backward_Activation)
   })
 .set_num_outputs(1)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
+#endif
 .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
@@ -217,6 +185,7 @@ NNVM_REGISTER_OP(_backward_Activation)
 #endif
 .set_attr_parser(ParamParser<ActivationParam>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", ActivationGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", ActivationGradCompute<cpu>);
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index c7b1b609990..4ea494d64e4 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -27,6 +27,7 @@
 #include "batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
 #include "../elemwise_op_common.h"
+#include "../operator_common.h"
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_batch_norm-inl.h"
 #endif
@@ -544,7 +545,7 @@ Both *mean* and *var* returns a scalar by treating the input as a vector.
 
 Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
 have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and
-the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these 
+the inverse of ``data_var``, which are needed for the backward pass. Note that the gradients of these
 two outputs are blocked.
 
 Besides the inputs and the outputs, this operator accepts two auxiliary
@@ -600,6 +601,7 @@ the sparse tensors will fallback.
 #endif
 .set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
@@ -632,6 +634,7 @@ NNVM_REGISTER_OP(_backward_BatchNorm)
 #endif
 .set_attr_parser(ParamParser<BatchNormParam>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
index 9df459e9224..ac8a814ce70 100644
--- a/src/operator/nn/concat.cc
+++ b/src/operator/nn/concat.cc
@@ -367,6 +367,7 @@ Example::
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
+.set_attr<bool>("TIsMKLDNN", true)
 #endif
 CONCAT_FORWARD_ATTRS
 .set_attr<nnvm::FInferShape>("FInferShape", ConcatShape)
@@ -387,6 +388,7 @@ NNVM_REGISTER_OP(_backward_Concat)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FInferStorageType>("FInferStorageType", BackwardConcatStorageType)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", ConcatGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", ConcatGradCompute<cpu>);
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 18c0132023d..d5abe629123 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -26,11 +26,14 @@
 
 #include "./convolution-inl.h"
 #include "../elemwise_op_common.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
+#include "../operator_common.h"
 #if MXNET_USE_NNPACK == 1
 #include "../nnpack/nnpack_pooling-inl.h"
 #endif  // MXNET_USE_NNPACK
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_base-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
@@ -288,27 +291,19 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+#if MXNET_USE_MKLDNN == 1
 inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
                                    const int dev_mask,
                                    DispatchMode* dispatch_mode,
-                                   std::vector<int> *in_attrs,
-                                   std::vector<int> *out_attrs) {
+                                   std::vector<int>* in_attrs,
+                                   std::vector<int>* out_attrs) {
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   uint32_t in_expected = param.no_bias ? 2 : 3;
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), 1);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
-    wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
 
 inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
@@ -322,18 +317,10 @@ inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), out_expected);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
-    wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
+#endif
 
 void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
   using namespace mshadow;
@@ -492,9 +479,12 @@ There are other options to tune the performance.
 })
 .set_attr<nnvm::FInferShape>("FInferShape", ConvolutionShape)
 .set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
+#endif
 .set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
 #endif
 .set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
@@ -512,12 +502,15 @@ NNVM_REGISTER_OP(_backward_Convolution)
   return params.no_bias ? 2 : 3;
 })
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
+#endif
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
 .set_attr_parser(ConvolutionParamParser)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>);
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 54b77aafda0..1ab391d92b0 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -25,8 +25,12 @@
 */
 
 #include "./deconvolution-inl.h"
+#include "../operator_common.h"
+#include "../../common/utils.h"
+#if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_ops-inl.h"
 #include "./mkldnn/mkldnn_base-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -256,6 +260,7 @@ static bool DeconvolutionType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+#if MXNET_USE_MKLDNN == 1
 inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
                                      const int dev_mask,
                                      DispatchMode* dispatch_mode,
@@ -266,17 +271,8 @@ inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), in_expected);
   CHECK_EQ(out_attrs->size(), 1);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
-    wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
 
 inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
@@ -289,20 +285,10 @@ inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U);
   CHECK_EQ(out_attrs->size(), out_expected);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
-    wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
 
-#if MXNET_USE_MKLDNN == 1
 static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
                                       const OpContext& ctx,
                                       const std::vector<NDArray>& inputs,
@@ -419,12 +405,15 @@ NNVM_REGISTER_OP(Deconvolution)
 })
 .set_attr<nnvm::FInferShape>("FInferShape", DeconvolutionShape)
 .set_attr<nnvm::FInferType>("FInferType", DeconvolutionType)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", DeconvStorageType)
+#endif
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
 .set_attr<FCompute>("FCompute<cpu>", DeconvolutionCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionComputeExCPU)
 #endif
 .set_attr<nnvm::FGradient>("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
@@ -440,12 +429,15 @@ NNVM_REGISTER_OP(_backward_Deconvolution)
   return params.no_bias ? 2 : 3;
 })
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardDeconvStorageType)
+#endif
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
 .set_attr_parser(DeconvolutionParamParser)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", DeconvolutionGradCompute<cpu>);
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index 2338f8974aa..2b75419d2a8 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -61,6 +61,11 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
     DMLC_DECLARE_FIELD(flatten).set_default(true)
     .describe("Whether to collapse all but the first axis of the input data tensor.");
   }
+  bool operator==(const FullyConnectedParam& other) const {
+    return this->num_hidden == other.num_hidden &&
+           this->no_bias == other.no_bias &&
+           this->flatten == other.flatten;
+  }
 };
 
 template<typename xpu, typename DType>
@@ -228,4 +233,16 @@ void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
 
 }  // namespace op
 }  // namespace mxnet
+namespace std {
+template<>
+struct hash<mxnet::op::FullyConnectedParam> {
+  size_t operator()(const mxnet::op::FullyConnectedParam& val) const {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.num_hidden);
+    ret = dmlc::HashCombine(ret, val.no_bias);
+    ret = dmlc::HashCombine(ret, val.flatten);
+    return ret;
+  }
+};
+}  // namespace std
 #endif  // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_
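
operator== and the std::hash specialization above exist so a FullyConnectedParam can serve as part of a key in the unordered_map that caches MKLDNN forward primitives later in this PR. A self-contained sketch of the same pattern on a toy struct, with a boost-style combiner in place of dmlc::HashCombine:

    #include <cstddef>
    #include <functional>
    #include <unordered_map>

    struct ToyParam {
      int num_hidden;
      bool no_bias;
      bool operator==(const ToyParam& o) const {
        return num_hidden == o.num_hidden && no_bias == o.no_bias;
      }
    };

    namespace std {
    template <>
    struct hash<ToyParam> {
      size_t operator()(const ToyParam& p) const {
        size_t h = hash<int>()(p.num_hidden);
        h ^= hash<bool>()(p.no_bias) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
      }
    };
    }  // namespace std

    // std::unordered_map<ToyParam, int> cache;  // now well-formed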
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index eb881d29abd..d8a32f0ae96 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -290,6 +290,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
     return std::vector<std::string>{"output"};
 })
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
@@ -322,6 +323,7 @@ NNVM_REGISTER_OP(_backward_FullyConnected)
 .set_attr<FInferStorageType>("FInferStorageType", BackwardFCStorageType)
 .set_attr_parser(ParamParser<FullyConnectedParam>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", FullyConnectedGradCompute<cpu>);
diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc
index 4433519df81..a428eb1e4fa 100644
--- a/src/operator/nn/lrn.cc
+++ b/src/operator/nn/lrn.cc
@@ -28,6 +28,7 @@
 #include "../operator_common.h"
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_lrn-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #endif
 
 namespace mxnet {
@@ -81,24 +82,16 @@ struct LRNGrad {
   }
 };
 
+#if MXNET_USE_MKLDNN == 1
 bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                 const int dev_mask,
                                 DispatchMode* dispatch_mode,
                                 std::vector<int> *in_attrs,
                                 std::vector<int> *out_attrs) {
   CHECK(!in_attrs->empty());
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeFallback);
-  } else if (dev_mask == mshadow::cpu::kDevMask) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#endif
-  storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                      dispatch_mode, DispatchMode::kFCompute);
-  return true;
+
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
 
 bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
@@ -107,20 +100,11 @@ bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                  std::vector<int> *in_attrs,
                                  std::vector<int> *out_attrs) {
   CHECK(!in_attrs->empty());
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeFallback);
-  } else if (dev_mask == mshadow::cpu::kDevMask) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#endif
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                      dispatch_mode, DispatchMode::kFCompute);
+
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
 
-#if MXNET_USE_MKLDNN == 1
 void LRNComputeExCPU(const nnvm::NodeAttrs &attrs,
                      const OpContext &ctx,
                      const std::vector<NDArray> &inputs,
@@ -183,7 +167,9 @@ number of kernels in the layer.
 .set_attr_parser(ParamParser<LRNParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", LRNShape)
 .set_attr<nnvm::FInferType>("FInferType", LRNType)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", LRNForwardInferStorageType)
+#endif
 .set_attr<nnvm::FListInputNames>("FListInputNames",
     [](const NodeAttrs& attrs) {
   return std::vector<std::string>{"data"};
@@ -194,6 +180,7 @@ number of kernels in the layer.
 })
 .set_attr<FCompute>("FCompute<cpu>", LRNCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", LRNComputeExCPU)
 #endif
 .set_attr<nnvm::FGradient>("FGradient", LRNGrad{"_backward_LRN"})
@@ -203,9 +190,12 @@ number of kernels in the layer.
 NNVM_REGISTER_OP(_backward_LRN)
 .set_num_outputs(1)
 .set_attr_parser(ParamParser<LRNParam>)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", LRNBackwardInferStorageType)
+#endif
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", LRNGradComputeExCPU)
// Native compute requires norm while MKLDNN does not, so the two cannot be compared in debug mode
 .set_attr<bool>("TExcludeMKLDNNDebug", true)
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 273afcd32dc..6eb90f845d3 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -356,6 +356,18 @@ static inline void InvalidateOutputs(const std::vector<NDArray> &arrs,
   }
 }
 
+// TODO(alexzai): (MXNET-856) Remove helper function after subgraph feature added
+static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
+                                       std::vector<NDArray> *out_arrs) {
+  out_arrs->clear();
+  for (size_t i = 0; i < arrs.size(); ++i) {
+    if (arrs[i].IsMKLDNNData())
+      out_arrs->push_back(arrs[i].Reorder2Default());
+    else
+      out_arrs->push_back(arrs[i]);
+  }
+}
+
 const mkldnn::memory *GetWeights(const NDArray &arr,
                                  const mkldnn::memory::primitive_desc &target_pd,
                                  int num_groups);
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 27c574deae5..f3facd966aa 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -536,7 +536,9 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
 
   DispatchMode wanted_mode;
 #if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn)
+  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
+    wanted_mode = DispatchMode::kFComputeFallback;
+  else if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn)
     wanted_mode = DispatchMode::kFComputeEx;
   else
 #endif
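
After this fix, MKLDNNStorageType resolves the dispatch mode in a strict order: the MKLDNNEnvSet() kill switch wins, then per-op MKLDNN support, then the plain FCompute path. Condensed into a standalone function for clarity (hypothetical enum; the real code writes into dispatch_mode):

    enum class Mode { kFComputeFallback, kFComputeEx, kFCompute };

    Mode ChooseDispatch(bool on_cpu, bool env_enabled, bool support_mkldnn) {
      if (on_cpu && !env_enabled) return Mode::kFComputeFallback;  // kill switch first
      if (on_cpu && support_mkldnn) return Mode::kFComputeEx;      // MKLDNN path
      return Mode::kFCompute;                                      // default kernels
    }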
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
index f86f8dbefa2..5f672cd51fd 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -82,6 +82,100 @@ inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWei
   }
 }
 
+class MKLDNNFullyConnectForward {
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weight;
+  std::shared_ptr<mkldnn::memory> out;
+  std::shared_ptr<mkldnn::memory> bias;
+  std::shared_ptr<mkldnn::inner_product_forward> ipFwd;
+
+ public:
+  mkldnn::inner_product_forward::primitive_desc ipFwd_pd;
+
+  MKLDNNFullyConnectForward(const FullyConnectedParam &param, bool is_train,
+                            const NDArray &data, const NDArray &weight,
+                            const NDArray *bias,
+                            const mkldnn::memory::desc &output)
+      : ipFwd_pd(GetIPFwd(data, weight, bias, output, is_train)) {}
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
+                 const mkldnn::memory *bias, const mkldnn::memory &output) {
+    if (this->data == nullptr)
+      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              ipFwd_pd.src_primitive_desc(), data.get_data_handle()));
+    else
+      this->data->set_data_handle(data.get_data_handle());
+
+    if (this->weight == nullptr)
+      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              ipFwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+    else
+      this->weight->set_data_handle(weight.get_data_handle());
+
+    if (this->out == nullptr)
+      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              ipFwd_pd.dst_primitive_desc(), output.get_data_handle()));
+    else
+      this->out->set_data_handle(output.get_data_handle());
+
+    if (bias != nullptr) {
+      if (this->bias == nullptr)
+        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                ipFwd_pd.bias_primitive_desc(), bias->get_data_handle()));
+      else
+        this->bias->set_data_handle(bias->get_data_handle());
+      if (this->ipFwd == nullptr)
+        this->ipFwd = std::shared_ptr<mkldnn::inner_product_forward>(
+            new mkldnn::inner_product_forward(
+                ipFwd_pd, mkldnn::primitive::at(*this->data),
+                mkldnn::primitive::at(*this->weight),
+                mkldnn::primitive::at(*this->bias), *this->out));
+    } else if (this->ipFwd == nullptr) {
+      this->ipFwd = std::shared_ptr<mkldnn::inner_product_forward>(
+          new mkldnn::inner_product_forward(
+              ipFwd_pd, mkldnn::primitive::at(*this->data),
+              mkldnn::primitive::at(*this->weight), *this->out));
+    }
+  }
+  const mkldnn::inner_product_forward &GetIpFwd() const {
+    return *ipFwd;
+  }
+};
+
+typedef ParamOpSign<FullyConnectedParam> MKLDNNFullyconSignature;
+
+static inline MKLDNNFullyConnectForward &GetFCFwd(
+    const nnvm::NodeAttrs &attrs, const NDArray &data, const NDArray &weight,
+    const NDArray *bias, const mkldnn::memory::desc &output,
+    const bool is_train) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<MKLDNNFullyconSignature,
+              MKLDNNFullyConnectForward, OpHash> fcFwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNFullyconSignature,
+              MKLDNNFullyConnectForward, OpHash> fcFwds;
+#endif
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  MKLDNNFullyconSignature key(param);
+  key.AddSign(data);
+  key.AddSign(weight);
+  key.AddSign(is_train);
+
+  if (bias)
+    key.AddSign(*bias);
+
+  auto it = fcFwds.find(key);
+  if (it == fcFwds.end()) {
+    MKLDNNFullyConnectForward fcFwd(param, is_train, data, weight, bias,
+                                    output);
+    auto ins_ret = fcFwds.insert(
+        std::pair<MKLDNNFullyconSignature, MKLDNNFullyConnectForward>(key, fcFwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
 void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                      const std::vector<NDArray> &in_data,
                      const std::vector<OpReqType> &req,
@@ -112,21 +206,21 @@ void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
     out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()),
       mkldnn::memory::format::any);
   }
-
-  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
-      param.no_bias ? nullptr : &in_data[fullc::kBias], out_md, ctx.is_train);
-  auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc());
-  auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc());
+  MKLDNNFullyConnectForward &FCFwd =
+      GetFCFwd(attrs, data, weight, param.no_bias ? nullptr : &in_data[fullc::kBias],
+               out_md, ctx.is_train);
+  auto data_mem = data.GetMKLDNNDataReorder(FCFwd.ipFwd_pd.src_primitive_desc());
+  auto weight_mem = weight.GetMKLDNNDataReorder(FCFwd.ipFwd_pd.weights_primitive_desc());
   auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut],
-      ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]);
-  if (param.no_bias) {
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(
-          ipFwd_pd, *data_mem, *weight_mem, *out_mem.second));
+      FCFwd.ipFwd_pd.dst_primitive_desc(), req[fullc::kOut], &data);
+  if (!param.no_bias) {
+    auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(
+        FCFwd.ipFwd_pd.bias_primitive_desc());
+    FCFwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
   } else {
-    auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc());
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd,
-          *data_mem, *weight_mem, *bias_mem, *out_mem.second));
+    FCFwd.SetNewMem(*data_mem, *weight_mem, nullptr, *out_mem.second);
   }
+  MKLDNNStream::Get()->RegisterPrim(FCFwd.GetIpFwd());
   CommitOutput(out_data[fullc::kOut], out_mem);
   MKLDNNStream::Get()->Submit();
 }
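
The larger refactor above replaces per-call construction of mkldnn::inner_product_forward with a thread-local cache keyed on the op signature; only the data handles are repointed on each invocation. A generic sketch of that cache shape (toy Primitive type, string key standing in for MKLDNNFullyconSignature):

    #include <string>
    #include <unordered_map>

    struct Primitive {};  // expensive to build, cheap to reuse

    Primitive& GetCachedPrimitive(const std::string& key) {
      static thread_local std::unordered_map<std::string, Primitive> cache;
      auto it = cache.find(key);
      if (it == cache.end()) {
        it = cache.emplace(key, Primitive{}).first;  // built once per thread/key
      }
      return it->second;
    }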
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 7cb14503b1c..c133b63623a 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -30,8 +30,8 @@
 #endif  // MXNET_USE_NNPACK
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_pooling-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #endif  // MXNET_USE_MKLDNN
-
 namespace mxnet {
 namespace op {
 
@@ -284,7 +284,6 @@ void PoolingGradComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
   }
   FallBackCompute(PoolingGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
 }
-#endif
 
 inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs,
                                       const int dev_mask,
@@ -292,21 +291,11 @@ inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs,
                                       std::vector<int> *in_attrs,
                                       std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 1);
-
-#if MXNET_USE_MKLDNN == 1
   const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeFallback);
-  } else if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                               dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#else
-  CHECK_EQ(out_attrs->size(), 1);
-#endif
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, DispatchMode::kFCompute);
+  bool support_mkldnn_pool = SupportMKLDNNPooling(param);
+
+  return MKLDNNStorageType(attrs, dev_mask, support_mkldnn_pool,
+                           dispatch_mode, in_attrs, out_attrs);
 }
 
 inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
@@ -317,21 +306,12 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
   const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
   CHECK_EQ(in_attrs->size(), GetNumBackInputs(param));
   CHECK_EQ(out_attrs->size(), 1);
+  bool support_mkldnn_pool = SupportMKLDNNPooling(param);
 
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet()) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                               dispatch_mode, DispatchMode::kFComputeFallback);
-  } else if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                               dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#else
-  CHECK_EQ(in_attrs->size(), 3);
-#endif
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, DispatchMode::kFCompute);
+  return MKLDNNStorageType(attrs, dev_mask, support_mkldnn_pool,
+                           dispatch_mode, in_attrs, out_attrs);
 }
+#endif
 
 DMLC_REGISTER_PARAMETER(PoolingParam);
 
@@ -408,11 +388,14 @@ For each window ``X``, the mathematical expression for Lp pooling is:
     return std::vector<std::string>{"output"};
 })
 .set_attr_parser(PoolingParamParser)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", PoolingStorageType)
+#endif
 .set_attr<nnvm::FInferType>("FInferType", PoolingType)
 .set_attr<nnvm::FInferShape>("FInferShape", PoolingShape)
 .set_attr<FCompute>("FCompute<cpu>", PoolingCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", PoolingComputeExCPU)
 #endif
 .set_attr<nnvm::FGradient>("FGradient",
@@ -437,11 +420,12 @@ NNVM_REGISTER_OP(_backward_Pooling)
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
-#endif
 .set_attr<FInferStorageType>("FInferStorageType",
                              BackwardPoolingStorageType)
+#endif
 .set_attr_parser(PoolingParamParser)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", PoolingGradComputeExCPU)
 #endif
 .set_attr<FCompute>("FCompute<cpu>", PoolingGradCompute<cpu>);
diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h
index 4a19db7c36b..c063e385f63 100644
--- a/src/operator/nn/softmax-inl.h
+++ b/src/operator/nn/softmax-inl.h
@@ -51,7 +51,7 @@ struct log_softmax_fwd {
 };
 
 
-template<typename OP, typename DType, int ndim>
+template<typename OP, bool negate, typename DType, int ndim>
 inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
                     Shape<ndim> shape, int axis, const DType temperature) {
   index_t M = shape[axis];
@@ -65,30 +65,37 @@ inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
   for (int i = 0; i < static_cast<int>(N); ++i) {
     index_t base = unravel_dot(i, sshape, stride);
 
-    DType mmax = in[base];
+    DType mmax = negate ? -in[base] : in[base];
+    DType val;
     for (index_t j = 1; j < M; ++j) {
-      if (mmax < in[base + j*sa]) mmax = in[base + j*sa];
+      val = negate ? -in[base + j*sa] : in[base + j*sa];
+      if (mmax < val) mmax = val;
     }
 
     DType sum = DType(0);
+    DType in_val;
    // By default temperature is 1.0, and only in reinforcement learning
     // users would set it to other values.
     // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
     if (temperature == 1.0) {
       for (index_t j = 0; j < M; ++j) {
-        sum += std::exp(in[base + j*sa] - mmax);
+        in_val = negate ? -in[base + j*sa] : in[base + j*sa];
+        sum += std::exp(in_val - mmax);
       }
 
       for (index_t j = 0; j < M; ++j) {
-        out[base + j*sa] = OP::Map(in[base + j*sa] - mmax, sum);
+        in_val = negate ? -in[base + j*sa] : in[base + j*sa];
+        out[base + j*sa] = OP::Map(in_val - mmax, sum);
       }
     } else {
       for (index_t j = 0; j < M; ++j) {
-        sum += std::exp((in[base + j*sa] - mmax)/temperature);
+        in_val = negate ? -in[base + j*sa] : in[base + j*sa];
+        sum += std::exp((in_val - mmax)/temperature);
       }
 
       for (index_t j = 0; j < M; ++j) {
-        out[base + j*sa] = OP::Map((in[base + j*sa] - mmax)/temperature, sum);
+        in_val = negate ? -in[base + j*sa] : in[base + j*sa];
+        out[base + j*sa] = OP::Map((in_val - mmax)/temperature, sum);
       }
     }
   }
@@ -111,7 +118,7 @@ struct log_softmax_bwd {
 };
 
 
-template<typename OP1, typename OP2, int Req, typename DType, int ndim>
+template<typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
 inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis,
                         const DType temperature) {
@@ -137,12 +144,16 @@ inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
     DType final_result;
     if (temperature == 1.0) {
       for (index_t j = 0; j < M; ++j) {
-        final_result = OP2::Map(ograd[base + j*sa], out[base + j*sa], sum);
+        final_result = negate ?
+                       -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) :
+                       OP2::Map(ograd[base + j*sa], out[base + j*sa], sum);
         KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
       }
     } else {
       for (index_t j = 0; j < M; ++j) {
-        final_result = OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature;
+        final_result = negate ?
+                       -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature :
+                       OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature;
         KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
       }
     }
@@ -151,7 +162,7 @@ inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
 
 
 #ifdef __CUDACC__
-template<int x_bits, typename OP, typename DType, int ndim>
+template<int x_bits, typename OP, bool negate, typename DType, int ndim>
 __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axis,
                                        Shape<ndim> sshape, Shape<ndim> stride,
                                        const double temperature) {
@@ -163,7 +174,7 @@ __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axi
 
   red::maximum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
-    red::maximum::Reduce(smem[x], in[base + i*sa]);
+    red::maximum::Reduce(smem[x], negate ? -in[base + i*sa] : in[base + i*sa]);
   }
   __syncthreads();
   cuda::Reduce1D<red::maximum, x_bits>(smem);
@@ -172,9 +183,11 @@ __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axi
   __syncthreads();
 
   red::sum::SetInitValue(smem[x]);
+  DType val;
   for (index_t i = x; i < M; i += x_size) {
-    red::sum::Reduce(smem[x], static_cast<DType>(expf((in[base + i*sa] - smax)/
-    static_cast<DType>(temperature))));
+    val = negate ? -in[base + i*sa] : in[base + i*sa];
+    red::sum::Reduce(
+      smem[x], static_cast<DType>(expf((val - smax) / static_cast<DType>(temperature))));
   }
   __syncthreads();
   cuda::Reduce1D<red::sum, x_bits>(smem);
@@ -183,11 +196,12 @@ __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axi
   __syncthreads();
 
   for (index_t i = x; i < M; i += x_size) {
-    out[base + i*sa] = OP::Map((in[base + i*sa] - smax)/static_cast<DType>(temperature), ssum);
+    val = negate ? -in[base + i*sa] : in[base + i*sa];
+    out[base + i*sa] = OP::Map((val - smax)/static_cast<DType>(temperature), ssum);
   }
 }
 
-template<typename OP, typename DType, int ndim>
+template<typename OP, bool negate, typename DType, int ndim>
 inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
                     Shape<ndim> shape, int axis, const double temperature) {
   const int x_bits = 7;
@@ -198,14 +212,14 @@ inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
 
-  softmax_compute_kernel<x_bits, OP, DType, ndim>
+  softmax_compute_kernel<x_bits, OP, negate, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       in, out, M, axis, sshape, stride, temperature);
   MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel);
 }
 
 
-template<int x_bits, typename OP1, typename OP2, int Req, typename DType, int ndim>
+template<int x_bits, typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
 __global__ void softmax_gradient_kernel(DType *out, DType *ograd, DType *igrad,
                                         index_t M, int axis, Shape<ndim> sshape,
                                         Shape<ndim> stride, const double temperature) {
@@ -228,13 +242,15 @@ __global__ void softmax_gradient_kernel(DType *out, DType *ograd, DType *igrad,
   DType final_result;
   for (index_t i = x; i < M; i += x_size) {
     final_result =
-      OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum) / static_cast<DType>(temperature);
-    KERNEL_ASSIGN(igrad[base + i*sa], Req, final_result);
+      negate ?
+      -OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum) :
+      OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum);
+    KERNEL_ASSIGN(igrad[base + i*sa], Req, final_result / static_cast<DType>(temperature));
   }
 }
 
 
-template<typename OP1, typename OP2, int Req, typename DType, int ndim>
+template<typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
 inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis,
                         const double temperature) {
@@ -246,7 +262,7 @@ inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
 
-  softmax_gradient_kernel<x_bits, OP1, OP2, Req, DType, ndim>
+  softmax_gradient_kernel<x_bits, OP1, OP2, Req, negate, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       out, ograd, igrad, M, axis, sshape, stride, temperature);
   MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_gradient_kernel);
@@ -267,7 +283,7 @@ struct SoftmaxParam : public dmlc::Parameter<SoftmaxParam> {
   }
 };
 
-template<typename xpu, typename OP>
+template<typename xpu, typename OP, bool negate = false>
 void SoftmaxCompute(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
                     const std::vector<TBlob>& inputs,
@@ -283,19 +299,19 @@ void SoftmaxCompute(const nnvm::NodeAttrs& attrs,
   TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (shape.ndim() == 2) {
-      Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
-              outputs[0].dptr<DType>(), shape.get<2>(), axis,
-              static_cast<DType>(temperature));
+      Softmax<OP, negate>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
+                          outputs[0].dptr<DType>(), shape.get<2>(), axis,
+                          static_cast<DType>(temperature));
     } else {
-      Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
-              outputs[0].dptr<DType>(), shape.get<3>(), axis,
-              static_cast<DType>(temperature));
+      Softmax<OP, negate>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
+                          outputs[0].dptr<DType>(), shape.get<3>(), axis,
+                          static_cast<DType>(temperature));
     }
   });
 }
 
 
-template<typename xpu, typename OP1, typename OP2>
+template<typename xpu, typename OP1, typename OP2, bool negate = false>
 void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
                         const std::vector<TBlob>& inputs,
@@ -311,13 +327,13 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs,
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
       if (shape.ndim() == 2) {
-        SoftmaxGrad<OP1, OP2, Req>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
-                                   inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
-                                   shape.get<2>(), axis, static_cast<DType>(temperature));
+        SoftmaxGrad<OP1, OP2, Req, negate>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
+                                           inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
+                                           shape.get<2>(), axis, static_cast<DType>(temperature));
       } else {
-        SoftmaxGrad<OP1, OP2, Req>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
-                                   inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
-                                   shape.get<3>(), axis, static_cast<DType>(temperature));
+        SoftmaxGrad<OP1, OP2, Req, negate>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
+                                           inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
+                                           shape.get<3>(), axis, static_cast<DType>(temperature));
       }
     });
   });
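
The changes to softmax-inl.h above thread a compile-time negate flag through
every CPU and GPU kernel so that softmin can reuse the softmax code paths via
the identity softmin(x) = softmax(-x). A minimal standalone sketch of the same
pattern (illustrative only; SoftmaxLike is not an MXNet function):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Toy 1-D softmax with a compile-time negate switch, mirroring the
    // pattern in this patch: SoftmaxLike<true> computes softmin(x) == softmax(-x).
    template <bool negate>
    std::vector<double> SoftmaxLike(const std::vector<double>& in,
                                    double temperature = 1.0) {
      std::vector<double> out(in.size());
      // Subtract the (possibly negated) max for numerical stability.
      double mmax = -std::numeric_limits<double>::infinity();
      for (double v : in) mmax = std::max(mmax, negate ? -v : v);
      double sum = 0.0;
      for (std::size_t i = 0; i < in.size(); ++i) {
        double v = negate ? -in[i] : in[i];
        out[i] = std::exp((v - mmax) / temperature);
        sum += out[i];
      }
      for (double& v : out) v /= sum;
      return out;
    }

    int main() {
      for (double v : SoftmaxLike<true>({1.0, 2.0, 3.0}))  // softmin
        std::printf("%.4f ", v);  // prints 0.6652 0.2447 0.0900
      std::printf("\n");
      return 0;
    }

Making negate a template parameter rather than a runtime argument lets the
compiler eliminate the dead branch inside each inner loop, so the existing
softmax instantiations pay nothing for the new feature.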
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index c58f382bbad..81e775cac52 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -25,8 +25,11 @@
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
 #include "../tensor/elemwise_binary_op.h"
+#include "../operator_common.h"
+#if MXNET_USE_MKLDNN == 1
 #include "mkldnn/mkldnn_base-inl.h"
 #include "mkldnn/mkldnn_ops-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -50,7 +53,6 @@ static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
   FallBackCompute(SoftmaxCompute<cpu, mxnet_op::softmax_fwd>, attrs, ctx,
                   inputs, req, outputs);
 }
-#endif
 
 inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
                                       const int dev_mask,
@@ -60,19 +62,10 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  // We only run MKLDNN op if it runs on CPU.
-  if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
-    wanted_mode = DispatchMode::kFComputeFallback;
-  else if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, static_cast<NDArrayStorageType>((*in_attrs)[0]),
-                             dispatch_mode, wanted_mode);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
+#endif
 
 MXNET_OPERATOR_REGISTER_UNARY(softmax)
 .describe(R"code(Applies the softmax function.
@@ -105,9 +98,10 @@ Example::
 })
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxComputeExCPU)
-#endif
 .set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
+#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmax"})
 .add_arguments(SoftmaxParam::__FIELDS__());
 
@@ -116,6 +110,45 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_softmax)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxGradCompute<cpu, op::mshadow_op::mul,
                                                         mxnet_op::softmax_bwd>);
 
+MXNET_OPERATOR_REGISTER_UNARY(softmin)
+.describe(R"code(Applies the softmin function.
+
+The resulting array contains elements in the range (0,1) and the elements along the given axis sum
+up to 1.
+
+.. math::
+   softmin(\mathbf{z}/t)_j = \frac{e^{-z_j/t}}{\sum_{k=1}^K e^{-z_k/t}}
+
+for :math:`j = 1, ..., K`
+
+t is the temperature parameter of the softmin function. By default, t equals 1.0.
+
+Example::
+
+  x = [[ 1.  2.  3.]
+       [ 3.  2.  1.]]
+
+  softmin(x,axis=0) = [[ 0.88079703,  0.5,  0.11920292],
+                       [ 0.11920292,  0.5,  0.88079703]]
+
+  softmin(x,axis=1) = [[ 0.66524094,  0.24472848,  0.09003057],
+                       [ 0.09003057,  0.24472848,  0.66524094]]
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<SoftmaxParam>)
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"output"};
+})
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd, true>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmin"})
+.add_arguments(SoftmaxParam::__FIELDS__());
+
+MXNET_OPERATOR_REGISTER_BINARY(_backward_softmin)
+.set_attr_parser(ParamParser<SoftmaxParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxGradCompute<cpu, op::mshadow_op::mul,
+                                                        mxnet_op::softmax_bwd, true>);
+
 MXNET_OPERATOR_REGISTER_UNARY(log_softmax)
 .describe(R"code(Computes the log softmax of the input.
 This is equivalent to computing softmax followed by log.
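
A quick arithmetic check of the softmin docstring example above (with t = 1),
for the row x = [1, 2, 3]:

.. math::
   e^{-1} + e^{-2} + e^{-3} \approx 0.3679 + 0.1353 + 0.0498 = 0.5530

   softmin(x) \approx [0.3679, 0.1353, 0.0498] / 0.5530 = [0.6652, 0.2447, 0.0900]

which matches the axis=1 rows of the example; the axis=0 column [1, 3] works
out the same way to [0.8808, 0.1192].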
diff --git a/src/operator/nn/softmax.cu b/src/operator/nn/softmax.cu
index 8274642c81b..254e726d5e2 100644
--- a/src/operator/nn/softmax.cu
+++ b/src/operator/nn/softmax.cu
@@ -35,6 +35,13 @@ NNVM_REGISTER_OP(_backward_softmax)
 .set_attr<FCompute>("FCompute<gpu>", SoftmaxGradCompute<gpu, op::mshadow_op::mul,
                                                         mxnet_op::softmax_bwd>);
 
+NNVM_REGISTER_OP(softmin)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxCompute<gpu, mxnet_op::softmax_fwd, true>);
+
+NNVM_REGISTER_OP(_backward_softmin)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxGradCompute<gpu, op::mshadow_op::mul,
+                                                        mxnet_op::softmax_bwd, true>);
+
 NNVM_REGISTER_OP(log_softmax)
 .set_attr<FCompute>("FCompute<gpu>", SoftmaxCompute<gpu, mxnet_op::log_softmax_fwd>);
 
diff --git a/src/operator/subgraph/common.h b/src/operator/subgraph/common.h
new file mode 100644
index 00000000000..22058d556e0
--- /dev/null
+++ b/src/operator/subgraph/common.h
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_OPERATOR_SUBGRAPH_COMMON_H_
+#define MXNET_OPERATOR_SUBGRAPH_COMMON_H_
+
+#include <string>
+#include <set>
+#include <vector>
+#include "../elemwise_op_common.h"
+#include "../../executor/exec_pass.h"
+
+namespace mxnet {
+namespace op {
+
+inline uint32_t DefaultSubgraphOpNumInputs(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& sym = *attrs.subgraphs[0];
+  return sym.ListInputNames(nnvm::Symbol::kAll).size();
+}
+
+inline uint32_t DefaultSubgraphOpNumOutputs(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& sym = *attrs.subgraphs[0];
+  return sym.ListOutputNames().size();
+}
+
+inline std::vector<std::string> DefaultSubgraphOpListInputs(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& sym = *attrs.subgraphs[0];
+  return sym.ListInputNames(nnvm::Symbol::kAll);
+}
+
+inline std::vector<std::string> DefaultSubgraphOpListOutputs(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& sym = *attrs.subgraphs[0];
+  return sym.ListOutputNames();
+}
+
+inline bool DefaultSubgraphOpShape(const nnvm::NodeAttrs& attrs,
+                                   std::vector<TShape> *in_shapes,
+                                   std::vector<TShape> *out_shapes) {
+  using namespace exec;
+  const nnvm::Symbol& subgraph_sym = *attrs.subgraphs[0];
+  nnvm::Graph g;
+  g.outputs = subgraph_sym.outputs;
+  const auto& idx_g = g.indexed_graph();
+  CHECK_EQ(idx_g.input_nodes().size(), in_shapes->size());
+  CHECK_EQ(idx_g.outputs().size(), out_shapes->size());
+
+  // Put the input and output shapes to the shape vector.
+  nnvm::ShapeVector shapes(idx_g.num_node_entries());
+  const auto &input_nids = idx_g.input_nodes();
+  CHECK_EQ(input_nids.size(), in_shapes->size());
+  for (size_t i = 0; i < in_shapes->size(); i++) {
+    auto eid = idx_g.entry_id(input_nids[i], 0);
+    shapes[eid] = in_shapes->at(i);
+  }
+  CHECK_EQ(g.outputs.size(), out_shapes->size());
+  for (size_t i = 0; i < out_shapes->size(); i++) {
+    auto eid = idx_g.entry_id(g.outputs[i]);
+    shapes[eid] = out_shapes->at(i);
+  }
+
+  // Infer shape of the graph.
+  g.attrs["shape"] = std::make_shared<dmlc::any>(std::move(shapes));
+  g = exec::InferShape(std::move(g));
+
+  // Copy the inferred shape back to the input shapes and the output shapes.
+  shapes = g.GetAttr<nnvm::ShapeVector>("shape");
+  // assign to in_shapes
+  for (size_t i = 0; i < in_shapes->size(); ++i) {
+    const auto eid = idx_g.entry_id(input_nids[i], 0);
+    SHAPE_ASSIGN_CHECK(*in_shapes, i, shapes[eid]);
+  }
+  // assign to out_shapes
+  for (size_t i = 0; i < g.outputs.size(); ++i) {
+    const auto eid = idx_g.entry_id(g.outputs[i]);
+    SHAPE_ASSIGN_CHECK(*out_shapes, i, shapes[eid]);
+  }
+  // Check if we have inferred the shapes correctly.
+  return g.GetAttr<size_t>("shape_num_unknown_nodes") == 0;
+}
+
+inline bool DefaultSubgraphOpType(const nnvm::NodeAttrs& attrs,
+                                  std::vector<int> *in_types,
+                                  std::vector<int> *out_types) {
+  const nnvm::Symbol& subgraph_sym = *attrs.subgraphs[0];
+  nnvm::Graph g;
+  g.outputs = subgraph_sym.outputs;
+  const auto& idx_g = g.indexed_graph();
+  CHECK_EQ(idx_g.input_nodes().size(), in_types->size());
+  CHECK_EQ(idx_g.outputs().size(), out_types->size());
+
+  // Put the input and output data types to the dtype vector.
+  nnvm::DTypeVector types(idx_g.num_node_entries(), -1);
+  const auto &input_nids = idx_g.input_nodes();
+  CHECK_EQ(input_nids.size(), in_types->size());
+  for (size_t i = 0; i < in_types->size(); i++) {
+    auto eid = idx_g.entry_id(input_nids[i], 0);
+    types[eid] = in_types->at(i);
+  }
+  CHECK_EQ(g.outputs.size(), out_types->size());
+  for (size_t i = 0; i < out_types->size(); i++) {
+    auto eid = idx_g.entry_id(g.outputs[i]);
+    types[eid] = out_types->at(i);
+  }
+
+  // Infer data type of the graph.
+  g.attrs["dtype"] = std::make_shared<dmlc::any>(std::move(types));
+  g = exec::InferType(std::move(g));
+
+  types = g.GetAttr<nnvm::DTypeVector>("dtype");
+  // assign to in_types
+  for (size_t i = 0; i < in_types->size(); ++i) {
+    const auto eid = idx_g.entry_id(input_nids[i], 0);
+    TYPE_ASSIGN_CHECK(*in_types, i, types[eid]);
+  }
+  // assign to out_types
+  for (size_t i = 0; i < g.outputs.size(); ++i) {
+    const auto eid = idx_g.entry_id(g.outputs[i]);
+    TYPE_ASSIGN_CHECK(*out_types, i, types[eid]);
+  }
+  // Check if we have inferred the dtypes correctly.
+  return g.GetAttr<size_t>("dtype_num_unknown_nodes") == 0;
+}
+
+inline bool DefaultSubgraphOpStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int>* in_stypes,
+                                         std::vector<int>* out_stypes) {
+  const nnvm::Symbol& subgraph_sym = *attrs.subgraphs[0];
+  nnvm::Graph g;
+  g.outputs = subgraph_sym.outputs;
+  const auto& idx_g = g.indexed_graph();
+  CHECK_EQ(idx_g.input_nodes().size(), in_stypes->size());
+  CHECK_EQ(idx_g.outputs().size(), out_stypes->size());
+  exec::DevMaskVector dev_masks(idx_g.num_node_entries(), dev_mask);
+
+  // Put the input and output storages to the storage vector.
+  StorageTypeVector stypes(idx_g.num_node_entries(), kUndefinedStorage);
+  const auto &input_nids = idx_g.input_nodes();
+  CHECK_EQ(input_nids.size(), in_stypes->size());
+  for (size_t i = 0; i < in_stypes->size(); i++) {
+    auto eid = idx_g.entry_id(input_nids[i], 0);
+    stypes[eid] = in_stypes->at(i);
+  }
+  CHECK_EQ(g.outputs.size(), out_stypes->size());
+  for (size_t i = 0; i < out_stypes->size(); i++) {
+    auto eid = idx_g.entry_id(g.outputs[i]);
+    stypes[eid] = out_stypes->at(i);
+  }
+
+  // Infer storage type of the graph.
+  bool dev_match = g.attrs.count("dev_mask") &&
+                   g.GetAttr<exec::DevMaskVector>("dev_mask") == dev_masks;
+  if (!dev_match) {
+    g.attrs["dev_mask"] = std::make_shared<dmlc::any>(std::move(dev_masks));
+  }
+  g.attrs["storage_type"] = std::make_shared<dmlc::any>(std::move(stypes));
+  g = exec::InferStorageType(std::move(g));
+
+  stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  // assign to in_types
+  for (size_t i = 0; i < in_stypes->size(); ++i) {
+    const auto eid = idx_g.entry_id(input_nids[i], 0);
+    STORAGE_TYPE_ASSIGN_CHECK(*in_stypes, i, stypes[eid]);
+  }
+
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  // assign to out_types
+  for (size_t i = 0; i < g.outputs.size(); ++i) {
+    const auto eid = idx_g.entry_id(g.outputs[i]);
+    STORAGE_TYPE_ASSIGN_CHECK(*out_stypes, i, stypes[eid]);
+  }
+  // Check if we have inferred the storages correctly.
+  return g.GetAttr<size_t>("storage_type_num_unknown_nodes") == 0;
+}
+
+inline ExecType DefaultSubgraphOpExecType(const nnvm::NodeAttrs& attrs) {
+  return ExecType::kSubgraphExec;
+}
+
+inline std::vector<uint32_t> DefaultSubgraphOpMutableInputs(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& subgraph_sym = *attrs.subgraphs[0];
+  const std::vector<std::string> input_names = subgraph_sym.ListInputNames(nnvm::Symbol::kAll);
+  const std::vector<std::string> immutable_input_names =
+    subgraph_sym.ListInputNames(nnvm::Symbol::kReadOnlyArgs);
+  const std::vector<std::string> mutable_input_names =
+    subgraph_sym.ListInputNames(nnvm::Symbol::kAuxiliaryStates);
+  CHECK_EQ(immutable_input_names.size() + mutable_input_names.size(), input_names.size());
+  std::vector<uint32_t> ret;
+  size_t i1 = 0, i2 = 0;
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    if (i1 < immutable_input_names.size() && input_names[i] == immutable_input_names[i1]) {
+      ++i1;
+    } else {
+      CHECK(i2 < mutable_input_names.size());
+      CHECK_EQ(input_names[i], mutable_input_names[i2]);
+      ++i2;
+      ret.push_back(i);
+    }
+  }
+  return ret;
+}
+
+inline std::vector<ResourceRequest> DefaultSubgraphOpResourceRequest(const nnvm::NodeAttrs& attrs) {
+  const nnvm::Symbol& subgraph_sym = *attrs.subgraphs[0];
+  static auto& fresource = Op::GetAttr<FResourceRequest>("FResourceRequest");
+  std::set<ResourceRequest::Type> resource_types;
+  DFSVisit(subgraph_sym.outputs, [&](const nnvm::NodePtr& node) {
+    if (!node->is_variable() && fresource.count(node->op())) {
+      for (ResourceRequest& r : fresource[node->op()](node->attrs)){
+        resource_types.insert(r.type);
+      }
+    }
+  });
+  return std::vector<ResourceRequest>(resource_types.begin(), resource_types.end());
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_SUBGRAPH_COMMON_H_
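
DefaultSubgraphOpMutableInputs above relies on ListInputNames(kAll) returning
the read-only arguments and the auxiliary states as two interleaved
subsequences, each preserving its own order, and classifies every input with a
two-pointer walk. A standalone sketch of that walk on toy BatchNorm-style
names (illustrative only; MutableIndices is not an MXNet function):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Classify inputs as mutable by merging two ordered subsequences,
    // mirroring DefaultSubgraphOpMutableInputs above.
    std::vector<uint32_t> MutableIndices(const std::vector<std::string>& all,
                                         const std::vector<std::string>& read_only,
                                         const std::vector<std::string>& aux) {
      assert(read_only.size() + aux.size() == all.size());
      std::vector<uint32_t> ret;
      size_t i1 = 0, i2 = 0;
      for (size_t i = 0; i < all.size(); ++i) {
        if (i1 < read_only.size() && all[i] == read_only[i1]) {
          ++i1;                    // read-only argument: not mutable
        } else {
          assert(i2 < aux.size() && all[i] == aux[i2]);
          ++i2;
          ret.push_back(i);        // auxiliary state: mutable
        }
      }
      return ret;
    }

    int main() {
      // BatchNorm-style inputs: moving_mean/moving_var are auxiliary states.
      auto idx = MutableIndices({"data", "gamma", "beta", "moving_mean", "moving_var"},
                                {"data", "gamma", "beta"},
                                {"moving_mean", "moving_var"});
      for (uint32_t i : idx) std::printf("%u ", i);  // prints: 3 4
      std::printf("\n");
      return 0;
    }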
diff --git a/src/operator/subgraph/default_subgraph_op.cc b/src/operator/subgraph/default_subgraph_op.cc
new file mode 100644
index 00000000000..d5fb7ee2db6
--- /dev/null
+++ b/src/operator/subgraph/default_subgraph_op.cc
@@ -0,0 +1,112 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+
+#include <mxnet/ndarray.h>
+#include "./common.h"
+#include "../../imperative/imperative_utils.h"
+#include "../../imperative/cached_op.h"
+
+namespace mxnet {
+namespace op {
+
+#define DEBUG_SUBGRAPH 0
+
+class DefaultSubgraphOperator {
+ public:
+  explicit DefaultSubgraphOperator(const Symbol& sym) : subgraph_sym_(sym) {
+    subgraph_exec_.reset(new CachedOp(sym, {{"static_alloc", "true"},
+                                            {"static_shape", "true"}}));
+  }
+
+  void Forward(const OpContext& ctx,
+               const std::vector<NDArray>& inputs,
+               const std::vector<OpReqType>& req,
+               const std::vector<NDArray>& outputs);
+  void Backward(const OpContext& ctx,
+                const std::vector<NDArray>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<NDArray>& outputs) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+ private:
+  nnvm::Symbol subgraph_sym_;
+  CachedOpPtr subgraph_exec_;
+};
+
+void DefaultSubgraphOperator::Forward(const OpContext& ctx,
+                                      const std::vector<NDArray>& inputs,
+                                      const std::vector<OpReqType>& req,
+                                      const std::vector<NDArray>& outputs) {
+  std::vector<NDArray> tmp_inputs = inputs;
+  std::vector<NDArray*> input_ptrs;
+  input_ptrs.reserve(inputs.size());
+  for (auto& nd : tmp_inputs) {
+    input_ptrs.push_back(&nd);
+  }
+  std::vector<NDArray> tmp_outputs = outputs;
+  std::vector<NDArray*> output_ptrs;
+  for (auto& nd : tmp_outputs) {
+    output_ptrs.push_back(&nd);
+  }
+#if DEBUG_SUBGRAPH
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    LOG(INFO) << "inputs[" << i << "].version = " << inputs[i].version();
+  }
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    LOG(INFO) << "outputs[" << i << "].version = " << outputs[i].version();
+  }
+#endif
+  subgraph_exec_->Forward(subgraph_exec_, input_ptrs, output_ptrs);
+}
+
+OpStatePtr CreateDefaultSubgraphOpState(const NodeAttrs& attrs,
+                                        Context ctx,
+                                        const std::vector<TShape>& in_shapes,
+                                        const std::vector<int>& in_types) {
+  return OpStatePtr::Create<DefaultSubgraphOperator>(*attrs.subgraphs[0]);
+}
+
+void DefaultSubgraphOpForward(const OpStatePtr& state_ptr,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& outputs) {
+  DefaultSubgraphOperator& op = state_ptr.get_state<DefaultSubgraphOperator>();
+  op.Forward(ctx, inputs, req, outputs);
+}
+
+NNVM_REGISTER_OP(_default_subgraph_op)
+.describe(R"code(_default_subgraph_op)code" ADD_FILELINE)
+.set_num_inputs(DefaultSubgraphOpNumInputs)
+.set_num_outputs(DefaultSubgraphOpNumOutputs)
+.set_attr<nnvm::FListInputNames>("FListInputNames", DefaultSubgraphOpListInputs)
+.set_attr<nnvm::FListOutputNames>("FListOutputNames", DefaultSubgraphOpListOutputs)
+.set_attr<FCreateOpState>("FCreateOpState", CreateDefaultSubgraphOpState)
+.set_attr<nnvm::FInferShape>("FInferShape", DefaultSubgraphOpShape)
+.set_attr<nnvm::FInferType>("FInferType", DefaultSubgraphOpType)
+.set_attr<FInferStorageType>("FInferStorageType", DefaultSubgraphOpStorageType)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", DefaultSubgraphOpForward)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", DefaultSubgraphOpMutableInputs)
+.set_attr<std::string>("key_var_num_args", "num_args")
+.set_attr<FExecType>("FExecType", DefaultSubgraphOpExecType)
+.add_argument("data", "NDArray-or-Symbol[]", "input data list");
+
+}  // namespace op
+}  // namespace mxnet
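
The registration above follows the stateful-operator pattern: FCreateOpState
constructs one DefaultSubgraphOperator per node, and FStatefulComputeEx
fetches it back on every invocation through OpStatePtr. A toy type-erased
sketch of that Create/get_state mechanism (illustrative only; not the actual
OpStatePtr implementation):

    #include <cstdio>
    #include <memory>
    #include <utility>

    // Minimal type-erased state holder mimicking the Create/get_state
    // pattern used by OpStatePtr in the operator above.
    class StatePtr {
     public:
      template <typename T, typename... Args>
      static StatePtr Create(Args&&... args) {
        StatePtr s;
        s.ptr_ = std::make_shared<T>(std::forward<Args>(args)...);
        return s;
      }
      template <typename T>
      T& get_state() const { return *std::static_pointer_cast<T>(ptr_); }
     private:
      std::shared_ptr<void> ptr_;
    };

    struct ToyOperator {
      explicit ToyOperator(int n) : n_(n) {}
      void Forward() { std::printf("forward with n=%d\n", n_); }
      int n_;
    };

    int main() {
      StatePtr state = StatePtr::Create<ToyOperator>(42);  // FCreateOpState analogue
      state.get_state<ToyOperator>().Forward();            // FStatefulComputeEx analogue
      return 0;
    }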
diff --git a/src/operator/subgraph/default_subgraph_op.cu b/src/operator/subgraph/default_subgraph_op.cu
new file mode 100644
index 00000000000..008826b21d7
--- /dev/null
+++ b/src/operator/subgraph/default_subgraph_op.cu
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file default_subgraph_op.cu
+ * \brief GPU Implementation of subgraph operations
+ */
+
+#include <mxnet/ndarray.h>
+#include "./common.h"
+#include "../../imperative/imperative_utils.h"
+#include "../../imperative/cached_op.h"
+
+namespace mxnet {
+namespace op {
+
+void DefaultSubgraphOpForward(const OpStatePtr& state_ptr,
+                              const OpContext& ctx,
+                              const std::vector<NDArray>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<NDArray>& outputs);
+
+NNVM_REGISTER_OP(_default_subgraph_op)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<gpu>", DefaultSubgraphOpForward);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/subgraph/default_subgraph_property.cc b/src/operator/subgraph/default_subgraph_property.cc
new file mode 100644
index 00000000000..c8d3e9ffd43
--- /dev/null
+++ b/src/operator/subgraph/default_subgraph_property.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <vector>
+#include <string>
+#include "./common.h"
+#include "./subgraph_property.h"
+
+namespace mxnet {
+namespace op {
+
+/*
+ * This selector chooses nodes for a subgraph that contains only operators
+ * from a given set; it visits nodes via both input and output links.
+ */
+class ContainOpSelector: public SubgraphSelector {
+ public:
+  explicit ContainOpSelector(const std::unordered_set<std::string>& op_names)
+    : op_names_(op_names) {}
+
+  virtual bool Select(const nnvm::Node &seed_node) {
+    return !seed_node.is_variable() && op_names_.count(seed_node.op()->name);
+  }
+
+  virtual bool SelectInput(const nnvm::Node &cur_node, const nnvm::Node &input_node) {
+    return !input_node.is_variable() && op_names_.count(input_node.op()->name);
+  }
+
+  virtual bool SelectOutput(const nnvm::Node &cur_node, const nnvm::Node &output_node) {
+    return !output_node.is_variable() && op_names_.count(output_node.op()->name);
+  }
+ private:
+  const std::unordered_set<std::string>& op_names_;
+};
+
+/*
+ * This subgraph property finds subgraphs whose nodes are all operators
+ * from a given set. The operators in a subgraph will be executed by _default_subgraph_op.
+ */
+class DefaultSubgraphProperty: public SubgraphProperty {
+ public:
+  static SubgraphPropertyPtr Create() { return std::make_shared<DefaultSubgraphProperty>(); }
+  virtual nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                           const int subgraph_id = 0) const {
+    nnvm::NodePtr n = nnvm::Node::Create();
+    n->attrs.op = Op::Get("_default_subgraph_op");
+    n->attrs.name = "_default_subgraph_op" + std::to_string(subgraph_id);
+    n->attrs.subgraphs.push_back(std::make_shared<nnvm::Symbol>(sym));
+    return n;
+  }
+  virtual SubgraphSelectorPtr CreateSubgraphSelector() const {
+    return std::make_shared<ContainOpSelector>(
+        this->GetAttr<std::unordered_set<std::string>>("op_names"));
+  }
+};
+
+MXNET_REGISTER_SUBGRAPH_PROPERTY(default, DefaultSubgraphProperty);
+
+}  // namespace op
+}  // namespace mxnet
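
A backend can supply its own partitioning rule by mirroring the class above.
A hypothetical property that groups only Convolution and Activation nodes,
reusing the ContainOpSelector defined in this file (the op set and class name
are illustrative; the interfaces are those declared in subgraph_property.h):

    class ConvActSubgraphProperty : public SubgraphProperty {
     public:
      static SubgraphPropertyPtr Create() {
        return std::make_shared<ConvActSubgraphProperty>();
      }
      virtual nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
                                               const int subgraph_id = 0) const {
        nnvm::NodePtr n = nnvm::Node::Create();
        n->attrs.op = Op::Get("_default_subgraph_op");
        n->attrs.name = "_conv_act_subgraph_op" + std::to_string(subgraph_id);
        n->attrs.subgraphs.push_back(std::make_shared<nnvm::Symbol>(sym));
        return n;
      }
      virtual SubgraphSelectorPtr CreateSubgraphSelector() const {
        return std::make_shared<ContainOpSelector>(op_names_);
      }
     private:
      const std::unordered_set<std::string> op_names_{"Convolution", "Activation"};
    };
    // Registered like the default property:
    // MXNET_REGISTER_SUBGRAPH_PROPERTY(conv_act, ConvActSubgraphProperty);

Note that ContainOpSelector keeps a reference to the op set, so the set must
outlive every selector the property hands out; holding it as a member of the
property, as sketched here, covers the usual case.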
diff --git a/src/operator/subgraph/partition_graph.cc b/src/operator/subgraph/partition_graph.cc
new file mode 100644
index 00000000000..315f7eec00c
--- /dev/null
+++ b/src/operator/subgraph/partition_graph.cc
@@ -0,0 +1,774 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file partition_graph.cc
+ * \brief
+ */
+#include <nnvm/graph.h>
+#include <nnvm/pass.h>
+#include <mxnet/op_attr_types.h>
+#include <unordered_set>
+#include <stack>
+#include <queue>
+
+#include "./subgraph_property.h"
+
+namespace nnvm {
+NodePtr CreateVariableNode(const std::string& name);
+}
+
+namespace mxnet {
+
+namespace op {
+
+using nnvm::Symbol;
+using nnvm::Node;
+using nnvm::NodePtr;
+using nnvm::NodeEntry;
+using nnvm::Graph;
+
+#define DEBUG_SUBGRAPH 0
+
+namespace sg {  // sg stands for subgraph
+
+struct SimpleNode;
+using SimpleNodePtr = std::shared_ptr<SimpleNode>;
+
+/*!
+ * \brief Node of the undirected graph that replicates the structure
+ * of the computational graph. It is used to ease graph traversal when
+ * searching for subgraphs.
+ */
+struct SimpleNode {
+  static SimpleNodePtr Create() {
+    return std::make_shared<SimpleNode>();
+  }
+  SimpleNode() : label(-1), node(nullptr) {}
+  /*! subgraph label */
+  int label;
+  /*! the original node in the computational graph that it references */
+  nnvm::Node* node;
+  /*!
+   * \brief output nodes of the current node
+   * key is node ptr and value is an array of indices standing for the entry indices
+   * in key->inputs whose source is the current node.
+   */
+  std::unordered_map<nnvm::Node*, std::vector<size_t>> outputs;
+};  // struct SimpleNode
+
+#if DEBUG_SUBGRAPH
+void PrintSubgraph(const std::vector<SimpleNode*>& simple_nodes) {
+  std::string op_names = "";
+  for (size_t i = 0; i < simple_nodes.size(); ++i) {
+    op_names += simple_nodes[i]->node->attrs.name + ' ';
+  }
+  LOG(INFO) << "Subgraph node names: " << op_names;
+}
+
+void PrintNodeEntry(const nnvm::NodeEntry& entry) {
+  std::string ret = "NodeEntry: node_name=" + entry.node->attrs.name
+    + ", index=" + std::to_string(entry.index) + ", version=" + std::to_string(entry.version);
+  LOG(INFO) << ret;
+}
+
+void PrintNodeEntries(const std::vector<nnvm::NodeEntry*>& entries) {
+  for (size_t i = 0; i < entries.size(); ++i) {
+    PrintNodeEntry(*entries[i]);
+  }
+}
+#endif
+
+/*!
+ * \brief Given an MXNet computational graph, create an undirected graph from it.
+ * \param g the MXNet computational graph
+ * \param simple_nodes the nodes of the undirected graph in topologically sorted order
+ */
+void CreateSimpleGraph(const Graph& g,
+                       std::vector<SimpleNodePtr>* simple_nodes) {
+  const auto& indexed_graph = g.indexed_graph();
+  simple_nodes->reserve(indexed_graph.num_nodes());
+  DFSVisit(g.outputs, [&](const NodePtr& node) {
+    SimpleNodePtr sn = SimpleNode::Create();
+    sn->node = node.get();
+    for (size_t i = 0; i < sn->node->inputs.size(); ++i) {
+      const auto& e = sn->node->inputs[i];
+      const auto input_nid = indexed_graph.node_id(e.node.get());
+      CHECK_LT(input_nid, simple_nodes->size());
+      auto& input_node_outputs = (*simple_nodes)[input_nid]->outputs;
+      auto it = input_node_outputs.find(sn->node);
+      if (it == input_node_outputs.end()) {
+        input_node_outputs.emplace(sn->node, std::vector<size_t>{i});
+      } else {
+        it->second.push_back(i);
+      }
+    }
+    simple_nodes->emplace_back(std::move(sn));
+  });
+}
+
+/*!
+ * \brief Reset labels of the subgraph nodes to the original state
+ * and clear the vector of subgraph nodes.
+ */
+void ResetNodeLabels(const nnvm::Graph& g,
+                     const std::vector<SimpleNodePtr>& simple_nodes,
+                     std::vector<nnvm::Node*>* subgraph_nodes) {
+  for (auto n : *subgraph_nodes) {
+    const auto nid = g.indexed_graph().node_id(n);
+    simple_nodes[nid]->label = -1;
+  }
+  subgraph_nodes->clear();
+}
+
+/*!
+ * \brief This function traverses the nodes in a computation graph from a starting
+ * node following the input edges and output edges, and marks all nodes that
+ * can be accessed from the starting node. Before the function returns,
+ * it checks whether there is a loop between the potential subgraph
+ * and the outside nodes. If so, it adds the node that should break the loop
+ * to excluded_nodes and returns false. Otherwise, it returns true.
+ * \param g the whole graph
+ * \param subgraph_selector determines whether a visited node should be chosen or not
+ * \param label the label of the current subgraph
+ * \param snid node id of the seed simple node
+ * \param simple_nodes all simple nodes in topologically sorted order
+ * \param subgraph_nodes all the nodes belonging to the same subgraph as the seed node
+ * \param excluded_nodes set of nodes that should be excluded from the current subgraph
+ */
+bool LabelSubgraph(const Graph& g,
+                   SubgraphSelectorPtr subgraph_selector,
+                   const int label,
+                   const size_t snid,  // simple node id, this is a seed
+                   const std::vector<SimpleNodePtr>& simple_nodes,
+                   std::vector<nnvm::Node*>* subgraph_nodes,
+                   std::unordered_set<const nnvm::Node*>* excluded_nodes = nullptr) {
+  const auto& indexed_graph = g.indexed_graph();
+  std::queue<SimpleNode*> node_queue;
+  if (!excluded_nodes || !excluded_nodes->count(simple_nodes[snid]->node)) {
+    CHECK_EQ(simple_nodes[snid]->label, -1);
+    simple_nodes[snid]->label = label;
+    node_queue.push(simple_nodes[snid].get());
+  }
+  // key: nodes that serve as input/output nodes to the subgraph
+  // value: pair of vectors of nodes in the subgraph. The first vector contains the
+  // output nodes of the key in the subgraph, and the second vector contains the
+  // input nodes of the key in the subgraph.
+  // If one non-subgraph node takes inputs from the subgraph, another non-subgraph node
+  // feeds outputs into the subgraph, and the first node is an ancestor
+  // of the second, there exists a cycle.
+  // When breaking the cycle, we want to start by removing the node with the largest
+  // node id in the subgraph.
+  std::unordered_map<const nnvm::Node*,
+    std::pair<std::vector<const nnvm::Node*>,
+              std::vector<const nnvm::Node*>>> non_subgraph_node_map;
+  while (!node_queue.empty()) {
+    SimpleNode* cur_node = node_queue.front();
+    node_queue.pop();
+    subgraph_nodes->push_back(cur_node->node);
+    // get qualified adjacent input nodes
+    for (auto& e : cur_node->node->inputs) {
+      const bool select_input = (!excluded_nodes || !excluded_nodes->count(e.node.get()))
+        && subgraph_selector->SelectInput(*cur_node->node, *e.node);
+      if (select_input) {
+        // e.node is a subgraph node
+        const auto nid = indexed_graph.node_id(e.node.get());
+        CHECK_LT(nid, simple_nodes.size());
+        // this node has not been visited yet
+        if (simple_nodes[nid]->label == -1) {
+          simple_nodes[nid]->label = label;
+          node_queue.push(simple_nodes[nid].get());
+        }
+      } else {
+        // e.node is an input node of the subgraph
+        non_subgraph_node_map[e.node.get()].first.push_back(cur_node->node);
+      }
+    }
+    // get qualified output nodes
+    for (auto it = cur_node->outputs.begin(); it != cur_node->outputs.end(); ++it) {
+      const bool select_output = (!excluded_nodes || !excluded_nodes->count(it->first))
+          && subgraph_selector->SelectOutput(*cur_node->node, *it->first);
+      if (select_output) {
+        // it->first is a subgraph node
+        const auto nid = indexed_graph.node_id(it->first);
+        CHECK_LT(nid, simple_nodes.size());
+        // this node has not been visited yet
+        if (simple_nodes[nid]->label == -1) {
+          simple_nodes[nid]->label = label;
+          node_queue.push(simple_nodes[nid].get());
+        }
+      } else {
+        // it->first is an output node of the subgraph
+        non_subgraph_node_map[it->first].second.push_back(cur_node->node);
+      }
+    }
+  }
+  // prepare to check if there is a cycle
+  auto node_cmp = [&] (const nnvm::Node* node1, const nnvm::Node* node2) {
+    return indexed_graph.node_id(node1) < indexed_graph.node_id(node2);
+  };
+  std::vector<const nnvm::Node*> non_subgraph_nodes;
+  non_subgraph_nodes.reserve(non_subgraph_node_map.size());
+  for (auto& kv : non_subgraph_node_map) {
+    auto& output_nodes = kv.second.first;
+    std::sort(output_nodes.begin(), output_nodes.end(), node_cmp);
+    auto& input_nodes = kv.second.second;
+    std::sort(input_nodes.begin(), input_nodes.end(), node_cmp);
+    non_subgraph_nodes.push_back(kv.first);
+  }
+  // check whether there is a cycle between the subgraph and its input/output nodes
+  auto is_ancestor = [&](const nnvm::Node* ancestor, const nnvm::Node* descendant,
+                         const std::vector<nnvm::Node*>& snodes) {
+    if (ancestor == descendant) return true;
+    std::stack<const nnvm::Node*> s;
+    s.push(descendant);
+    size_t count = 0;
+    while (!s.empty()) {
+      CHECK_LT(count, indexed_graph.num_nodes()) << "Finding ancestor failed. There is probably"
+                                                    " a loop in the graph";
+      ++count;
+      const nnvm::Node* top = s.top();
+      s.pop();
+      if (top == ancestor) {
+        return true;
+      }
+      for (const auto& entry : top->inputs) {
+        // when searching for the ancestor, the path cannot cross any subgraph node
+        auto it = std::find(snodes.begin(), snodes.end(), entry.node.get());
+        if (it == snodes.end()) {
+          s.push(entry.node.get());
+        }
+      }
+    }
+    return false;
+  };
+  std::sort(non_subgraph_nodes.begin(), non_subgraph_nodes.end(), node_cmp);
+  int excluded_node_id = -1;
+  for (size_t i = 0; i < non_subgraph_nodes.size(); ++i) {
+    auto it1 = non_subgraph_node_map.find(non_subgraph_nodes[i]);
+    CHECK(it1 != non_subgraph_node_map.end());
+    auto& output_nodes = it1->second.first;  // has been top sorted
+    auto& input_nodes = it1->second.second;  // has been top sorted
+    if (!output_nodes.empty() && !input_nodes.empty()) {
+      // there is a loop between node i and the subgraph
+      const auto node_id = std::max(indexed_graph.node_id(output_nodes.back()),
+                                    indexed_graph.node_id(input_nodes.back()));
+      excluded_node_id = std::max(excluded_node_id, static_cast<int>(node_id));
+    } else if (!input_nodes.empty()) {
+      // node i is an input to the subgraph, find out if there is a node j
+      // which is an output of the subgraph and also a child of node i.
+      for (size_t j = i + 1; j < non_subgraph_nodes.size(); ++j) {
+        auto it2 = non_subgraph_node_map.find(non_subgraph_nodes[j]);
+        CHECK(it2 != non_subgraph_node_map.end());
+        // i is topologically before j, j might be a direct/indirect output node of i
+        CHECK_LT(indexed_graph.node_id(it1->first), indexed_graph.node_id(it2->first));
+        if (!it2->second.first.empty() && is_ancestor(it1->first, it2->first, *subgraph_nodes)) {
+          // found a loop
+          const auto node_id = std::max(indexed_graph.node_id(input_nodes.back()),
+                                        indexed_graph.node_id(it2->second.first.back()));
+          excluded_node_id = std::max(excluded_node_id, static_cast<int>(node_id));
+        }
+      }
+    }
+  }
+
+  if (excluded_node_id != -1) {
+    CHECK_LT(excluded_node_id, static_cast<int>(simple_nodes.size()));
+    CHECK_NE(excluded_node_id, static_cast<int>(snid))
+      << "A cycle is found in the computational graph between nodes "
+      << simple_nodes[excluded_node_id]->node->attrs.name << " and "
+      << simple_nodes[snid]->node->attrs.name;
+    excluded_nodes->insert(simple_nodes[excluded_node_id]->node);
+    ResetNodeLabels(g, simple_nodes, subgraph_nodes);
+    return false;
+  }
+  std::sort(subgraph_nodes->begin(), subgraph_nodes->end(), node_cmp);
+  return true;
+}
+
+/*!
+ * \brief Finds all the nodes belonging to the same subgraph given a seed node.
+ * \param g the whole graph
+ * \param subgraph_selector determines whether a visited node should be chosen or not
+ * \param label the label of the current subgraph
+ * \param snid node id of the seed simple node
+ * \param simple_nodes all simple nodes in topologically sorted order
+ * \param subgraph_nodes output: subgraph node candidates sorted in topological order
+ */
+void PreSelectSubgraphNodes(const Graph& g,
+                            SubgraphSelectorPtr subgraph_selector,
+                            const int label,
+                            const size_t snid,
+                            const std::vector<SimpleNodePtr>& simple_nodes,
+                            std::vector<nnvm::Node*>* subgraph_nodes) {
+  std::unordered_set<const nnvm::Node*> excluded_nodes;
+  const size_t max_num_retry = simple_nodes.size() * simple_nodes.size();
+  size_t count = 0;
+  bool success = false;
+  while (!success && count < max_num_retry) {
+    success = LabelSubgraph(g, subgraph_selector, label, snid, simple_nodes,
+                            subgraph_nodes, &excluded_nodes);
+    if (!success) {
+      CHECK(!excluded_nodes.empty());
+      std::string excluded_node_names;
+      for (auto node : excluded_nodes) {
+        excluded_node_names += node->attrs.name + ", ";
+      }
+      LOG(INFO) << "Found a cycle when BFS from node " << simple_nodes[snid]->node->attrs.name
+                << ". Excluding nodes " << excluded_node_names << "and retrying";
+    }
+    ++count;
+  }
+  if (!success) {
+    LOG(INFO) << "Tried " << count << " times of finding subgraphs starting from node "
+              << simple_nodes[snid]->node->attrs.name << " without success because a loop "
+                  "is always found between the subgraph and some other nodes. Will treat "
+                  "seed node " << simple_nodes[snid]->node->attrs.name
+              << "as a subgraph with one node";
+    CHECK(subgraph_nodes->empty());
+    simple_nodes[snid]->label = label;
+    subgraph_nodes->push_back(simple_nodes[snid]->node);
+  }
+}
+
+/*!
+ * \brief Given a vector of nodes, group them into individual subgraphs
+ * based upon their connectivity.
+ */
+void PostProcessNodeCandidates(const nnvm::Graph& g,
+                               const std::vector<nnvm::Node*>& nodes,
+                               const std::vector<SimpleNodePtr>& simple_nodes,
+                               std::vector<std::vector<SimpleNode*>>* subgraphs,
+                               size_t* subgraph_id) {
+  const auto& indexed_graph = g.indexed_graph();
+  std::unordered_set<nnvm::Node*> node_set(nodes.begin(), nodes.end());
+  auto simple_node_cmp = [&] (const SimpleNode* node1, const SimpleNode* node2) {
+    return indexed_graph.node_id(node1->node) < indexed_graph.node_id(node2->node);
+  };
+  for (auto node : nodes) {
+    if (!node_set.count(node)) {
+      // The node has been included in a subgraph
+      continue;
+    }
+    std::queue<nnvm::Node*> q;
+    q.push(node);
+    CHECK_EQ(node_set.erase(node), 1U);
+    subgraphs->emplace_back();
+    const auto nid = indexed_graph.node_id(node);
+    simple_nodes[nid]->label = *subgraph_id;
+    subgraphs->back().push_back(simple_nodes[nid].get());
+    while (!q.empty()) {
+      nnvm::Node* cur_node = q.front();
+      q.pop();
+      for (auto& e : cur_node->inputs) {
+        auto in_it = node_set.find(e.node.get());
+        if (in_it != node_set.end()) {
+          q.push(*in_it);
+          const auto in_nid = indexed_graph.node_id(*in_it);
+          simple_nodes[in_nid]->label = *subgraph_id;
+          subgraphs->back().push_back(simple_nodes[in_nid].get());
+          node_set.erase(in_it);
+        }
+      }
+      const auto cur_nid = indexed_graph.node_id(cur_node);
+      const SimpleNode* cur_snode = simple_nodes[cur_nid].get();
+      for (const auto& kv : cur_snode->outputs) {
+        const auto out_it = node_set.find(kv.first);
+        if (out_it != node_set.end()) {
+          q.push(*out_it);
+          const auto out_nid = indexed_graph.node_id(*out_it);
+          simple_nodes[out_nid]->label = *subgraph_id;
+          subgraphs->back().push_back(simple_nodes[out_nid].get());
+          node_set.erase(out_it);
+        }
+      }
+    }
+    ++(*subgraph_id);
+    std::sort(subgraphs->back().begin(), subgraphs->back().end(), simple_node_cmp);
+  }
+  CHECK(node_set.empty());
+}
+
+/*!
+ * \brief Finds subgraphs with all nodes that meet certain criteria.
+ * All nodes in a subgraph are marked with the same label.
+ */
+void FindSubgraphs(Graph* g,
+                   const SubgraphProperty &subg_prop,
+                   const std::vector<SimpleNodePtr>& simple_nodes,
+                   std::vector<std::vector<SimpleNode*>>* subgraph_nodes) {
+  const auto& indexed_graph = g->indexed_graph();
+  CHECK_EQ(indexed_graph.num_nodes(), simple_nodes.size());
+  auto node_cmp = [&] (const nnvm::Node* node1, const nnvm::Node* node2) {
+    return indexed_graph.node_id(node1) < indexed_graph.node_id(node2);
+  };
+  size_t subgraph_id = 0;
+  for (size_t i = 0; i < simple_nodes.size(); ++i) {
+    nnvm::Node* node = simple_nodes[i]->node;
+    auto subgraph_selector = subg_prop.CreateSubgraphSelector();
+    if (subgraph_selector->Select(*node) && simple_nodes[i]->label == -1) {
+      // pre-select nodes that can be grouped in a subgraph
+      std::vector<nnvm::Node*> preselected_nodes;
+      PreSelectSubgraphNodes(*g, subgraph_selector, subgraph_id, i, simple_nodes,
+                             &preselected_nodes);
+
+      // filter out unqualified pre-selected nodes
+      std::vector<nnvm::Node*> filtered_nodes = subgraph_selector->Filter(preselected_nodes);
+
+      // make sure filtered_nodes is a subset of preselected_nodes
+      for (const auto n : filtered_nodes) {
+        const auto nit = std::find(preselected_nodes.begin(), preselected_nodes.end(), n);
+        CHECK(nit != preselected_nodes.end())
+          << "Node " << n->attrs.name << " is not found in the pre-selected subgraph nodes."
+             " Please make sure that no new nodes were added in your subgraph"
+             " selector's Filter function";
+      }
+
+      // make sure nodes are sorted
+      std::sort(filtered_nodes.begin(), filtered_nodes.end(), node_cmp);
+
+      // reset node labels that are not in filtered nodes
+      for (const auto n : preselected_nodes) {
+        const auto nit = std::find(filtered_nodes.begin(), filtered_nodes.end(), n);
+        if (nit == filtered_nodes.end()) {
+          simple_nodes[indexed_graph.node_id(n)]->label = -1;
+        }
+      }
+      // find out subgraphs from the filtered nodes
+      std::vector<std::vector<SimpleNode*>> subgraphs;
+      PostProcessNodeCandidates(*g, filtered_nodes, simple_nodes, &subgraphs, &subgraph_id);
+      if (!subgraphs.empty()) {
+        subgraph_nodes->insert(subgraph_nodes->end(), subgraphs.begin(), subgraphs.end());
+      }
+    }
+  }
+}
+
+/*!
+ * \brief Sorts entries according to their topological order.
+ * Note that entry ids cannot be used to sort entries.
+ * \param entry_top_order_map mapping from entry pointer to its topological position in the graph
+ * \param entries Node entries to be sorted
+ */
+void SortEntries(const std::unordered_map<const nnvm::NodeEntry*, size_t>& entry_top_order_map,
+                 std::vector<nnvm::NodeEntry*>* entries) {
+  auto entry_cmp = [&](const nnvm::NodeEntry* e1, const nnvm::NodeEntry* e2) {
+    const auto it1 = entry_top_order_map.find(e1);
+    CHECK(it1 != entry_top_order_map.end());
+    const auto it2 = entry_top_order_map.find(e2);
+    CHECK(it2 != entry_top_order_map.end());
+    return it1->second < it2->second;
+  };
+  std::sort(entries->begin(), entries->end(), entry_cmp);
+}
+
+/*!
+ * \brief Given a subgraph, find the input entries of the subgraph.
+ * \param g the whole graph
+ * \param simple_nodes vector of simple nodes in topologically sorted order
+ * \param subgraph_nodes vector of pointers to the simple nodes of a subgraph
+ * \param entry_top_order_map mapping from an entry pointer to its topologically sorted position
+ * \param input_entries output: input entries of the subgraph
+ */
+void FindInputEntries(const Graph& g,
+                      const std::vector<SimpleNodePtr>& simple_nodes,
+                      const std::vector<SimpleNode*>& subgraph_nodes,
+                      const std::unordered_map<const nnvm::NodeEntry*, size_t>& entry_top_order_map,
+                      std::vector<nnvm::NodeEntry*>* input_entries) {
+  const auto& indexed_graph = g.indexed_graph();
+  int label = -1;
+  for (size_t i = 0; i < subgraph_nodes.size(); ++i) {
+    if (label == -1) {
+      label = subgraph_nodes[i]->label;
+    } else {
+      CHECK_EQ(subgraph_nodes[i]->label, label);
+    }
+    auto& inputs = subgraph_nodes[i]->node->inputs;
+    for (size_t j = 0; j < inputs.size(); ++j) {
+      auto& e = inputs[j];
+      if (indexed_graph.exist(e.node.get())) {
+        // e's source node is not a subgraph node
+        const auto nid = indexed_graph.node_id(e.node.get());
+        // this is a node not belonging to the subgraph
+        if (simple_nodes[nid]->label != label) {
+          input_entries->push_back(&e);
+        }
+      } else {
+        // e's source node is a subgraph node.
+        // In this case, two subgraphs are adjacent.
+        input_entries->push_back(&e);
+      }
+    }
+  }
+  SortEntries(entry_top_order_map, input_entries);
+}
+
+/*!
+ * \brief Given a subgraph, find the output entries of the subgraph.
+ * \param g pointer to the whole graph
+ * \param simple_nodes vector of simple nodes in topologically sorted order
+ * \param subgraph_nodes vector of pointers to the simple nodes of a subgraph
+ * \param entry_top_order_map mapping from an entry pointer to its topologically sorted position
+ * \param output_entries output: output entries of the subgraph
+ */
+void FindOutputEntries(Graph* g,
+                       const std::vector<SimpleNodePtr>& simple_nodes,
+                       const std::vector<SimpleNode*>& subgraph_nodes,
+                       const std::unordered_map<const nnvm::NodeEntry*, size_t>&
+                         entry_top_order_map,
+                       std::vector<nnvm::NodeEntry*>* output_entries) {
+  if (subgraph_nodes.empty()) return;
+  const auto& indexed_graph = g->indexed_graph();
+  int label = -1;
+  for (size_t i = 0; i < subgraph_nodes.size(); ++i) {
+    if (label == -1) {
+      label = subgraph_nodes[i]->label;
+    } else {
+      CHECK_EQ(subgraph_nodes[i]->label, label);
+    }
+    for (auto it = subgraph_nodes[i]->outputs.begin();
+         it != subgraph_nodes[i]->outputs.end(); ++it) {
+      if (indexed_graph.exist(it->first)) {
+        // if the output node is a normal graph node (not a subgraph node)
+        const auto nid = indexed_graph.node_id(it->first);
+        // this is a node not belonging to the current subgraph
+        if (simple_nodes[nid]->label != label) {
+          for (auto idx : it->second) {
+            auto& e = simple_nodes[nid]->node->inputs[idx];
+            output_entries->push_back(&e);
+          }
+        }
+      } else {
+        // if the output node is a subgraph node
+        // two graphs are adjacent
+        for (auto idx : it->second) {
+          output_entries->push_back(&(it->first->inputs[idx]));
+        }
+      }
+    }
+  }
+  // Check if current subgraph contains a node which is the last node
+  // of the whole graph. If so, save its corresponding entry as well.
+  for (size_t i = 0; i < g->outputs.size(); ++i) {
+    auto& entry = g->outputs[i];
+    // The entry might have already been updated to be an output of
+    // a subgraph node. In that case, there is no need to check its
+    // source against the current subgraph. Otherwise, do the following.
+    if (indexed_graph.exist(entry.node.get())) {
+      const auto nid = indexed_graph.node_id(entry.node.get());
+      if (simple_nodes[nid]->label == label) {
+        output_entries->push_back(&entry);
+      }
+    }
+  }
+  SortEntries(entry_top_order_map, output_entries);
+}
+
+/*!
+ * \brief Given a computation graph and a set of input node entries, this function cuts
+ * the node entries and creates new variable nodes as the input nodes of the
+ * subgraph. The original entries that connect to the subgraph directly are
+ * returned through orig_entries, and the new variable nodes get unique names.
+ */
+void CutGraphInputs(const std::vector<nnvm::NodeEntry*> &input_entries,
+                    std::vector<nnvm::NodeEntry> *orig_entries,
+                    const bool skip_var = false) {
+  orig_entries->resize(input_entries.size());
+  // counts name occurrences so that new variable nodes get unique names
+  // even when several entries share the same source node
+  std::unordered_map<std::string, int> name_count_map;
+  for (size_t i = 0; i < input_entries.size(); ++i) {
+    nnvm::NodeEntry *e = input_entries[i];
+    // If the node is a variable itself, we may want to skip the node.
+    if (e->node->is_variable() && skip_var) {
+      continue;
+    }
+
+    orig_entries->at(i) = *e;
+    nnvm::Symbol sym;
+    sym.outputs.push_back(*e);
+    const auto output_names = sym.ListOutputNames();
+    CHECK_EQ(output_names.size(), 1U);
+    const std::string& var_name = output_names[0];
+    auto it = name_count_map.find(var_name);
+    if (name_count_map.end() == it) {
+      name_count_map.emplace(var_name, 0);
+    } else {
+      ++(it->second);
+    }
+    nnvm::NodePtr n = nnvm::CreateVariableNode(var_name + std::to_string(name_count_map[var_name]));
+    *e = nnvm::NodeEntry{n, 0, 0};
+  }
+}
+
+/*!
+ * \brief Replace a set of nodes belonging to the same subgraph with a subgraph node
+ * and keep the subgraph in the subgraph node. The input entries and output entries
+ * of the subgraph node are kept in the same order as the subgraph's.
+ */
+void CreateSubgraphNode(Graph* g,
+                        const std::vector<SimpleNodePtr>& simple_nodes,
+                        const std::vector<SimpleNode*>& subgraph_nodes,
+                        const size_t subgraph_id,
+                        std::unordered_map<const nnvm::NodeEntry*, size_t>* entry_top_order_map) {
+#if DEBUG_SUBGRAPH
+  LOG(INFO) << "Searching for input entries...";
+#endif
+  std::vector<nnvm::NodeEntry*> input_entries;
+  FindInputEntries(*g, simple_nodes, subgraph_nodes, *entry_top_order_map, &input_entries);
+  std::vector<nnvm::NodeEntry> orig_input_entries;
+  CutGraphInputs(input_entries, &orig_input_entries, false);
+#if DEBUG_SUBGRAPH
+  PrintNodeEntries(input_entries);
+  LOG(INFO) << "Searching for output entries...";
+#endif
+  std::vector<nnvm::NodeEntry*> output_entries;
+  FindOutputEntries(g, simple_nodes, subgraph_nodes, *entry_top_order_map, &output_entries);
+
+  // Create a subgraph for the subgraph node
+  nnvm::Symbol sym;
+  sym.outputs.resize(output_entries.size());
+  for (size_t i = 0; i < output_entries.size(); ++i) {
+    sym.outputs[i] = *output_entries[i];
+  }
+  const SubgraphPropertyPtr& subg_prop = g->GetAttr<SubgraphPropertyPtr>("subgraph_property");
+  nnvm::NodePtr n = subg_prop->CreateSubgraphNode(sym, subgraph_id);
+
+  // Connect the external nodes to the subgraph node.
+  for (size_t i = 0; i < output_entries.size(); ++i) {
+    *output_entries[i] = nnvm::NodeEntry{n, static_cast<uint32_t>(i), 0};
+  }
+  n->inputs = orig_input_entries;
+  const auto& indexed_graph = g->indexed_graph();
+  for (size_t i = 0; i < n->inputs.size(); ++i) {
+    auto& e = n->inputs[i];
+    // update entry_top_order_map with newly created orig_input_entries
+    auto it = entry_top_order_map->find(input_entries[i]);
+    CHECK(it != entry_top_order_map->end());
+    entry_top_order_map->emplace(&e, it->second);
+    // update input entries' source simple nodes' outputs map
+    nnvm::Node* node = e.node.get();
+    if (indexed_graph.exist(node)) {
+      const auto nid = indexed_graph.node_id(node);
+      SimpleNode* sn = simple_nodes[nid].get();
+      for (SimpleNode* dest_node : subgraph_nodes) {
+        sn->outputs.erase(dest_node->node);
+      }
+      sn->outputs[n.get()].push_back(i);
+    }
+  }
+#if DEBUG_SUBGRAPH
+  PrintNodeEntries(output_entries);
+#endif
+}
+
+}  // namespace sg
+
+/*!
+ * \brief Sort entries of all the nodes' inputs vectors in the topological order.
+ * This is going to be used to sort input/output entries of subgraphs to keep
+ * the topological order unchanged.
+ */
+void TopSortEntries(const Graph& g,
+                    std::unordered_map<const nnvm::NodeEntry*, size_t>* entry_top_order_map) {
+  CHECK(entry_top_order_map != nullptr);
+  std::unordered_set<const nnvm::Node*> visited;
+  // tuple: (graph node, index into the node's inputs, node entry whose source is the graph node)
+  std::stack<std::tuple<nnvm::Node*, size_t, const nnvm::NodeEntry*>> s;
+  auto in_degree = [] (const nnvm::Node* node)->size_t {
+    if (!node) {
+      return 0;
+    }
+    CHECK_EQ(node->control_deps.size(), 0U);
+    return node->inputs.size();
+  };
+  for (auto& e : g.outputs) {
+    nnvm::Node* node = e.node.get();
+    if (visited.count(node) == 0U) {
+      s.emplace(node, 0U, &e);
+      visited.insert(node);
+    } else {
+      // The entry's source node has been visited before.
+      // Marking the order for it.
+      entry_top_order_map->emplace(&e, entry_top_order_map->size());
+    }
+    while (!s.empty()) {
+      auto& top = s.top();
+      if (std::get<1>(top) == in_degree(std::get<0>(top))) {
+        // The node's inputs have been exhausted.
+        entry_top_order_map->emplace(std::get<2>(top), entry_top_order_map->size());
+        s.pop();
+      } else {
+        // The node still has input entries not visited.
+        CHECK_LT(std::get<1>(top), std::get<0>(top)->inputs.size());
+        auto& entry = std::get<0>(top)->inputs[std::get<1>(top)++];
+        nnvm::Node* input_node = entry.node.get();
+        if (visited.count(input_node) == 0U) {
+          // The entry's source node has not been visited.
+          // Push the entry to the stack for marking order later.
+          s.emplace(input_node, 0U, &entry);
+          visited.insert(input_node);
+        } else {
+          // The entry's source node has been visited before.
+          // Marking the order for it.
+          entry_top_order_map->emplace(&entry, entry_top_order_map->size());
+        }
+      }
+    }
+  }
+}
+
+Graph PartitionGraph(Graph&& g) {
+  if (!g.HasAttr("subgraph_property")) {  // treat the whole graph as a subgraph
+    LOG(INFO) << "The graph has no attribute of subgraph_property attached. "
+                 "The original graph is returned.";
+    return g;
+  }
+  using namespace sg;
+  const SubgraphPropertyPtr& subg_prop = g.GetAttr<SubgraphPropertyPtr>("subgraph_property");
+  // top sort NodeEntry of all the nodes' inputs
+  std::unordered_map<const nnvm::NodeEntry*, size_t> entry_top_order_map;
+  TopSortEntries(g, &entry_top_order_map);
+
+  // Create undirected graph for ease of finding subgraphs
+  std::vector<SimpleNodePtr> simple_nodes;
+  CreateSimpleGraph(g, &simple_nodes);
+  std::vector<std::vector<SimpleNode*>> subgraph_nodes;
+  FindSubgraphs(&g, *subg_prop, simple_nodes, &subgraph_nodes);
+  for (size_t i = 0; i < subgraph_nodes.size(); ++i) {
+#if DEBUG_SUBGRAPH
+    std::set<SimpleNode*> simple_node_set(subgraph_nodes[i].begin(), subgraph_nodes[i].end());
+    CHECK_EQ(simple_node_set.size(), subgraph_nodes[i].size());
+    PrintSubgraph(subgraph_nodes[i]);
+#endif
+    CreateSubgraphNode(&g, simple_nodes, subgraph_nodes[i], i, &entry_top_order_map);
+  }
+  return g;
+}
+
+NNVM_REGISTER_PASS(PartitionGraph)
+.describe("Partition a graph according to the user defined rules "
+          "in a derived class of SubgraphProperty")
+.set_body(PartitionGraph)
+.set_change_graph(true);
+
+}  // namespace op
+}  // namespace mxnet
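
A minimal sketch of driving this pass from C++, under stated assumptions: the
helper name is illustrative, and a property must already be registered via
MXNET_REGISTER_SUBGRAPH_PROPERTY. The "subgraph_property" attribute and the
"PartitionGraph" pass name are exactly what the code above consumes, and
nnvm::ApplyPass is the standard NNVM entry point for running a pass:

    #include <nnvm/graph.h>
    #include <nnvm/pass.h>
    #include "subgraph_property.h"

    // Sketch: attach a registered SubgraphProperty to a graph and run the pass.
    nnvm::Graph PartitionWithProperty(nnvm::Graph g, const std::string& prop_name) {
      using namespace mxnet::op;
      // Look up a property registered via MXNET_REGISTER_SUBGRAPH_PROPERTY.
      SubgraphPropertyPtr prop =
          SubgraphPropertyRegistry::Get()->CreateSubgraphProperty(prop_name);
      // PartitionGraph reads this attribute; without it, the graph is returned unchanged.
      g.attrs["subgraph_property"] = std::make_shared<nnvm::any>(prop);
      return nnvm::ApplyPass(std::move(g), "PartitionGraph");
    }
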
diff --git a/src/operator/subgraph/subgraph_property.h b/src/operator/subgraph/subgraph_property.h
new file mode 100644
index 00000000000..cfbc1f83733
--- /dev/null
+++ b/src/operator/subgraph/subgraph_property.h
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_OPERATOR_SUBGRAPH_SUBGRAPH_PROPERTY_H_
+#define MXNET_OPERATOR_SUBGRAPH_SUBGRAPH_PROPERTY_H_
+
+#include <nnvm/node.h>
+#include <dmlc/base.h>
+#include <dmlc/thread_local.h>
+#include <unordered_map>
+#include <vector>
+#include <string>
+
+namespace mxnet {
+namespace op {
+
+/*
+ * This provides the criteria for the graph partitioning algorithm to select
+ * nodes for subgraphs.
+ * The algorithm first sorts all the nodes in topological order, and then
+ * loops through the sorted nodes and tries to find a subgraph starting
+ * from each node (we call it a seed node) that satisfies the following two conditions:
+ * 1. The node has not been selected before.
+ * 2. The function Select is called on the node and returns true.
+ *
+ * Expanding from this seed node, we do BFS to traverse the graph.
+ * During the traversal, we call SelectInput and SelectOutput to determine
+ * if a neighboring node of the current node should be selected as a candidate for the subgraph.
+ * The search continues when a new node is selected as a candidate, and terminates when no more
+ * qualified nodes are found. When the search ends, all of the candidate nodes will
+ * be passed to the function Filter to finalize the subgraph. The filtering gives
+ * developers a last opportunity to drop some of the candidate nodes.
+ * By default, Filter returns all nodes as the subgraph nodes.
+ * If the pre-selected subgraph becomes disconnected because some
+ * nodes are filtered out in the Filter function, the algorithm will automatically convert
+ * the rest of the nodes to multiple valid subgraphs based upon their connectivity.
+ */
+class SubgraphSelector {
+ public:
+  virtual ~SubgraphSelector() {}
+  /*!
+   * \brief Determines whether to search for other nodes to form a subgraph starting from seed_node.
+   */
+  virtual bool Select(const nnvm::Node &seed_node) = 0;
+  /*!
+   * \brief Determines whether to select input_node when traversing to cur_node.
+   * \param cur_node the node whose input is being considered for selection
+   * \param input_node an input node of cur_node
+   */
+  virtual bool SelectInput(const nnvm::Node &cur_node, const nnvm::Node &input_node) = 0;
+  /*!
+   * \brief Determines whether to select output_node when traversing to cur_node.
+   * \param cur_node the node whose output is being considered for selection
+   * \param output_node an output node of cur_node
+   */
+  virtual bool SelectOutput(const nnvm::Node &cur_node, const nnvm::Node &output_node) = 0;
+  // Post-processes the pre-selected subgraph nodes. Returns the list of nodes
+  // that should be kept in the subgraph(s).
+  virtual std::vector<nnvm::Node*> Filter(const std::vector<nnvm::Node*>& candidates) {
+    return candidates;
+  }
+};
+
+using SubgraphSelectorPtr = std::shared_ptr<SubgraphSelector>;
+
+/*!
+ * \brief This provides a set of properties for partitioning a graph into subgraphs,
+ * reconstructing a new graph from the subgraphs and creating a subgraph
+ * operator to execute the subgraph.
+ */
+class SubgraphProperty {
+ public:
+  // the criteria for selecting the subgraph nodes.
+  virtual SubgraphSelectorPtr CreateSubgraphSelector() const = 0;
+  // create an nnvm node for a given subgraph. Here users can customize how to
+  // execute the operators in the subgraph.
+  virtual nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &s,
+                                           const int subgraph_id = 0) const = 0;
+  // set an attr with name in the attr map
+  template<typename T>
+  SubgraphProperty& SetAttr(const std::string& name, const T& value) {
+    attrs_[name] = std::make_shared<dmlc::any>(value);
+    return *this;
+  }
+  // get the attr with the name
+  template<typename T>
+  const T& GetAttr(const std::string& name) const {
+    auto it = attrs_.find(name);
+    CHECK(it != attrs_.end()) << "Cannot find attribute " << name << " in SubgraphProperty";
+    return nnvm::get<T>(*it->second);
+  }
+ protected:
+  std::unordered_map<std::string, std::shared_ptr<nnvm::any>> attrs_;
+};
+
+using SubgraphPropertyPtr = std::shared_ptr<SubgraphProperty>;
+
+class SubgraphPropertyRegistry {
+ public:
+  typedef SubgraphPropertyPtr (*SubgraphPropertyCreateFn)(void);
+  static SubgraphPropertyRegistry* Get() {
+    static SubgraphPropertyRegistry inst;
+    return &inst;
+  }
+
+  SubgraphPropertyPtr CreateSubgraphProperty(const std::string& name) {
+    auto it = prop_fn_map_.find(name);
+    CHECK(it != prop_fn_map_.end()) << "SubgraphProperty " << name
+                                    << " is not found in SubgraphPropertyRegistry";
+    return it->second();
+  }
+
+  SubgraphPropertyCreateFn __REGISTER_OR_GET__(const std::string& name,
+                                               SubgraphPropertyCreateFn fn) {
+    if (prop_fn_map_.count(name) == 0U) {
+      return __REGISTER__(name, fn);
+    } else {
+      return prop_fn_map_.at(name);
+    }
+  }
+
+ private:
+  SubgraphPropertyCreateFn __REGISTER__(const std::string& name, SubgraphPropertyCreateFn fn) {
+    CHECK_EQ(prop_fn_map_.count(name), 0U) << "Subgraph property " << name
+                                           << " has been registered";
+    prop_fn_map_[name] = fn;
+    return prop_fn_map_[name];
+  }
+
+  SubgraphPropertyRegistry() = default;
+  SubgraphPropertyRegistry(const SubgraphPropertyRegistry&) = delete;
+  SubgraphPropertyRegistry(SubgraphPropertyRegistry&&) = delete;
+  SubgraphPropertyRegistry& operator=(const SubgraphPropertyRegistry&) = delete;
+  std::unordered_map<std::string, SubgraphPropertyCreateFn> prop_fn_map_;
+};
+
+// This op name set is for setting the names of operators that should be grouped into
+// subgraphs. In practice, every backend accelerator should have a predefined name set.
+// This set is only used for testing purposes.
+// key: property name, value: op name set
+typedef dmlc::ThreadLocalStore<std::unordered_map<std::string, std::unordered_set<std::string>>>
+  SubgraphPropertyOpNameSet;
+
+#define MXNET_REGISTER_SUBGRAPH_PROPERTY(Name, SubgraphPropertyType) \
+  static DMLC_ATTRIBUTE_UNUSED auto __make_ ## SubgraphPropertyType ## _ ## Name ## __ = \
+    SubgraphPropertyRegistry::Get()->__REGISTER_OR_GET__(#Name, &SubgraphPropertyType::Create)
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SUBGRAPH_SUBGRAPH_PROPERTY_H_
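
A minimal sketch of a property built on this interface, under stated
assumptions: the class names, the choice of "elemwise_add", and the stubbed
body of CreateSubgraphNode are illustrative, while the virtual interface, the
Create factory signature, and the registration macro come from the header
above. The selector greedily grows a subgraph across neighbors of one op type:

    #include "subgraph_property.h"

    namespace mxnet {
    namespace op {

    // Sketch: select maximal connected groups of a single op type.
    class SingleOpSelector : public SubgraphSelector {
     public:
      explicit SingleOpSelector(const std::string& op_name) : op_name_(op_name) {}
      bool Select(const nnvm::Node& seed_node) override {
        return !seed_node.is_variable() && seed_node.op()->name == op_name_;
      }
      bool SelectInput(const nnvm::Node& cur_node, const nnvm::Node& input_node) override {
        return Select(input_node);
      }
      bool SelectOutput(const nnvm::Node& cur_node, const nnvm::Node& output_node) override {
        return Select(output_node);
      }
     private:
      std::string op_name_;
    };

    class SingleOpProperty : public SubgraphProperty {
     public:
      // Factory with the signature expected by MXNET_REGISTER_SUBGRAPH_PROPERTY.
      static SubgraphPropertyPtr Create() { return std::make_shared<SingleOpProperty>(); }
      SubgraphSelectorPtr CreateSubgraphSelector() const override {
        return std::make_shared<SingleOpSelector>("elemwise_add");
      }
      nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol& sym,
                                       const int subgraph_id) const override {
        // Backend-specific: create a node whose operator will execute `sym`.
        nnvm::NodePtr n = nnvm::Node::Create();
        n->attrs.name = "subgraph_op" + std::to_string(subgraph_id);
        // ... attach `sym` and the chosen subgraph operator to `n` here ...
        return n;
      }
    };

    MXNET_REGISTER_SUBGRAPH_PROPERTY(SingleOp, SingleOpProperty);

    }  // namespace op
    }  // namespace mxnet
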
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 351315ab0c8..0944d255a45 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -147,6 +147,17 @@ struct BroadcastToParam : public dmlc::Parameter<BroadcastToParam> {
   }
 };
 
+struct BroadcastLikeParam : public dmlc::Parameter<BroadcastLikeParam> {
+  dmlc::optional<TShape> lhs_axes;
+  dmlc::optional<TShape> rhs_axes;
+  DMLC_DECLARE_PARAMETER(BroadcastLikeParam) {
+    DMLC_DECLARE_FIELD(lhs_axes).set_default(dmlc::optional<TShape>())
+      .describe("Axes to perform broadcast on in the first input array");
+    DMLC_DECLARE_FIELD(rhs_axes).set_default(dmlc::optional<TShape>())
+      .describe("Axes to copy from the second input array");
+  }
+};
+
 inline int CheckAxis(int axis, int ndim) {
   CHECK(axis < ndim && axis >= -ndim)
     << "axis " << axis << " exceeds the input dimension of " << ndim;
@@ -350,20 +361,60 @@ inline bool BroadcastLikeShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
   TShape& lhs_shape = (*in_attrs)[0];
   TShape& rhs_shape = (*in_attrs)[1];
-  TShape oshape = TShape(rhs_shape);
-  if (lhs_shape.ndim() == 0 || lhs_shape.ndim() == 0) return false;
 
-  CHECK_EQ(lhs_shape.ndim(), rhs_shape.ndim())
-    << "Operand of shape " << lhs_shape << " cannot be broadcasted to " << rhs_shape;
+  if ((lhs_shape.ndim() == 0) || (rhs_shape.ndim() == 0)) {
+    return false;
+  }
 
-  for (index_t i = 0; i < lhs_shape.ndim(); ++i) {
-    if (rhs_shape[i] != 0) {
-      CHECK(lhs_shape[i] == rhs_shape[i] || lhs_shape[i] == 1)
-        << "Array cannot be broadcasted from " << lhs_shape << " to " << rhs_shape;
-    } else {
-      oshape[i] = lhs_shape[i];
+  const BroadcastLikeParam& param = nnvm::get<BroadcastLikeParam>(attrs.parsed);
+  TShape oshape;
+
+  // lhs_axes, rhs_axes, or both were not specified
+  if (!param.lhs_axes.has_value() || !param.rhs_axes.has_value()) {
+    CHECK_EQ(lhs_shape.ndim(), rhs_shape.ndim())
+      << "Operand of shape " << lhs_shape << " cannot be broadcasted to " << rhs_shape;
+
+    oshape = TShape(rhs_shape);
+    for (index_t i = 0; i < lhs_shape.ndim(); ++i) {
+      if (rhs_shape[i] != 0) {
+        CHECK(lhs_shape[i] == rhs_shape[i] || lhs_shape[i] == 1)
+          << "Array cannot be broadcasted from " << lhs_shape << " to " << rhs_shape;
+      } else {
+        oshape[i] = lhs_shape[i];
+      }
+    }
+  } else {
+    auto lhs_axes = param.lhs_axes.value();
+    auto rhs_axes = param.rhs_axes.value();
+
+    CHECK(rhs_axes.ndim() == lhs_axes.ndim())
+      << "Input_axis and other_axis size does not match";
+
+    CHECK(lhs_axes.ndim() > 0)
+      << "Empty axes tuple is not allowed";
+
+    oshape = TShape(lhs_shape);
+    for (index_t i = 0; i < lhs_axes.ndim(); ++i) {
+      auto copyfrom = lhs_axes[i];
+      if (copyfrom < 0) {
+        copyfrom = lhs_shape.ndim() + copyfrom;
+      }
+      CHECK(copyfrom >= 0 && copyfrom < oshape.ndim())
+        << "Invalid dimension specified in lhs_axes: " << lhs_axes[i];
+
+      auto copyto = rhs_axes[i];
+      if (copyto < 0) {
+        copyto = rhs_shape.ndim() + copyto;
+      }
+      CHECK(copyto >= 0 && copyto < rhs_shape.ndim())
+        << "Invalid dimension specified in rhs_axes: " << rhs_axes[i];
+
+      CHECK(lhs_shape[copyfrom] == 1) << "Input axis " << lhs_axes[i]
+        << " at dimension " << i << " cannot be broadcasted to " << rhs_shape[copyto];
+      oshape[copyfrom] = rhs_shape[copyto];
     }
   }
+
   SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
   return true;
 }
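
To make the new axis-based path in BroadcastLikeShape concrete: with lhs shape
(1, 2, 1, 3), rhs shape (5, 6, 7, 8), lhs_axes=(0, 2), and rhs_axes=(1, 3),
oshape starts as the lhs shape, then oshape[0] = rhs_shape[1] = 6 and
oshape[2] = rhs_shape[3] = 8, giving (6, 2, 8, 3). Every lhs axis being
broadcast must have extent 1, and negative axes wrap around; this exact case
appears in the new test_broadcast_like_axis test further down.
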
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cc b/src/operator/tensor/broadcast_reduce_op_value.cc
index 929c3dfcf0a..c3bc9cfd3f0 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cc
+++ b/src/operator/tensor/broadcast_reduce_op_value.cc
@@ -31,6 +31,7 @@ DMLC_REGISTER_PARAMETER(NormParam);
 DMLC_REGISTER_PARAMETER(ReduceAxisParam);
 DMLC_REGISTER_PARAMETER(BroadcastAxesParam);
 DMLC_REGISTER_PARAMETER(BroadcastToParam);
+DMLC_REGISTER_PARAMETER(BroadcastLikeParam);
 
 inline std::string get_reduce_axes_description(const std::string& op_name, int line) {
   std::string doc = R"code(Computes the __op__ of array elements over given axes.
@@ -309,7 +310,11 @@ For example::
    broadcast_like([[1,2,3]], [[5,6,7],[7,8,9]]) = [[ 1.,  2.,  3.],
                                                    [ 1.,  2.,  3.]])
 
+   broadcast_like([9], [1,2,3,4,5], lhs_axes=(0,), rhs_axes=(-1,)) = [9,9,9,9,9]
+
 )code" ADD_FILELINE)
+.set_attr_parser(ParamParser<BroadcastLikeParam>)
+.add_arguments(BroadcastLikeParam::__FIELDS__())
 .set_attr<nnvm::FInferShape>("FInferShape", BroadcastLikeShape)
 .set_attr<FCompute>("FCompute<cpu>", BroadcastCompute<cpu>);
 
diff --git a/src/operator/tensor/control_flow_op.h b/src/operator/tensor/control_flow_op.h
index 94e65109c35..e9aa9f63fae 100644
--- a/src/operator/tensor/control_flow_op.h
+++ b/src/operator/tensor/control_flow_op.h
@@ -189,6 +189,7 @@ inline bool WhereOpShape(const nnvm::NodeAttrs& attrs,
     return true;
   } else if ((*in_attrs)[0].ndim() == 1) {
     CHECK_EQ((*in_attrs)[0].Size(), static_cast<size_t>(tshape[0]));
+    return true;
   }
   return false;
 }
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 9630988165c..1666537e286 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -179,6 +179,9 @@ The storage type of ``add_n`` output depends on storage types of inputs
   [](const NodeAttrs& attrs) {
     return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
   })
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+#endif
 .set_attr<nnvm::FInferShape>("FInferShape", ElementWiseSumShape)
 .set_attr<nnvm::FInferType>("FInferType", ElementWiseSumType)
 .set_attr<FInferStorageType>("FInferStorageType", ElementWiseSumForwardInferStorageType)
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index e09a6cccddb..eb070a41127 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -299,7 +299,11 @@ class UnaryOp : public OpBase {
         }
         break;
       case kWriteInplace:
+// We cannot check whether the pointers are the same for MKLDNN because we may have
+// created copies of the input when reordering. kWriteInplace will still write to the original array.
+#if MXNET_USE_MKLDNN == 0
         CHECK_EQ(inputs[0].dptr_, outputs[0].dptr_);
+#endif
         break;
       case kNullOp:
         break;
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index f7f21f9076a..c3e9c2dc91d 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -206,6 +206,7 @@ MXNET_OPERATOR_REGISTER_UNARY(_copy)
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
+.set_attr<bool>("TIsMKLDNN", true)
 #endif
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
@@ -225,6 +226,7 @@ NNVM_REGISTER_OP(_backward_copy)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 })
diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h
index 4af3a40f42a..304911a02a7 100644
--- a/src/operator/tensor/init_op.h
+++ b/src/operator/tensor/init_op.h
@@ -123,6 +123,7 @@ struct RangeParam : public dmlc::Parameter<RangeParam> {
   dmlc::optional<double> stop;
   double step;
   int repeat;
+  bool infer_range;
   std::string ctx;
   int dtype;
   DMLC_DECLARE_PARAMETER(RangeParam) {
@@ -140,6 +141,10 @@ struct RangeParam : public dmlc::Parameter<RangeParam> {
     .set_default(1)
     .describe("The repeating time of all elements."
               " E.g repeat=3, the element a will be repeated three times --> a, a, a.");
+    DMLC_DECLARE_FIELD(infer_range)
+    .set_default(false)
+    .describe("Whether to infer the stop position from the start, step, repeat, and output tensor"
+              "size.");
     DMLC_DECLARE_FIELD(ctx)
     .set_default("")
     .describe("Context of output, in format [cpu|gpu|cpu_pinned](n)."
@@ -176,7 +181,7 @@ struct InitOpWithScalarParam : dmlc::Parameter<InitOpWithScalarParam> {
 inline void RangeParamParser(nnvm::NodeAttrs* attrs) {
   RangeParam param;
   param.Init(attrs->dict);
-  if (!static_cast<bool>(param.stop)) {
+  if (!static_cast<bool>(param.infer_range) && !static_cast<bool>(param.stop)) {
     param.stop = param.start;
     param.start = 0;
   }
@@ -471,6 +476,9 @@ inline bool RangeShape(const nnvm::NodeAttrs& attrs,
     << "Range does not support step=0, received " << param.step;
   CHECK(param.repeat > 0)
     << "Range only supports repeat > 0, received " << param.repeat;
+  if (param.infer_range && !param.stop.has_value()) {
+    return false;
+  }
   if (param.step > 0) {
     CHECK(param.start < param.stop.value())
       << "Invalid range (start, stop, step) = "
diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc
index 92d0958c463..6d669c19bca 100644
--- a/tests/cpp/engine/threaded_engine_test.cc
+++ b/tests/cpp/engine/threaded_engine_test.cc
@@ -275,6 +275,64 @@ TEST(Engine, basics) {
   LOG(INFO) << "All pass";
 }
 
+TEST(Engine, VarVersion) {
+  const size_t num_engines = 3;
+  std::vector<mxnet::Engine*> engines(num_engines);
+  engines[0] = mxnet::engine::CreateNaiveEngine();
+  engines[1] = mxnet::engine::CreateThreadedEnginePooled();
+  engines[2] = mxnet::engine::CreateThreadedEnginePerDevice();
+  std::string type_names[3] = {"NaiveEngine", "ThreadedEnginePooled", "ThreadedEnginePerDevice"};
+  for (size_t k = 0; k < num_engines; ++k) {
+    auto engine = engines[k];
+    std::vector<mxnet::Engine::OprHandle> oprs;
+
+    LOG(INFO) << "Testing var as a read dependency in " << type_names[k];
+    auto var = engine->NewVariable();
+    EXPECT_EQ(var->version(), 0U);
+    for (int i = 0; i < 10; ++i) {
+      oprs.push_back(engine->NewOperator(
+          [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+            Foo(ctx, i);
+            cb();
+          },
+          {var}, {}));
+      engine->Push(oprs.at(i), mxnet::Context{});
+    }
+    engine->WaitForAll();
+    EXPECT_EQ(var->version(), 0U);
+    for (auto&& i : oprs) {
+      engine->DeleteOperator(i);
+    }
+    engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+    engine->WaitForAll();
+
+    LOG(INFO) << "Testing var as a write dependency in " << type_names[k];
+    var = engine->NewVariable();
+    EXPECT_EQ(var->version(), 0U);
+    oprs.clear();
+    for (int i = 0; i < 10; ++i) {
+      oprs.push_back(engine->NewOperator(
+          [i](mxnet::RunContext ctx, mxnet::Engine::CallbackOnComplete cb) {
+            Foo(ctx, i);
+            cb();
+          },
+          {}, {var}));
+      engine->Push(oprs.at(i), mxnet::Context{});
+    }
+    engine->WaitForAll();
+    EXPECT_EQ(var->version(), 10U);
+    for (auto&& i : oprs) {
+      engine->DeleteOperator(i);
+    }
+    engine->DeleteVariable([](mxnet::RunContext) {}, mxnet::Context{}, var);
+    engine->WaitForAll();
+
+    var = nullptr;
+    oprs.clear();
+    LOG(INFO) << "All pass";
+  }
+}
+
 #ifdef _OPENMP
 
 struct TestSaveAndRestoreOMPState {
diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py
index 8950a927083..d8ffca25a3f 100644
--- a/tests/nightly/model_backwards_compatibility_check/common.py
+++ b/tests/nightly/model_backwards_compatibility_check/common.py
@@ -29,6 +29,13 @@
 import re
 from mxnet.test_utils import assert_almost_equal
 
+try:
+    cmp             # Python 2
+except NameError:
+    # See: https://docs.python.org/3.0/whatsnew/3.0.html#ordering-comparisons
+    def cmp(x, y):  # Python 3
+        return (x > y) - (x < y)
+
 # Set fixed random seeds.
 mx.random.seed(7)
 np.random.seed(7)
diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
index a60498c8786..5eeb52f516e 100644
--- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
@@ -35,11 +35,13 @@
     'chapter02_supervised-learning/environment',
     'chapter03_deep-neural-networks/kaggle-gluon-kfold',
     'chapter04_convolutional-neural-networks/deep-cnns-alexnet',  # > 10 mins.
+    'chapter05_recurrent-neural-networks/rnns-gluon',  # > 10 mins.
     'chapter06_optimization/gd-sgd-scratch',  # Overflow warning is intended.
     'chapter06_optimization/gd-sgd-gluon',  # Overflow warning is intended.
     'chapter07_distributed-learning/multiple-gpus-scratch',
     'chapter07_distributed-learning/multiple-gpus-gluon',
     'chapter07_distributed-learning/training-with-multiple-machines',
+    'chapter08_computer-vision/visual-question-answer',  # > 10 mins.
     'chapter11_recommender-systems/intro-recommender-systems',  # Early draft, non-working.
     'chapter12_time-series/intro-forecasting-gluon',
     'chapter12_time-series/intro-forecasting-2-gluon',
@@ -176,9 +178,6 @@ def test_lstm_scratch(self):
     def test_gru_scratch(self):
         assert _test_notebook('chapter05_recurrent-neural-networks/gru-scratch')
 
-    def test_rnns_gluon(self):
-        assert _test_notebook('chapter05_recurrent-neural-networks/rnns-gluon')
-
     # Chapter 6
 
     def test_optimization_intro(self):
@@ -228,9 +227,6 @@ def test_object_detection(self):
     def test_fine_tuning(self):
         assert _test_notebook('chapter08_computer-vision/fine-tuning')
 
-    def test_visual_question_answer(self):
-        assert _test_notebook('chapter08_computer-vision/visual-question-answer')
-
     # Chapter 9
 
     def test_tree_lstm(self):
diff --git a/tests/python-pytest/onnx/export/onnx_backend_test.py b/tests/python-pytest/onnx/export/onnx_backend_test.py
index 1fbfde5977e..19bf6993e7c 100644
--- a/tests/python-pytest/onnx/export/onnx_backend_test.py
+++ b/tests/python-pytest/onnx/export/onnx_backend_test.py
@@ -45,6 +45,12 @@
     'test_abs',
     'test_sum',
     'test_tanh',
+    'test_cos',
+    'test_sin',
+    'test_tan',
+    'test_acos',
+    'test_asin',
+    'test_atan',
     'test_ceil',
     'test_floor',
     'test_concat',
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 42d65dab5fd..69375afdfe0 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -111,12 +111,16 @@ def test_gluon_ctc_consistency():
 
 @with_seed()
 def test_global_norm_clip_multi_device():
-    x1 = mx.nd.ones((3,3), ctx=mx.gpu(0))
-    x2 = mx.nd.ones((4,4), ctx=mx.cpu(0))
-    norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
-    assert norm == 5.0
-    assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
-    assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
+    for check_isfinite in [True, False]:
+        x1 = mx.nd.ones((3,3), ctx=mx.gpu(0))
+        x2 = mx.nd.ones((4,4), ctx=mx.cpu(0))
+        norm = gluon.utils.clip_global_norm([x1, x2], 1.0, check_isfinite=check_isfinite)
+        if check_isfinite:
+            assert norm == 5.0
+        else:
+            assert norm.asscalar() == 5.0
+        assert_almost_equal(x1.asnumpy(), np.ones((3, 3)) / 5)
+        assert_almost_equal(x2.asnumpy(), np.ones((4, 4)) / 5)
 
 
 def _check_batchnorm_result(input, num_devices=1, cuda=False):
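
A note on the check_isfinite flag exercised above: the tests treat the
returned norm as a plain scalar when check_isfinite=True and as an NDArray
(hence norm.asscalar()) when check_isfinite=False, and the unit test in
test_gluon.py further down expects a warning about non-finite values only
when check_isfinite=True; skipping the check presumably avoids synchronizing
the norm back to the host.
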
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 5612b0a647e..1fc2c8e922d 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -41,6 +41,7 @@
 from test_sparse_ndarray import *
 from test_sparse_operator import *
 from test_ndarray import *
+from test_subgraph_op import *
 
 set_default_context(mx.gpu(0))
 del test_support_vector_machine_l1_svm  # noqa
@@ -261,7 +262,6 @@ def test_fft():
 
 
 @with_seed()
-@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/10087")
 def test_batchnorm_with_type():
   ctx_list_v1_2D = [
     {'ctx': mx.cpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}},
@@ -287,12 +287,12 @@ def test_batchnorm_with_type():
   ]
 
   ctx_list_v2_3D = [
-    {'ctx': mx.cpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float16}},
-    {'ctx': mx.cpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float32}},
-    {'ctx': mx.cpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float64}},
-    {'ctx': mx.gpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float16}},
-    {'ctx': mx.gpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float32}},
-    {'ctx': mx.gpu(0), 'norm_data': (4, 2, 3, 5, 5), 'type_dict': {'norm_data': np.float64}}
+    {'ctx': mx.cpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float16}},
+    {'ctx': mx.cpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float32}},
+    {'ctx': mx.cpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float64}},
+    {'ctx': mx.gpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float16}},
+    {'ctx': mx.gpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float32}},
+    {'ctx': mx.gpu(0), 'norm_data': (3, 2, 3, 2, 3), 'type_dict': {'norm_data': np.float64}}
   ]
 
   # V1, 2D
@@ -1917,6 +1917,65 @@ def test_softmax_activation():
         assert_almost_equal(cpu_a.grad.asnumpy(), gpu_a.grad.asnumpy(),
                 atol = 1e-3, rtol = 1e-3)
 
+
+@with_seed()
+def test_bilinear_sampler_versions():
+    data = mx.sym.Variable('data')
+    grid = mx.sym.Variable('grid')
+    sym1 = mx.sym.BilinearSampler(data=data, grid=grid)
+    sym2 = mx.sym.BilinearSampler(data=data, grid=grid, cudnn_off=True)
+    sym3 = mx.sym.BilinearSampler(data=data, grid=grid)
+
+    test_cases = [[(1,3,15,16),(1,2,10,10)],
+                 [(1,6,7,16),(1,2,10,4)],
+                 [(1,7,3,16),(1,2,8,11)],
+                 [(1,9,50,50),(1,2,50,50)]]
+
+    for item in test_cases:
+        data_shape, grid_shape = item
+        # kWriteTo
+        exe_cpu = sym1.simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='write')
+        exe_gpu = sym2.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write')
+        exe_cudnn = sym3.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write')
+        exe_list = [exe_cpu, exe_gpu, exe_cudnn]
+        ref_idx = 0
+        test_data = np.random.uniform(low=-0.1, high=0.1,size=data_shape).astype(np.float32)
+        test_grid = np.random.uniform(low=-2, high=2, size=grid_shape).astype(np.float32)
+        for exe in exe_list:
+            exe.arg_dict['data'][:] = test_data
+            exe.arg_dict['grid'][:] = test_grid
+            exe.forward(is_train=True)
+            assert_almost_equal(exe_list[0].outputs[0].asnumpy(), exe.outputs[0].asnumpy(), rtol=1e-3, atol=1e-5)
+
+        out_grad = np.random.uniform(low=-0.01, high=0.01,size=data_shape[:2] + grid_shape[2:]).astype(np.float32)
+        for exe in exe_list:
+            exe.backward(mx.nd.array(out_grad))
+            assert_almost_equal(exe.grad_dict['data'].asnumpy(), exe_list[ref_idx].grad_dict['data'].asnumpy(), rtol=1e-3, atol=1e-5)
+            assert_almost_equal(exe.grad_dict['grid'].asnumpy(), exe_list[ref_idx].grad_dict['grid'].asnumpy(), rtol=1e-3, atol=1e-5)
+
+        data_grad = exe_list[ref_idx].grad_dict['data'].asnumpy()
+        grid_grad = exe_list[ref_idx].grad_dict['grid'].asnumpy()
+
+        # kAddTo
+        exe_cpu_addto = sym1.simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='add')
+        exe_gpu_addto = sym2.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add')
+        exe_cudnn_addto = sym3.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add')
+        exe_list = [exe_cpu_addto, exe_gpu_addto, exe_cudnn_addto]
+        data_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['data'].shape).astype(np.float32)
+        grid_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['grid'].shape).astype(np.float32)
+        for exe in exe_list:
+            exe.arg_dict['data'][:] = test_data
+            exe.arg_dict['grid'][:] = test_grid
+            exe.grad_dict['data'][:] = data_initial_grad
+            exe.grad_dict['grid'][:] = grid_initial_grad
+            exe.forward(is_train=True)
+            exe.backward(mx.nd.array(out_grad))
+            assert_almost_equal(exe.grad_dict['data'].asnumpy(), exe_list[ref_idx].grad_dict['data'].asnumpy(), rtol=1e-3, atol=1e-5)
+            assert_almost_equal(exe.grad_dict['grid'].asnumpy(), exe_list[ref_idx].grad_dict['grid'].asnumpy(), rtol=1e-3, atol=1e-5)
+        assert_almost_equal(exe_list[ref_idx].grad_dict['data'].asnumpy(), data_grad + data_initial_grad, rtol=1e-3, atol=1e-5)
+        assert_almost_equal(exe_list[ref_idx].grad_dict['grid'].asnumpy(), grid_grad + grid_initial_grad, rtol=1e-3, atol=1e-5)
+
+
 def test_context_num_gpus():
     # Test that num_gpus reports at least one GPU, as the test is run on a GPU host.
     assert mx.context.num_gpus() > 0
diff --git a/tests/python/gpu/test_tvm_bridge.py b/tests/python/gpu/test_tvm_bridge.py
index 4b1105a0585..5c87536bdba 100644
--- a/tests/python/gpu/test_tvm_bridge.py
+++ b/tests/python/gpu/test_tvm_bridge.py
@@ -19,6 +19,7 @@
 import logging
 import mxnet as mx
 import numpy as np
+import unittest
 
 def test_tvm_bridge():
     # only enable test if TVM is available
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index 03f3c76bb65..17fc29c8111 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -22,6 +22,7 @@
 import os
 import numpy as np
 import mxnet as mx
+import unittest
 from mxnet.test_utils import rand_ndarray, assert_almost_equal
 from mxnet import gluon
 from mxnet.gluon import nn
@@ -93,7 +94,7 @@ def __getitem__(self, key):
     # below line triggers different execution thread
     for _ in loader:
         y = net(mx.nd.array(np.ones(X))).asnumpy()
-        # output should be 016711406 (non-mkldnn mode output) 
+        # output should be 0.016711406 (non-mkldnn mode output)
         assert_almost_equal(y[0, 0, 0, 0], 0.016711406)
         break
 
@@ -242,6 +243,125 @@ def check_batchnorm_training(stype):
         check_batchnorm_training(stype)
 
 
+@with_seed()
+def test_softmax():
+    def check_softmax_training(stype):
+        for shape in [(2, 3), (2, 3, 2, 2)]:
+            data_tmp = np.random.normal(-0.1, 0.1, size=shape)
+
+            data = mx.symbol.Variable('data', stype=stype)
+            in_location = [mx.nd.array(data_tmp).tostype(stype)]
+
+            test = mx.symbol.softmax(data, axis=-1)
+            check_numeric_gradient(test, in_location, numeric_eps=1e-2, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_softmax_training(stype)
+
+
+@with_seed()
+def test_pooling():
+    def check_pooling_training(stype):
+        for shape in [(3, 3, 10), (3, 3, 20, 20)]:
+            data_tmp = np.random.normal(-0.1, 0.1, size=shape)
+            data = mx.symbol.Variable('data', stype=stype)
+            in_location = [mx.nd.array(data_tmp).tostype(stype)]
+
+            if np.array(shape).shape[0] == 3:
+                test = mx.symbol.Pooling(data=data, kernel=(3,), stride=(2), pool_type='avg')
+            elif np.array(shape).shape[0] == 4:
+                test = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type='avg')
+            else:
+                return 0
+            check_numeric_gradient(test, in_location, numeric_eps=1e-2, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_pooling_training(stype)
+
+
+@with_seed()
+def test_activation():
+    def check_activation_training(stype):
+        for shape in [(2, 3, 3), (2, 3, 2, 2)]:
+            data_tmp = np.random.normal(-0.1, 1, size=shape)
+
+            data = mx.symbol.Variable('data', stype=stype)
+            in_location = [mx.nd.array(data_tmp).tostype(stype)]
+
+            test = mx.symbol.Activation(data, act_type="relu")
+            check_numeric_gradient(test, in_location, numeric_eps=1e-6, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_activation_training(stype)
+
+
+def test_convolution():
+    def check_convolution_training(stype):
+        for shape in [(3, 3, 10), (3, 3, 10, 10)]:
+            data_tmp = np.random.normal(-0.1, 1, size=shape)
+            data = mx.symbol.Variable('data', stype=stype)
+
+            if np.array(shape).shape[0] == 3:
+                test = mx.symbol.Convolution(data=data, kernel=(3,), stride=(2), num_filter=4)
+                weight_tmp = np.random.normal(-0.1, 0.1, size=(4, 3, 3))
+            elif np.array(shape).shape[0] == 4:
+                test = mx.symbol.Convolution(data=data, kernel=(3, 3), stride=(2, 2), num_filter=4)
+                weight_tmp = np.random.normal(-0.1, 0.1, size=(4, 3, 3, 3))
+            else:
+                return 0
+            bias_tmp = np.random.normal(0.1, 0.1, size=(4,))
+            in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(weight_tmp).tostype(stype),
+                           mx.nd.array(bias_tmp).tostype(stype)]
+            check_numeric_gradient(test, in_location, numeric_eps=1e-2, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_convolution_training(stype)
+
+
+def test_Deconvolution():
+    def check_Deconvolution_training(stype):
+        for shape in [(3, 3, 10), (3, 3, 10, 10)]:
+            data_tmp = np.random.randint(256, size=shape)
+            data = mx.symbol.Variable('data', stype=stype)
+
+            if np.array(shape).shape[0] == 3:
+                test = mx.symbol.Deconvolution(data=data, kernel=(3,), stride=(2), num_filter=4)
+                weight_tmp = np.random.normal(-0.1, 0.1, size=(3, 4, 3))
+            elif np.array(shape).shape[0] == 4:
+                test = mx.symbol.Deconvolution(data=data, kernel=(3, 3), stride=(2, 2), num_filter=4)
+                weight_tmp = np.random.normal(-0.1, 0.1, size=(3, 4, 3, 3))
+            else:
+                return 0
+            bias_tmp = np.random.normal(0.1, 0.1, size=(4,))
+            in_location = [mx.nd.array(data_tmp).tostype(stype), mx.nd.array(weight_tmp).tostype(stype),
+                           mx.nd.array(bias_tmp).tostype(stype)]
+            check_numeric_gradient(test, in_location, numeric_eps=1e-2, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_Deconvolution_training(stype)
+
+
+@with_seed()
+def test_LRN():
+    def check_LRN_training(stype):
+        for shape in [(3, 4, 5, 5)]:
+            data_tmp = np.random.normal(-0.1, 0.1, size=shape)
+            data = mx.symbol.Variable('data', stype=stype)
+            in_location = [mx.nd.array(data_tmp).tostype(stype)]
+
+            test = mx.symbol.LRN(data, nsize=3)
+            check_numeric_gradient(test, in_location, numeric_eps=1e-2, rtol=0.16, atol=1e-4)
+
+    stypes = ['row_sparse', 'default']
+    for stype in stypes:
+        check_LRN_training(stype)
+
+
 @with_seed()
 def test_fullyconnected():
     def check_fullyconnected_training(stype):
@@ -260,6 +380,50 @@ def check_fullyconnected_training(stype):
     for stype in stypes:
         check_fullyconnected_training(stype)
 
+@with_seed()
+def test_non_mkldnn_fcomputeex():
+    # Test the special case where an MKLDNN-formatted NDArray feeds into a non-MKLDNN
+    # FComputeEx operator. Convolution is an example where an MKLDNN NDArray is created
+    # from regular NDArrays; CustomOp is an example of a non-MKLDNN FComputeEx operator.
+
+    @mx.operator.register("custom")
+    class CustomProp(mx.operator.CustomOpProp):
+        def __init__(self):
+            super(CustomProp, self).__init__(need_top_grad=False)
+
+        def list_arguments(self):
+            return ['data']
+
+        def list_outputs(self):
+            return ['output']
+
+        def infer_shape(self, in_shape):
+            data_shape = in_shape[0]
+            output_shape = in_shape[0]
+            return [data_shape], [output_shape], []
+
+        def infer_type(self, in_type):
+            dtype = in_type[0]
+            return [dtype], [dtype], []
+
+        def create_operator(self, ctx, shapes, dtypes):
+            return Custom()
+
+
+    class Custom(mx.operator.CustomOp):
+        def forward(self, is_train, req, in_data, out_data, aux):
+            print(in_data[0])
+            self.assign(out_data[0], req[0], in_data[0])
+
+        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+            self.assign(in_grad[0], req[0], out_grad)
+
+    data = mx.symbol.Variable('data')
+    conv = mx.sym.Convolution(data=data, kernel=(5, 5), pad=(1, 1), stride=(1,1), num_filter=8, name="conv", no_bias=True)
+    custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
+    exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
+    exec1.forward()[0].wait_to_read()
+
 
 if __name__ == '__main__':
     install.test_mkldnn_install()
diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py
index 1c23c916197..dd5a4d6d315 100644
--- a/tests/python/unittest/test_contrib_control_flow.py
+++ b/tests/python/unittest/test_contrib_control_flow.py
@@ -2146,6 +2146,15 @@ def func3(data):
         for i in range(len(out1)):
             assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001)
 
+def test_foreach_with_unknown_dim():
+    # MXNet supports using 0 as a placeholder for unknown dimensions in a shape
+    step = lambda data, states: (data + states[0], [states[0] * 2])
+    # input shape with NCHW format and N is unknown
+    data = mx.sym.var('data', shape=(0, 3, 32, 32))
+    states = [mx.sym.var('state')]
+    outs, states = mx.sym.contrib.foreach(step, data, states)
+    _, output_shape, _ = outs.infer_shape_partial()
+    assert_allclose((0, 3, 32, 32), output_shape[0])
 
 if __name__ == '__main__':
     import nose
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 61b441a5f84..bf9f5a77c84 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -735,10 +735,10 @@ def test_sequential_warning():
 @with_seed()
 def test_global_norm_clip():
     stypes = ['default', 'row_sparse']
-    def check_global_norm_clip(stype):
+    def check_global_norm_clip(stype, check_isfinite):
         x1 = mx.nd.ones((3,3)).tostype(stype)
         x2 = mx.nd.ones((4,4)).tostype(stype)
-        norm = gluon.utils.clip_global_norm([x1, x2], 1.0)
+        norm = gluon.utils.clip_global_norm([x1, x2], 1.0, check_isfinite=check_isfinite)
         assert norm == 5.0
         assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5)
         assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5)
@@ -746,11 +746,12 @@ def check_global_norm_clip(stype):
         x3 = mx.nd.array([1.0, 2.0, float('nan')]).tostype(stype)
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
-            gluon.utils.clip_global_norm([x1, x3], 2.0)
-            assert len(w) == 1
+            gluon.utils.clip_global_norm([x1, x3], 2.0, check_isfinite=check_isfinite)
+            assert len(w) == check_isfinite
 
     for stype in stypes:
-        check_global_norm_clip(stype)
+        for check_isfinite in [True, False]:
+            check_global_norm_clip(stype, check_isfinite)
 
 @with_seed()
 def test_embedding():
diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py
index 921a5704d54..28d4ec262c0 100644
--- a/tests/python/unittest/test_kvstore.py
+++ b/tests/python/unittest/test_kvstore.py
@@ -106,6 +106,23 @@ def check_init(kv, key):
     check_init(mx.kv.create(), 3)
     check_init(mx.kv.create(), 'a')
 
+@with_seed()
+def test_pull():
+    """test pull"""
+    def check_pull(kv):
+        a = mx.nd.ones(shape)
+        b = mx.nd.zeros(shape)
+        kv.init('1', mx.nd.zeros(shape))
+        kv.push('1', [a,a,a,a])
+        kv.pull('1', b)
+        check_diff_to_scalar(b, 4)
+        kv.init('2', mx.nd.zeros(shape))
+        kv.pull('2', b)
+        check_diff_to_scalar(b, 0)
+
+    check_pull(mx.kv.create('device'))
+    check_pull(mx.kv.create())
+
 @with_seed()
 def test_list_kv_pair():
     """list key-value pair push & pull"""
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index c9bc0cd1e1e..c48801ec1ce 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -56,9 +56,9 @@ def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=
         if isinstance(out1, mx.nd.NDArray):
             out1 = out1.asnumpy()
         if dtype == np.float16:
-            assert_almost_equal(out1, out2, rtol=2e-3)
+            assert_almost_equal(out1, out2, rtol=2e-3, atol=1e-5)
         else:
-            assert_almost_equal(out1, out2)
+            assert_almost_equal(out1, out2, atol=1e-5)
 
 def random_ndarray(dim):
     shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim))
@@ -119,7 +119,7 @@ def test_ndarray_setitem():
     assert same(x.asnumpy(), x_np)
 
 
-@with_seed(0)
+@with_seed()
 def test_ndarray_elementwise():
     nrepeat = 10
     maxdim = 4
@@ -240,20 +240,18 @@ def test_ndarray_scalar():
     assert(np.sum(d.asnumpy()) < 1e-5)
 
 
-@with_seed(0)
+@with_seed()
 def test_ndarray_pickle():
     maxdim = 5
-    nrepeat = 10
-    for repeat in range(nrepeat):
-        for dim in range(1, maxdim):
-            a = random_ndarray(dim)
-            b = mx.nd.empty(a.shape)
-            a[:] = np.random.uniform(-10, 10, a.shape)
-            b[:] = np.random.uniform(-10, 10, a.shape)
-            a = a + b
-            data = pkl.dumps(a)
-            a2 = pkl.loads(data)
-            assert np.sum(a.asnumpy() != a2.asnumpy()) == 0
+    for dim in range(1, maxdim):
+        a = random_ndarray(dim)
+        b = mx.nd.empty(a.shape)
+        a[:] = np.random.uniform(-10, 10, a.shape)
+        b[:] = np.random.uniform(-10, 10, a.shape)
+        a = a + b
+        data = pkl.dumps(a)
+        a2 = pkl.loads(data)
+        assert np.sum(a.asnumpy() != a2.asnumpy()) == 0
 
 
 @with_seed()
@@ -551,8 +549,27 @@ def test_broadcast_like():
             err = np.square(ndarray_ret - numpy_ret).mean()
             assert err < 1E-8
 
+    def test_broadcast_like_axis():
+        testcases = [
+            # Lhs shape, rhs shape, lhs axis, rhs axis, result
+            [(1, 2, 1, 3), (5, 6, 7, 8), (0,2), (1,3), (6, 2, 8, 3)],
+            [(1,), (5,), (0,), (-1,), (5,)],
+            [(1, 7, 9, 1, 1), (9,), (-2,), (0,), (1, 7, 9, 9, 1)],
+            [(1, 7, 9, 1, 1), (9, 1), (-2, -1), (-2, -1), (1, 7, 9, 9, 1)],
+            [(2, 1), (1, 7, 9, 1, 1), (1,), (-3,), (2, 9)]
+        ]
+
+        for test_data in testcases:
+            lhs = mx.nd.random.uniform(shape=test_data[0])
+            rhs = mx.nd.random.uniform(shape=test_data[1])
+            output = mx.nd.broadcast_like(lhs, rhs, lhs_axes=test_data[2], rhs_axes=test_data[3])
+
+            assert_exception(mx.nd.broadcast_like, mx.base.MXNetError, lhs, rhs, lhs_axes=(), rhs_axes=())
+            assert output.shape == test_data[4]
+
     test_broadcast_to()
     test_broadcast_like()
+    test_broadcast_like_axis()
 
 
 @with_seed()
@@ -622,6 +639,7 @@ def test_arange():
     assert_almost_equal(pred, gt)
 
 @with_seed()
+@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12310")
 def test_order():
     ctx = default_context()
     dat_size = 5
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index e1e5c9e61c2..9842a69e18d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -267,11 +267,8 @@ def test_rnnrelu_dropout():
     out[0].wait_to_read()
 
 def np_softmax(x, axis=-1, temperature=1.0):
-    # fix for old numpy on Travis not supporting keepdims
-    # x = x - np.max(x, axis=-1, keepdims=True)
     x = x - np.max(x, axis=axis, keepdims=True)
     x = np.exp(x/temperature)
-    # x /= np.sum(x, axis=-1, keepdims=True)
     x /= np.sum(x, axis=axis, keepdims=True)
     return x
 
@@ -1586,7 +1583,6 @@ def check_batchnorm_training(stype):
     check_batchnorm_training('default')
 
 
-@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12219")
 @with_seed()
 def test_convolution_grouping():
     for dim in [1, 2, 3]:
@@ -1609,7 +1605,7 @@ def test_convolution_grouping():
         exe1 = y1.simple_bind(default_context(), x=shape)
         exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
         for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
-            arr1[:] = np.random.normal(size=arr1.shape)
+            arr1[:] = np.float32(np.random.normal(size=arr1.shape))
             arr2[:] = arr1
         exe1.forward(is_train=True)
         exe1.backward(exe1.outputs[0])
@@ -1617,7 +1613,7 @@ def test_convolution_grouping():
         exe2.backward(exe2.outputs[0])
 
         for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
-            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
+            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
 
 
 @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12203")
@@ -1844,7 +1840,6 @@ def test_bmod(a, b):
         #c = a % b
         c = mx.sym.cast(a, dtype='float64') % mx.sym.cast(b, dtype='float64')
         # '%' is sensitive to the precision of the calculation.  Force numpy to match mxnet's float32.
-        #check_binary_op_forward(c, lambda a, b: np.float32(a) % np.float32(b), gen_binary_data)
         check_binary_op_forward(c, lambda a, b: np.float32(a) % np.float32(b), gen_binary_data, rtol=0, atol=0)
         check_binary_op_backward(c,
             lambda g_out, a, b: (g_out, - g_out * (np.float32(a) // np.float32(b))), gen_binary_data)
@@ -1913,10 +1908,16 @@ def test_bdiv(a, b):
         check_binary_op_forward(c, lambda a, b: a / b, gen_broadcast_data, mx_nd_func=mx.nd.divide)
         check_binary_op_backward(c, lambda g_out, a, b: (g_out / b, - g_out * a / (b * b)), gen_broadcast_data)
 
-    def test_bmod(a, b):
+    def test_bmod(a_, b_):
+        # Python and numpy operate only in double precision, so to avoid numerical errors
+        # we have to use doubles as well. This test was flaky with float32 (seeds 1688524483, 1768433044).
+        a = mx.sym.cast(a_, dtype='float64')
+        b = mx.sym.cast(b_, dtype='float64')
+        # '%' is sensitive to the precision of the calculation.
         c = mx.sym.broadcast_mod(a, b)
         check_binary_op_forward(c, lambda a, b: a % b, gen_broadcast_data, atol=1, mx_nd_func=mx.nd.modulo)
-        check_binary_op_backward(c, lambda g_out, a, b: (g_out, - g_out * (a // b)), gen_broadcast_data, atol=1)
+        check_binary_op_backward(c,
+                                 lambda g_out, a, b: (g_out, - g_out * (np.float32(a) // np.float32(b))), gen_binary_data)
 
     def test_bmod_int(a, b):
         c = mx.sym.broadcast_mod(mx.sym.cast(a, dtype='int32'), mx.sym.cast(b, dtype='int32'))
@@ -1974,13 +1975,7 @@ def test_bxor(a, b):
     test_bminus(a, b)
     test_bmul(a, b)
     test_bdiv(a, b)
-    '''
-    Flaky Test Disabled due to master build failure:
-    http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline
-    Github Issue: https://github.com/apache/incubator-mxnet/issues/11838
-
     test_bmod(a, b)
-    '''
     test_bmod_int(a, b)
     test_bpow(a, b)
     test_bequal(a, b)
@@ -3121,11 +3116,9 @@ def check_l2_normalization(in_shape, mode, dtype, norm_eps=1e-10):
     # compare numpy + mxnet
     assert_almost_equal(exe.outputs[0].asnumpy(), np_out, rtol=1e-2 if dtype is 'float16' else 1e-5, atol=1e-5)
     # check gradient
-    check_numeric_gradient(out, [in_data], numeric_eps=1e-3, rtol=1e-2, atol=1e-3)
+    check_numeric_gradient(out, [in_data], numeric_eps=1e-3, rtol=1e-2, atol=5e-3)
 
 
-# @haojin2: getting rid of the fixed seed as the flakiness could not be reproduced.
-# tracked at: https://github.com/apache/incubator-mxnet/issues/11717
 @with_seed()
 def test_l2_normalization():
     for dtype in ['float16', 'float32', 'float64']:
@@ -3646,10 +3639,18 @@ def test_arange():
                 nd_out = mx.nd.arange(*config, repeat=repeats, dtype=dtype)
                 assert_almost_equal(np_out, nd_out.asnumpy())
 
+    def test_arange_inferstop():
+        s = mx.sym.arange(start=0, stop=None, infer_range=True)
+        s = mx.sym.elemwise_add(s, mx.sym.zeros(shape=[5]))
+        exe = s.bind(ctx=mx.cpu(), args={})
+        exe.forward()
+        assert_almost_equal(exe.outputs[0].asnumpy(), np.array([0,1,2,3,4]))
+
     test_basic_val_init(mx.sym.zeros, np.zeros, (3, 4), np.float32)
     test_basic_val_init(mx.sym.ones, np.ones, 3, np.int32)
     test_basic_val_init(mx.sym.ones, np.ones, (2, 2, 3), np.float16)
     test_arange()
+    test_arange_inferstop()
 
 
 @with_seed()
@@ -3944,166 +3945,6 @@ def test_grid_generator():
         assert_almost_equal(exe_add.grad_dict['flow'].asnumpy(), grad_est + flow_grad_npy, rtol=1e-3, atol=1e-5)
 
 
-@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12248")
-def test_bilinear_sampler():
-    from math import floor
-
-    def between(x, lowerbound, upperbound):
-        return x>=lowerbound and x<=upperbound
-
-    def bilinear_forward_numpy(data, grid):
-
-        batchsize = data.shape[0]
-        input_height = data.shape[2]
-        input_width = data.shape[3]
-        num_channel = data.shape[1]
-
-        output_height = grid.shape[2]
-        output_width = grid.shape[3]
-        out = np.zeros(data.shape[:2] + grid.shape[2:], dtype=np.float32)
-
-        for i in range(batchsize):
-            for yout in range(output_height):
-                for xout in range(output_width):
-
-                    xcoord = np.float32((grid[i, 0, yout, xout] + 1) * (input_width-1) / 2.0)
-                    ycoord = np.float32((grid[i, 1, yout, xout] + 1) * (input_height-1) / 2.0)
-
-                    xInTopLeft = int(floor(xcoord))
-                    xWeightTopLeft = np.float32(1-(xcoord - xInTopLeft))
-
-                    yInTopLeft = int(floor(ycoord))
-                    yWeightTopLeft = np.float32(1-(ycoord - yInTopLeft))
-
-                    # interpolation
-                    for channel in range(num_channel):
-
-                        inTopLeft = data[i,channel,yInTopLeft, xInTopLeft] \
-                            if between(xInTopLeft,0,input_width-1) and between(yInTopLeft,0,input_height-1) else 0.0
-                        inTopRight = data[i,channel,yInTopLeft, xInTopLeft+1] \
-                            if between(xInTopLeft+1,0,input_width-1) and between(yInTopLeft,0,input_height-1) else 0.0
-                        inBottomLeft = data[i,channel,yInTopLeft+1, xInTopLeft] \
-                            if between(xInTopLeft,0,input_width-1) and between(yInTopLeft+1,0,input_height-1) else 0.0
-                        inBottomRight = data[i,channel,yInTopLeft+1, xInTopLeft+1] \
-                            if between(xInTopLeft+1,0,input_width-1) and between(yInTopLeft+1,0,input_height-1) else 0.0
-
-                        out[i,channel,yout,xout] = xWeightTopLeft * yWeightTopLeft * inTopLeft\
-                                +  (1-xWeightTopLeft)*yWeightTopLeft * inTopRight\
-                                +  xWeightTopLeft * (1-yWeightTopLeft) * inBottomLeft\
-                            +(1-xWeightTopLeft) * (1-yWeightTopLeft) * inBottomRight
-        return out
-
-    def bilinear_backward_numpy(out_grad, data, grid):
-
-        data_grad = np.zeros(data.shape, dtype=np.float32)
-        grid_grad = np.zeros(grid.shape, dtype=np.float32)
-
-        batchsize = data.shape[0]
-        input_height = data.shape[2]
-        input_width = data.shape[3]
-        num_channel = data.shape[1]
-        output_height = grid.shape[2]
-        output_width = grid.shape[3]
-
-        for i in range(batchsize):
-            for yout in range(output_height):
-                for xout in range(output_width):
-
-                    top_left_y_gw = np.float32(0.0);
-                    top_left_x_gw = np.float32(0.0);
-
-                    xcoord = np.float32((grid[i, 0, yout, xout] + 1) * (input_width-1) / 2.0)
-                    ycoord = np.float32((grid[i, 1, yout, xout] + 1) * (input_height-1) / 2.0)
-
-                    xInTopLeft = int(floor(xcoord))
-                    xWeightTopLeft = np.float32(1-(xcoord - xInTopLeft))
-
-                    yInTopLeft = int(floor(ycoord))
-                    yWeightTopLeft = np.float32(1-(ycoord - yInTopLeft))
-
-                    topLeftDotProduct = np.float32(0)
-                    topRightDotProduct = np.float32(0)
-                    bottomLeftDotProduct = np.float32(0)
-                    bottomRightDotProduct = np.float32(0)
-
-                    for channel in range(num_channel):
-                        # left top
-                        if between(xInTopLeft,0,input_width-1) and between(yInTopLeft,0,input_height-1):
-                            topLeftDotProduct += data[i,channel,yInTopLeft, xInTopLeft] * \
-                                out_grad[i,channel,yout,xout]
-                            data_grad[i, channel, yInTopLeft, xInTopLeft] += xWeightTopLeft * \
-                                yWeightTopLeft * out_grad[i,channel,yout,xout]
-                        # right top
-                        if between(xInTopLeft+1,0,input_width-1) and between(yInTopLeft,0,input_height-1):
-                            topRightDotProduct += data[i, channel, yInTopLeft,xInTopLeft+1] * \
-                                out_grad[i, channel, yout,xout]
-                            data_grad[i, channel,yInTopLeft, xInTopLeft+1] += (1-xWeightTopLeft) * \
-                                yWeightTopLeft * out_grad[i,channel,yout,xout]
-                        # left bottom
-                        if between(xInTopLeft,0,input_width-1) and between(yInTopLeft+1,0,input_height-1):
-                            bottomLeftDotProduct += data[i, channel,yInTopLeft+1, xInTopLeft] * \
-                                out_grad[i,channel,yout,xout]
-                            data_grad[i,channel,yInTopLeft+1,xInTopLeft]+=xWeightTopLeft * \
-                                (1-yWeightTopLeft)* out_grad[i,channel,yout,xout]
-                        # right bottom
-                        if between(xInTopLeft+1,0,input_width-1) and between(yInTopLeft+1,0,input_height-1):
-                            bottomRightDotProduct += data[i,channel,yInTopLeft+1, xInTopLeft+1] * \
-                                out_grad[i,channel,yout,xout]
-                            data_grad[i,channel,yInTopLeft+1,xInTopLeft+1]+= (1-xWeightTopLeft) * \
-                                (1-yWeightTopLeft)*out_grad[i,channel,yout,xout]
-
-                    yf = np.float32(-xWeightTopLeft * topLeftDotProduct + xWeightTopLeft*bottomLeftDotProduct - \
-                        (1-xWeightTopLeft)* topRightDotProduct + (1-xWeightTopLeft)*bottomRightDotProduct)
-                    xf = np.float32(-yWeightTopLeft * topLeftDotProduct + yWeightTopLeft*topRightDotProduct - \
-                        (1-yWeightTopLeft)*bottomLeftDotProduct + (1-yWeightTopLeft)*bottomRightDotProduct)
-
-                    grid_grad[i,0,yout,xout] = xf * (input_width-1) / 2.0
-                    grid_grad[i,1,yout,xout] = yf * (input_height-1) / 2.0
-
-        return data_grad, grid_grad
-
-    data = mx.sym.Variable('data')
-    grid = mx.sym.Variable('grid')
-    net = mx.sym.BilinearSampler(data=data,grid=grid)
-
-    test_case = [[(1,3,15,16),(1,2,10,10)],
-                 [(1,6,7,16),(1,2,10,4)],
-                 [(1,7,3,16),(1,2,8,11)],
-                 [(1,9,50,50),(1,2,50,50)]]
-
-    for ctx in [default_context()]:
-        for item in test_case:
-            data_shape, grid_shape = item
-            exe = net.simple_bind(data=data_shape,grid=grid_shape,ctx=ctx,grad_req='write')
-            # check forward
-            exe.arg_dict['data'][:] = np.random.uniform(low=-0.1, high=0.1,size=data_shape).astype(np.float32)
-            exe.arg_dict['grid'][:] = np.random.uniform(low=-2, high=2, size=grid_shape).astype(np.float32)
-            exe.forward(is_train=True)
-            out = bilinear_forward_numpy(exe.arg_dict['data'].asnumpy(), exe.arg_dict['grid'].asnumpy())
-            assert_almost_equal(exe.outputs[0].asnumpy(), out, rtol=1e-3,atol=1e-5)
-
-            # check backward
-            out_grad = np.random.uniform(low=-0.01, high=0.01,size=data_shape[:2] + grid_shape[2:]).astype(np.float32)
-            exe.backward(mx.nd.array(out_grad))
-            data_grad, grid_grad = bilinear_backward_numpy(out_grad,exe.arg_dict['data'].asnumpy(),
-                                                       exe.arg_dict['grid'].asnumpy())
-            assert_almost_equal(exe.grad_dict['data'].asnumpy(), data_grad, rtol=1e-3, atol=1e-5)
-            assert_almost_equal(exe.grad_dict['grid'].asnumpy(), grid_grad, rtol=1e-3, atol=1e-5)
-
-            # check kAddTo
-            exe_addto = net.simple_bind(data=data_shape, grid=grid_shape, ctx=ctx, grad_req='add')
-            data_initial_grid = np.random.normal(size=exe_addto.grad_dict['data'].shape).astype(np.float32)
-            grid_initial_grid = np.random.normal(size=exe_addto.grad_dict['grid'].shape).astype(np.float32)
-            exe_addto.arg_dict['data'][:] = exe.arg_dict['data'][:]
-            exe_addto.arg_dict['grid'][:] = exe.arg_dict['grid'][:]
-            exe_addto.grad_dict['data'][:] = data_initial_grid
-            exe_addto.grad_dict['grid'][:] = grid_initial_grid
-            exe_addto.forward(is_train=True)
-            exe_addto.backward(mx.nd.array(out_grad))
-            assert_almost_equal(exe_addto.grad_dict['data'].asnumpy(), data_grad + data_initial_grid, rtol=1e-3,atol=1e-5)
-            assert_almost_equal(exe_addto.grad_dict['grid'].asnumpy(), grid_grad + grid_initial_grid, rtol=1e-3,atol=1e-5)
-
-
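Although the reference implementation above is removed along with the flaky test, the coordinate convention it encodes is worth keeping in mind: grid values in [-1, 1] map to pixel coordinates via x = (g_x + 1) * (W - 1) / 2 (likewise for y), and each output is the bilinear blend of the four neighbouring pixels. A compact numpy sketch of just that mapping:

    import numpy as np

    def grid_to_pixel(g, size):
        # Map a normalized grid coordinate in [-1, 1] to pixel space.
        return (np.float32(g) + 1) * (size - 1) / 2.0

    assert grid_to_pixel(-1.0, 16) == 0.0   # left/top edge
    assert grid_to_pixel(1.0, 16) == 15.0   # right/bottom edge
    assert grid_to_pixel(0.0, 16) == 7.5    # centre falls between pixels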
 @with_seed()
 def test_index2d():
     for _ in range(30):
@@ -4507,6 +4348,14 @@ def test_invalid_shape():
                                              y=mx.nd.array([[8,9],[10,11],[12,13]]),
                                              condition=mx.nd.array([1,0])), MXNetError)
 
+    def test_1d_cond():
+        cond = mx.nd.array([1, 0, 1])
+        x = mx.nd.array([[2, 3], [4, 5], [6, 7]])
+        y = mx.nd.array([[7, 8], [9, 10], [10, 11]])
+        expect_out = np.array([[2, 3], [9, 10], [6, 7]])
+        out = mx.nd.where(cond, x, y).asnumpy()
+        assert (out == expect_out).all()
+
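The new test_1d_cond covers mx.nd.where with a 1-d condition, which selects whole rows from x and y. A numpy sketch of the intended semantics (broadcasting the condition over rows is an assumption used here for illustration):

    import numpy as np

    cond = np.array([1, 0, 1], dtype=bool)
    x = np.array([[2, 3], [4, 5], [6, 7]])
    y = np.array([[7, 8], [9, 10], [10, 11]])
    # Rows with a true condition come from x, the rest from y.
    out = np.where(cond[:, None], x, y)
    assert (out == np.array([[2, 3], [9, 10], [6, 7]])).all()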
     test_where_helper((5, 9), True)
     test_where_helper((5, 9), False)
     test_where_helper((5, 7, 9), True)
@@ -4518,6 +4367,28 @@ def test_invalid_shape():
     test_where_numeric_gradient((5, 7, 9), True)
     test_where_numeric_gradient((5, 7, 9), False)
     test_invalid_shape()
+    test_1d_cond()
+
+
+@with_seed()
+def test_softmin():
+    for ndim in range(1, 5):
+        for dtype in [np.float16, np.float32, np.float64]:
+            rtol, atol = (1e-2, 5e-3) if dtype is np.float16 else (1e-3, 1e-3)
+            shape = np.random.randint(1, 5, size=ndim)
+            axis = np.random.randint(-ndim, ndim)
+            data = np.random.uniform(-2, 2, size=shape).astype(dtype)
+            data = data / 10 if dtype is np.float16 else data
+            sym = mx.sym.softmin(axis=axis)
+            expected_fwd = np_softmax(-data, axis=axis)
+            expected_bwd = np.zeros(shape)
+            check_symbolic_forward(sym, [data], [expected_fwd], atol=atol, dtype=dtype)
+            for req in ['null', 'add', 'write']:
+                check_symbolic_backward(sym, [data], [np.ones(expected_fwd.shape)], [expected_bwd],
+                                        rtol=rtol, atol=atol, grad_req=req, dtype=dtype)
+            if dtype is not np.float16:
+                check_numeric_gradient(sym, [data], rtol=rtol, atol=atol, dtype=dtype)
+
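Two identities drive the expectations in test_softmin: the forward is softmin(x) = softmax(-x), and the expected backward is zero because an all-ones upstream gradient lies in the null space of the softmax Jacobian. A self-contained numpy sketch of the forward identity (np_softmax_ref stands in for the harness's np_softmax):

    import numpy as np

    def np_softmax_ref(x, axis=-1):
        # Numerically stable softmax.
        x = x - x.max(axis=axis, keepdims=True)
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    x = np.random.uniform(-2, 2, (3, 4)).astype(np.float32)
    smin = np_softmax_ref(-x, axis=-1)
    np.testing.assert_allclose(smin.sum(axis=-1), np.ones(3), rtol=1e-5)
    # The smallest inputs receive the largest weights.
    assert (smin.argmax(axis=-1) == x.argmin(axis=-1)).all()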
 
 @with_seed()
 def test_new_softmax():
@@ -5728,6 +5599,7 @@ def test_stack():
 
 
 @with_seed()
+@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12329")
 def test_dropout():
     def zero_count(array, ratio):
         zeros = 0
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 449cdb42346..496a61f356b 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -1040,6 +1040,58 @@ def test_adagrad():
                                               g_stype='row_sparse')
 
 
+def test_factor_scheduler():
+    base_lr = 1
+    step = 100
+    factor = 0.1
+    sched = mx.lr_scheduler.FactorScheduler(step, factor, stop_factor_lr=1e-4, base_lr=base_lr,
+                                        warmup_steps=20, warmup_begin_lr=0.1, warmup_mode='constant')
+
+    assert (sched(0) == 0.1)
+    np.testing.assert_almost_equal(sched(10), 0.1)
+    assert (sched(21) == base_lr), sched(21)
+    np.testing.assert_almost_equal(sched(101), base_lr * factor)
+    np.testing.assert_almost_equal(sched(201), base_lr * factor * factor)
+    np.testing.assert_almost_equal(sched(1000), 1e-4)
+
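One closed form consistent with the values test_factor_scheduler asserts (the real FactorScheduler updates its state lazily, so points other than the asserted ones may differ):

    def factor_lr(num_update, base_lr=1.0, step=100, factor=0.1,
                  stop_factor_lr=1e-4, warmup_steps=20, warmup_begin_lr=0.1):
        # warmup_mode='constant' holds the warmup lr flat.
        if num_update < warmup_steps:
            return warmup_begin_lr
        lr = base_lr * factor ** ((num_update - 1) // step)
        return max(lr, stop_factor_lr)

    assert factor_lr(10) == 0.1
    assert factor_lr(21) == 1.0
    assert factor_lr(1000) == 1e-4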
+def test_multifactor_scheduler():
+    base_lr = 0.1
+    steps = [15, 25]
+    factor = 0.1
+    sched = mx.lr_scheduler.MultiFactorScheduler(steps, factor, base_lr=base_lr,
+                                        warmup_steps=10, warmup_begin_lr=0.05, warmup_mode='linear')
+
+    assert sched(0) == 0.05
+    np.testing.assert_almost_equal(sched(5), 0.05 + (base_lr - 0.05)/2)
+    np.testing.assert_almost_equal(sched(15), base_lr)
+    np.testing.assert_almost_equal(sched(16), base_lr * factor)
+    np.testing.assert_almost_equal(sched(20), base_lr * factor)
+    np.testing.assert_almost_equal(sched(26), base_lr * factor * factor)
+    np.testing.assert_almost_equal(sched(100), base_lr * factor * factor)
+
+def test_poly_scheduler():
+    base_lr = 3
+    final_lr = 0
+    steps = 1000
+    poly_sched = mx.lr_scheduler.PolyScheduler(steps, base_lr=base_lr, pwr=2, final_lr=final_lr,
+                                    warmup_steps=100, warmup_begin_lr=0, warmup_mode='linear')
+
+    np.testing.assert_almost_equal(poly_sched(0), 0)
+    np.testing.assert_almost_equal(poly_sched(50), float(base_lr)/2)
+    np.testing.assert_almost_equal(poly_sched(100), base_lr)
+    assert (poly_sched(101) < poly_sched(100))
+    assert (poly_sched(500) < 1.6)
+    np.testing.assert_almost_equal(poly_sched(steps), final_lr)
+
+def test_cosine_scheduler():
+    # also tests case without warmup
+    base_lr = 3
+    final_lr = 0.1
+    steps = 1000
+    cosine_sched = mx.lr_scheduler.CosineScheduler(steps, base_lr=base_lr, final_lr=final_lr)
+    np.testing.assert_almost_equal(cosine_sched(0), base_lr)
+    np.testing.assert_almost_equal(cosine_sched(steps), final_lr)
+    assert (cosine_sched(500) > 1.5)
 
 if __name__ == '__main__':
     import nose
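For orientation, the cosine test above matches the standard half-cosine decay lr(t) = final_lr + (base_lr - final_lr) * (1 + cos(pi * t / steps)) / 2, and the poly test matches lr(t) = base_lr * (1 - t / steps)^pwr after its warmup. A sketch of the cosine form (an assumed formulation; the MXNet implementation may treat edge cases differently):

    import math

    def cosine_lr(t, steps=1000, base_lr=3.0, final_lr=0.1):
        if t >= steps:
            return final_lr
        return final_lr + (base_lr - final_lr) * (1 + math.cos(math.pi * t / steps)) / 2

    assert abs(cosine_lr(0) - 3.0) < 1e-12
    assert abs(cosine_lr(1000) - 0.1) < 1e-12
    assert cosine_lr(500) > 1.5   # midpoint is well above final_lr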
diff --git a/tests/python/unittest/test_subgraph_op.py b/tests/python/unittest/test_subgraph_op.py
new file mode 100644
index 00000000000..40d609ad354
--- /dev/null
+++ b/tests/python/unittest/test_subgraph_op.py
@@ -0,0 +1,238 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import ctypes
+import mxnet as mx
+from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array, c_str
+from mxnet.symbol import Symbol
+import numpy as np
+from mxnet.test_utils import assert_almost_equal
+
+
+def test_subgraph_exe():
+    def _check_subgraph_exe1(sym, op_names):
+        """Use the partitioned sym to simple_bind an executor and compare the outputs
+        with those of the original executor"""
+        out = SymbolHandle()
+        check_call(_LIB.MXPartitionGraphByOpNames(sym.handle, c_str('default'), mx_uint(len(op_names)),
+                                                  c_str_array(op_names), ctypes.byref(out)))
+
+        partitioned_sym = Symbol(out)
+        assert partitioned_sym.list_inputs() == sym.list_inputs()
+        assert partitioned_sym.list_arguments() == sym.list_arguments()
+        assert partitioned_sym.list_auxiliary_states() == sym.list_auxiliary_states()
+        exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+        partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+        input_names = sym.list_inputs()
+        for name in input_names:
+            if name in exe.arg_dict:
+                exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)
+                partitioned_exe.arg_dict[name][:] = exe.arg_dict[name]
+            else:
+                assert name in exe.aux_dict
+                exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)
+                partitioned_exe.aux_dict[name][:] = exe.aux_dict[name]
+        exe.forward()
+        partitioned_exe.forward()
+        assert len(exe.outputs) == len(partitioned_exe.outputs)
+        for i in range(len(exe.outputs)):
+            assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
+                                np.zeros(shape=(1,)))
+
+    def _check_subgraph_exe2(sym, op_names):
+        """Use env var MXNET_SUBGRAPH_BACKEND=default to trigger graph partitioning in simple_bind
+        and compare results of the partitioned sym and the original sym."""
+        def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
+            if subgraph_backend is not None:
+                os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
+                check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend), mx_uint(len(op_names)),
+                                                             c_str_array(op_names)))
+            exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+            input_names = sym.list_inputs()
+            for name in input_names:
+                if name in exe.arg_dict:
+                    exe.arg_dict[name][:] = mx.nd.random.uniform(shape=exe.arg_dict[name].shape)\
+                        if original_exec is None else original_exec.arg_dict[name]
+                else:
+                    assert name in exe.aux_dict
+                    exe.aux_dict[name][:] = mx.nd.random.uniform(shape=exe.aux_dict[name].shape)\
+                        if original_exec is None else original_exec.aux_dict[name]
+            exe.forward()
+            if subgraph_backend is not None:
+                check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
+                del os.environ['MXNET_SUBGRAPH_BACKEND']
+            return exe
+
+        original_exec = get_executor(sym)
+        partitioned_exec = get_executor(sym, 'default', op_names, original_exec)
+        outputs1 = original_exec.outputs
+        outputs2 = partitioned_exec.outputs
+        assert len(outputs1) == len(outputs2)
+        for i in range(len(outputs1)):
+            assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), np.zeros(shape=(1,)))
+
+    def _check_subgraph_exe3(sym, op_names):
+        """Use the partitioned sym to bind an executor and compare the outputs
+        with those of the original executor"""
+        out = SymbolHandle()
+        check_call(_LIB.MXPartitionGraphByOpNames(sym.handle, c_str('default'), mx_uint(len(op_names)),
+                                                  c_str_array(op_names), ctypes.byref(out)))
+
+        partitioned_sym = Symbol(out)
+        input_names = sym.list_inputs()
+        arg_names = sym.list_arguments()
+        aux_names = sym.list_auxiliary_states()
+        assert partitioned_sym.list_inputs() == input_names
+        assert partitioned_sym.list_arguments() == arg_names
+        assert partitioned_sym.list_auxiliary_states() == aux_names
+        arg_shapes, _, aux_shapes = sym.infer_shape()
+        arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
+        aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
+        exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
+        partitioned_exe = partitioned_sym.bind(ctx=mx.current_context(), args=arg_array,
+                                               aux_states=aux_array, grad_req='null')
+        exe.forward()
+        partitioned_exe.forward()
+        assert len(exe.outputs) == len(partitioned_exe.outputs)
+        for i in range(len(exe.outputs)):
+            assert_almost_equal((exe.outputs[i] - partitioned_exe.outputs[i]).abs().sum().asnumpy(),
+                                np.zeros(shape=(1,)))
+
+    def _check_subgraph_exe4(sym, op_names):
+        """Use env var MXNET_SUBGRAPH_BACKEND=default to trigger graph partitioning in bind
+        and compare results of the partitioned sym and the original sym."""
+        def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
+            if subgraph_backend is not None:
+                os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
+                check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend), mx_uint(len(op_names)),
+                                                             c_str_array(op_names)))
+            arg_shapes, _, aux_shapes = sym.infer_shape()
+            if subgraph_backend is None:
+                arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
+                aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
+            else:
+                arg_array = None
+                aux_array = None
+            exe = sym.bind(ctx=mx.current_context(),
+                           args=arg_array if subgraph_backend is None else original_exec.arg_arrays,
+                           aux_states=aux_array if subgraph_backend is None else original_exec.aux_arrays,
+                           grad_req='null')
+            exe.forward()
+            if subgraph_backend is not None:
+                check_call(_LIB.MXRemoveSubgraphPropertyOpNames(c_str(subgraph_backend)))
+                del os.environ['MXNET_SUBGRAPH_BACKEND']
+            return exe
+
+        original_exec = get_executor(sym)
+        partitioned_exec = get_executor(sym, 'default', op_names, original_exec)
+        outputs1 = original_exec.outputs
+        outputs2 = partitioned_exec.outputs
+        assert len(outputs1) == len(outputs2)
+        for i in range(len(outputs1)):
+            assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), np.zeros(shape=(1,)))
+
+    def check_subgraph_exe(sym, op_names):
+        _check_subgraph_exe1(sym, op_names)
+        _check_subgraph_exe2(sym, op_names)
+        _check_subgraph_exe3(sym, op_names)
+        _check_subgraph_exe4(sym, op_names)
+
+    def test_network_structure_1():
+        data1 = mx.sym.var('data1', shape=(2, 3, 10, 10))
+        data2 = mx.sym.var('data2')
+        conv1 = mx.sym.Convolution(data=data1, weight=data2, no_bias=True, kernel=(2, 2), num_filter=1)
+        conv2 = mx.sym.Convolution(data=data2, no_bias=True, kernel=(1, 1), num_filter=1)
+        out = mx.sym.Group([conv1, conv2])
+        check_subgraph_exe(out, ['Convolution'])
+
+    def test_network_structure_2():
+        # this tests whether the partitioning algorithm can deal with cycles
+        data = mx.sym.var('data', shape=(2, 3, 10, 10))
+        ret = mx.sym.exp(data)
+        ret1 = mx.sym.cos(ret)
+        ret2 = mx.sym.sin(ret)
+        ret = ret1 + ret2
+        check_subgraph_exe(ret, ['exp', 'sin', '_Plus', 'elemwise_add', '_plus'])
+        check_subgraph_exe(ret, ['exp', 'cos', '_Plus', 'elemwise_add', '_plus'])
+
+    def test_network_structure_3():
+        # this tests whether the partitioned sym can distinguish in_args and aux_states
+        data = mx.sym.var('data', shape=(2, 3, 10, 10))
+        ret = mx.sym.exp(data)
+        ret1 = mx.sym.cos(ret)
+        ret2 = mx.sym.sin(ret)
+        ret = ret1 + ret2
+        ret = mx.sym.BatchNorm(ret)
+        ret = mx.sym.BatchNorm(ret)
+        check_subgraph_exe(ret, ['exp', 'sin', '_Plus', 'elemwise_add', '_plus'])
+        check_subgraph_exe(ret, ['exp', 'cos', '_Plus', 'elemwise_add', '_plus'])
+        check_subgraph_exe(ret, ['exp', 'sin', '_Plus', 'elemwise_add', '_plus', 'BatchNorm'])
+        check_subgraph_exe(ret, ['exp', 'cos', '_Plus', 'elemwise_add', '_plus', 'BatchNorm'])
+        check_subgraph_exe(ret, ['exp', 'BatchNorm'])
+        check_subgraph_exe(ret, ['BatchNorm'])
+
+    def test_network_structure_4():
+        # the last op has multiple duplicate outputs
+        data = mx.sym.var('data', shape=(2, 3, 10, 10))
+        ret = mx.sym.exp(data)
+        ret = mx.sym.Group([ret, ret, ret])
+        check_subgraph_exe(ret, ['exp'])
+
+    def test_network_structure_5():
+        # the subgraph has two duplicate input entries
+        data = mx.sym.var('data', shape=(2, 3, 10, 10))
+        ret = data + data
+        check_subgraph_exe(ret, ['_plus', '_Plus', 'elemwise_add'])
+
+    def test_network_structure_6():
+        def get_graph():
+            data1 = mx.sym.Variable('data1', shape=(3, 3, 10, 10), dtype=np.float32)
+            data2 = mx.sym.Variable('data2', shape=(1, 0, 2, 2))
+            data3 = mx.sym.sin(data2)
+            conv = mx.sym.Convolution(data=data1, weight=data3, kernel=(2, 2), num_filter=1)
+            rets = [(conv, []),
+                    (conv, [mx.sym.sin.__name__]),
+                    (conv, [mx.sym.Convolution.__name__]),
+                    (conv, [mx.sym.sin.__name__, mx.sym.Convolution.__name__])]
+            return rets
+
+        for sym, op_names in get_graph():
+            check_subgraph_exe(sym, op_names)
+
+    def test_network_structure_7():
+        # in this graph, the subgraph node and the other two external nodes form a cycle
+        data = mx.sym.Variable('data', shape=(1,))
+        ret1 = mx.sym.sin(data)
+        ret2 = mx.sym.cos(ret1)
+        for _ in range(5):
+            ret2 = mx.sym.cos(ret2)
+        ret = ret1 + ret2
+        check_subgraph_exe(ret, ['sin', 'elemwise_add', '_plus', '_Plus'])
+
+    test_network_structure_1()
+    test_network_structure_2()
+    test_network_structure_3()
+    test_network_structure_4()
+    test_network_structure_5()
+    test_network_structure_6()
+    test_network_structure_7()
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
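Beyond the C-API used in the helpers, _check_subgraph_exe2/4 exercise the user-facing path: setting MXNET_SUBGRAPH_BACKEND makes simple_bind/bind partition eligible operators into subgraph nodes transparently. A hedged usage sketch (which operators actually get grabbed depends on the backend's op list, which the tests pin down via MXSetSubgraphPropertyOpNames):

    import os
    import mxnet as mx

    os.environ['MXNET_SUBGRAPH_BACKEND'] = 'default'
    data = mx.sym.var('data', shape=(2, 3, 10, 10))
    sym = mx.sym.cos(mx.sym.exp(data))
    # simple_bind partitions the graph; the outputs should match an
    # executor bound without the env var set.
    exe = sym.simple_bind(ctx=mx.cpu(), grad_req='null')
    exe.arg_dict['data'][:] = mx.nd.random.uniform(shape=(2, 3, 10, 10))
    out = exe.forward()[0]
    del os.environ['MXNET_SUBGRAPH_BACKEND']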
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index aece9a37812..d022c68237a 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -171,7 +171,7 @@ def test_symbol_infer_shape_var():
 def test_symbol_fluent():
     has_grad = set(['flatten', 'expand_dims', 'flip', 'tile', 'transpose', 'sum', 'nansum', 'prod',
                     'nanprod', 'mean', 'max', 'min', 'reshape', 'broadcast_to', 'split',
-                    'broadcast_axes', 'pad', 'swapaxes', 'slice', 'slice_axis', 'slice_like',
+                    'broadcast_axes', 'broadcast_like', 'pad', 'swapaxes', 'slice', 'slice_axis', 'slice_like',
                     'take', 'one_hot', 'pick', 'sort', 'topk', 'argsort', 'argmax', 'argmin',
                     'clip', 'abs', 'sign', 'sin', 'cos', 'tan', 'arcsin', 'arccos', 'arctan',
                     'degrees', 'radians', 'sinh', 'cosh', 'tanh', 'arcsinh', 'arccosh', 'arctanh',
@@ -212,6 +212,7 @@ def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False):
     check_fluent_regular('slice_like', {'axes': (0, -2), 'shape_like': mx.sym.zeros((3, 3))})
     check_fluent_regular('clip', {'a_min': 0.25, 'a_max': 0.75})
     check_fluent_regular('broadcast_axes', {'axis': (2,), 'size': (5,)})
+    check_fluent_regular('broadcast_like', {'rhs': mx.sym.ones((1, 5)), 'lhs_axes': (0,), 'rhs_axes': (1,)}, shape=(1,9))
     check_fluent_regular('pad', {'mode': 'constant', 'pad_width': (0,0,0,0,3,0,0,4)}, shape=(5, 17, 2, 3))
     check_fluent_regular('reshape_like', {'rhs': mx.sym.ones((30, 17))}, shape=(5, 17, 2, 3))
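The new broadcast_like fluent check corresponds to expanding size-1 lhs axes to the sizes of the named rhs axes. A small sketch of the same call through the NDArray API (shapes chosen to mirror the test):

    import mxnet as mx

    lhs = mx.nd.ones((1, 9))
    rhs = mx.nd.ones((1, 5))
    # Axis 0 of lhs (size 1) is broadcast to the size of axis 1 of rhs.
    out = mx.nd.broadcast_like(lhs, rhs, lhs_axes=(0,), rhs_axes=(1,))
    assert out.shape == (5, 9)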
 
diff --git a/tests/travis/is_core_changed.sh b/tests/travis/is_core_changed.sh
deleted file mode 100755
index 7b9eb612384..00000000000
--- a/tests/travis/is_core_changed.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# this is a util script to test whether the "core" of
-# mxnet has changed. Please modify the regex patterns here
-# to ensure the components are covered if you add new "core"
-# components to mxnet
-
-# temporarily disable this b/c the OS X tests are failing mysteriously
-exit 0
-
-# DEBUG
-echo "Files changed in this PR includes:"
-echo "**********************************"
-git diff --name-only HEAD^
-echo "**********************************"
-
-# we ignore examples, and docs
-core_patterns=(
-  '^dmlc-core'
-  '^matlab'
-  '^plugin'
-  '^python'
-  '^src'
-  '^tools'
-  '^R-package'
-  '^amalgamation'
-  '^include'
-  '^mshadow'
-  '^ps-lite'
-  '^scala-package'
-  '^tests'
-)
-
-for pat in ${core_patterns[@]}; do
-  if git diff --name-only HEAD^ | grep "$pat"
-  then
-    exit
-  fi
-done
-
-echo "I think we are good to skip this travis ci run now"
-exit 1 # means nothing has changed
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
deleted file mode 100755
index fd23f0e82b2..00000000000
--- a/tests/travis/run_test.sh
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-if ! tests/travis/is_core_changed.sh
-then
-  exit 0
-fi
-
-if [ ${TASK} == "lint" ]; then
-    make lint || exit -1
-    echo "Check documentations of c++ code..."
-    make doc 2>log.txt
-    (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag") > logclean.txt
-    echo "---------Error Log----------"
-    cat logclean.txt
-    echo "----------------------------"
-    (cat logclean.txt|grep warning) && exit -1
-    (cat logclean.txt|grep error) && exit -1
-    exit 0
-fi
-
-cp make/config.mk config.mk
-
-if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
-    echo "USE_BLAS=apple" >> config.mk
-    echo "USE_OPENMP=0" >> config.mk
-else
-    # use g++-4.8 for linux
-    if [[ ${CXX} == "g++" ]]; then
-        export CXX=g++-4.8
-    fi
-    echo "USE_BLAS=blas" >> config.mk
-fi
-echo "CXX=${CXX}" >>config.mk
-echo "USE_PROFILER=1" >> config.mk
-
-if [ ${TASK} == "build" ]; then
-    if [ ${TRAVIS_OS_NAME} == "linux" ]; then
-        echo "USE_CUDA=1" >> config.mk
-        ./dmlc-core/scripts/setup_nvcc.sh $NVCC_PREFIX
-    fi
-    make all
-    exit $?
-fi
-
-if [ ${TASK} == "cpp_test" ]; then
-    make -f dmlc-core/scripts/packages.mk gtest
-    echo "GTEST_PATH="${CACHE_PREFIX} >> config.mk
-    make test || exit -1
-    export MXNET_ENGINE_INFO=true
-    ./build/tests/cpp/mxnet_test
-    exit 0
-fi
-
-if [ ${TASK} == "r_test" ]; then
-    make all || exit -1
-    # use cached dir for storing data
-    rm -rf ${PWD}/data
-    mkdir -p ${CACHE_PREFIX}/data
-    ln -s ${CACHE_PREFIX}/data ${PWD}/data
-
-    set -e
-    export _R_CHECK_TIMINGS_=0
-
-    if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
-        wget https://cran.rstudio.com/bin/macosx/R-latest.pkg  -O /tmp/R-latest.pkg
-        sudo installer -pkg "/tmp/R-latest.pkg" -target /
-        Rscript -e "install.packages('devtools', repo = 'https://cran.rstudio.com')"
-    fi
-
-    cd R-package
-    Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cran.rstudio.com')); install_deps(dependencies = TRUE)"
-    cd ..
-
-    make rpkg
-#    R CMD check --no-examples --no-manual --no-vignettes --no-build-vignettes mxnet_*.tar.gz
-    R CMD INSTALL mxnet_*.tar.gz
-
-    Rscript tests/travis/r_vignettes.R
-
-    wget http://data.mxnet.io/mxnet/data/Inception.zip
-    unzip Inception.zip && rm -rf Inception.zip
-    wget http://data.mxnet.io/mxnet/data/mnist.zip
-    unzip mnist.zip && rm -rf mnist.zip
-
-    cat CallbackFunctionTutorial.R \
-    fiveMinutesNeuralNetwork.R \
-    mnistCompetition.R \
-    ndarrayAndSymbolTutorial.R > r_test.R
-
-    Rscript r_test.R || exit -1
-
-    exit 0
-fi
-
-if [ ${TASK} == "python_test" ]; then
-    make all || exit -1
-    # use cached dir for storing data
-    rm -rf ${PWD}/data
-    mkdir -p ${PWD}/data
-
-    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-        python -m nose -v tests/python/unittest || exit -1
-        python3 -m nose -v tests/python/unittest || exit -1
-        # make cython3
-        # cython tests
-        # export MXNET_ENFORCE_CYTHON=1
-        # python3 -m nose tests/python/unittest || exit -1
-        python3 -m nose -v tests/python/train || exit -1
-        python -m nose -v tests/python/doctest || exit -1
-        python3 -m nose -v tests/python/doctest || exit -1
-    else
-        nosetests -v tests/python/unittest || exit -1
-        nosetests3 -v tests/python/unittest || exit -1
-        nosetests3 -v tests/python/train || exit -1
-        nosetests -v tests/python/doctest || exit -1
-        nosetests3 -v tests/python/doctest || exit -1
-    fi
-    exit 0
-fi
-
-if [ ${TASK} == "julia" ]; then
-    make all || exit -1
-    # use cached dir for storing data
-    rm -rf ${PWD}/data
-    mkdir -p ${PWD}/data
-
-    export MXNET_HOME="${PWD}"
-    julia -e 'Pkg.clone("MXNet"); Pkg.checkout("MXNet"); Pkg.build("MXNet"); Pkg.test("MXNet")' || exit -1
-    exit 0
-fi
-
-if [ ${TASK} == "scala_test" ]; then
-    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-        LIB_GOMP_PATH=`find /usr/local/lib -name libgomp.dylib | grep -v i386 | head -n1`
-        ln -sf $LIB_GOMP_PATH /usr/local/lib/libgomp.dylib
-    fi
-    make all || exit -1
-    # use cached dir for storing data
-    rm -rf ${PWD}/data
-    mkdir -p ${PWD}/data
-
-    export JAVA_HOME=$(/usr/libexec/java_home)
-
-    make scalapkg || exit -1
-    make scalatest || exit -1
-
-    exit 0
-fi
-
-if [ ${TASK} == "perl_test" ]; then
-    make all || exit -1
-
-    # use cached dir for storing data
-    MXNET_HOME=${PWD}
-    rm -rf ${MXNET_HOME}/perl-package/AI-MXNet/data
-    mkdir -p ${CACHE_PREFIX}/data
-    ln -s ${CACHE_PREFIX}/data ${MXNET_HOME}/perl-package/AI-MXNet/data
-
-    export LD_LIBRARY_PATH=${MXNET_HOME}/lib
-    export PERL5LIB=${HOME}/perl5/lib/perl5
-
-    cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make || exit -1
-    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-        install_name_tool -change lib/libmxnet.so \
-            ${MXNET_HOME}/lib/libmxnet.so \
-            blib/arch/auto/AI/MXNetCAPI/MXNetCAPI.bundle
-    fi
-    make install || exit -1
-
-    cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/
-    perl Makefile.PL INSTALL_BASE=${HOME}/perl5
-    make || exit -1
-    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-        install_name_tool -change lib/libmxnet.so \
-            ${MXNET_HOME}/lib/libmxnet.so \
-            blib/arch/auto/AI/NNVMCAPI/NNVMCAPI.bundle
-    fi
-    make install || exit -1
-
-    cd ${MXNET_HOME}/perl-package/AI-MXNet/
-    perl Makefile.PL
-    make test || exit -1
-    exit 0
-fi
-
-if [ ${TASK} == "cpp_package_test" ]; then
-    MXNET_HOME=${PWD}
-    make travis -C ${MXNET_HOME}/cpp-package/example
-    exit 0
-fi
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
deleted file mode 100755
index eec6c23d715..00000000000
--- a/tests/travis/setup.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-if ! tests/travis/is_core_changed.sh
-then
-  exit 0
-fi
-
-if [ ${TRAVIS_OS_NAME} == "osx" ]; then
-    brew update
-    brew tap homebrew/science
-    brew install opencv
-    brew install python3
-    brew install fftw
-    brew install libpng
-    brew install ImageMagick
-    brew install swig
-    if [ ${TASK} == "python_test" ]; then
-        python -m pip install --user nose numpy cython scipy
-        python3 -m pip install --user nose numpy cython scipy
-    fi
-fi
-
-if [ ${TASK} == "lint" ]; then
-    pip install --user cpplint 'pylint==1.4.4' 'astroid==1.3.6'
-fi
-
-if [ ${TASK} == "julia" ]; then
-  mkdir -p ~/julia
-  curl -s -L --retry 7 "https://s3.amazonaws.com/julialang/bin/linux/x64/${JULIA_VER}/julia-${JULIA_VER}-latest-linux-x86_64.tar.gz" | tar -C ~/julia -x -z --strip-components=1 -f -
-  export PATH="${PATH}:${HOME}/julia/bin"
-  julia -e 'versioninfo()'
-fi
-
-if [ ${TASK} == "perl_test" ]; then
-    if [ ${TRAVIS_OS_NAME} == "linux" ]; then
-       cpanm -q -L "${HOME}/perl5" Function::Parameters Hash::Ordered PDL::CCS
-    else
-       sudo sh -c 'curl -L https://cpanmin.us | perl - App::cpanminus'
-       sudo cpanm -q -n PDL Mouse Function::Parameters Hash::Ordered PDL::CCS
-    fi
-fi
diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py
index f87e98e9212..e59521f2704 100644
--- a/tests/tutorials/test_sanity_tutorials.py
+++ b/tests/tutorials/test_sanity_tutorials.py
@@ -24,8 +24,15 @@
 # automated test suite.
 # Rules to be in the whitelist:
 # - not a python tutorial
-whitelist = ['c++/basics.md',
+whitelist = ['basic/index.md',
+             'c++/basics.md',
+             'c++/index.md',
+             'embedded/index.md',
              'embedded/wine_detector.md',
+             'gluon/index.md',
+             'nlp/index.md',
+             'onnx/index.md',
+             'python/index.md',
              'r/CallbackFunction.md',
              'r/charRnnModel.md',
              'r/classifyRealImageWithPretrainedModel.md',
@@ -39,7 +46,11 @@
              'scala/char_lstm.md',
              'scala/mnist.md',
              'scala/index.md',
-             'scala/mxnet_scala_on_intellij.md']
+             'scala/mxnet_scala_on_intellij.md',
+             'sparse/index.md',
+             'speech_recognition/index.md',
+             'unsupervised_learning/index.md',
+             'vision/index.md']
 whitelist_set = set(whitelist)
 
 def test_tutorial_downloadable():
diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py
index 22d00c181b6..a2442a4f6a0 100644
--- a/tests/tutorials/test_tutorials.py
+++ b/tests/tutorials/test_tutorials.py
@@ -124,6 +124,9 @@ def test_nlp_cnn():
 def test_onnx_super_resolution():
     assert _test_tutorial_nb('onnx/super_resolution')
 
+def test_onnx_export_mxnet_to_onnx():
+    assert _test_tutorial_nb('onnx/export_mxnet_to_onnx')
+
 def test_onnx_fine_tuning_gluon():
     assert _test_tutorial_nb('onnx/fine_tuning_gluon')
 
@@ -139,6 +142,9 @@ def test_python_linear_regression():
 def test_python_logistic_regression() :
     assert _test_tutorial_nb('gluon/logistic_regression_explained')
 
+def test_python_numpy_gotchas() :
+    assert _test_tutorial_nb('gluon/gotchas_numpy_in_mxnet')
+
 def test_python_mnist():
     assert _test_tutorial_nb('python/mnist')
 
@@ -180,3 +186,6 @@ def test_vision_large_scale_classification():
 
 def test_vision_cnn_visualization():
     assert _test_tutorial_nb('vision/cnn_visualization')
+
+def test_control_flow():
+    assert _test_tutorial_nb('control_flow/ControlFlowTutorial')
diff --git a/tools/coreml/pip_package/setup.py b/tools/coreml/pip_package/setup.py
index 18c601d3816..35614271bfd 100644
--- a/tools/coreml/pip_package/setup.py
+++ b/tools/coreml/pip_package/setup.py
@@ -40,7 +40,7 @@ def readme():
         return f.read()
 
 setup(name='mxnet-to-coreml',
-      version='0.1.0',
+      version='0.1.3',
       description='Tool to convert MXNet models into Apple CoreML model format.',
       long_description=readme(),
       classifiers=[


 
