Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/11 23:17:16 UTC

[GitHub] larroy closed pull request #11186: Devel arm

URL: https://github.com/apache/incubator-mxnet/pull/11186

This is a pull request merged from a forked repository. Since GitHub hides the
original diff once a foreign pull request is merged, it is reproduced below
for the sake of provenance:

diff --git a/.gitignore b/.gitignore
index d585672ab7d..416741a5e70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,3 +166,7 @@ python/.eggs
 *DartConfiguration.tcl
 tests/Makefile
 tests/mxnet_unit_tests
+
+# generated wrappers for ccache
+cc
+cxx
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e57c00b69e9..8a1765a0e67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -321,14 +321,15 @@ endif()
 
 # ---[ OpenCV
 if(USE_OPENCV)
-  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
+  find_package(OpenCV COMPONENTS core highgui imgproc imgcodecs)
   if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
+    message(STATUS "OpenCV imgcodecs missing")
     find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
   endif()
   include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
   list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS})
   message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}")
-  message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
+  message(STATUS "OpenCV ${OpenCV_VERSION} found (${OpenCV_CONFIG_PATH})")
   add_definitions(-DMXNET_USE_OPENCV=1)
 else(USE_OPENCV)
   message(STATUS "OpenCV Disabled")
@@ -340,7 +341,11 @@ if(USE_OPENMP)
   find_package(OpenMP REQUIRED)
   # This should build on Windows, but there's some problem and I don't have a Windows box, so
   # could a Windows user please fix?
-  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt AND SYSTEM_ARCHITECTURE STREQUAL "x86_64" AND NOT MSVC)
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp/CMakeLists.txt
+     AND SYSTEM_ARCHITECTURE STREQUAL "x86_64"
+     AND NOT MSVC
+     AND NOT CMAKE_CROSSCOMPILING)
+
     # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
     set(OPENMP_STANDALONE_BUILD TRUE)
     set(LIBOMP_ENABLE_SHARED TRUE)
@@ -648,7 +653,7 @@ if(USE_PLUGINS_WARPCTC)
 endif()
 
 
-if(USE_OPENCV)
+if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2)
   add_executable(im2rec "tools/im2rec.cc")
   if(MSVC)
     target_link_libraries(im2rec mxnet)
@@ -662,6 +667,9 @@ if(USE_OPENCV)
     ${nnvm_LINKER_LIBS}
     ${pslite_LINKER_LIBS}
     )
+else()
+    message(WARNING "OpenCV_VERSION_MAJOR: ${OpenCV_VERSION_MAJOR}; version 3 with imgcodecs \
+    is required for im2rec, which will not be available")
 endif()
 
 target_link_libraries(mxnet PUBLIC dmlc)
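
With the guard above, im2rec is only built when OpenCV 3.x (which provides
imgcodecs) is found. A quick way to see which branch a given machine will
take -- a sketch, assuming pkg-config knows about your OpenCV install (the
module name varies by distro):

```bash
pkg-config --modversion opencv 2>/dev/null  # a 2.x answer means im2rec gets skipped
# or let CMake report it, via the new status/warning messages:
cmake /path/to/mxnet 2>&1 | grep -i opencv
```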
diff --git a/Makefile b/Makefile
index 03212841fa3..ff4446ab80c 100644
--- a/Makefile
+++ b/Makefile
@@ -477,7 +477,7 @@ endif
 $(PS_PATH)/build/libps.a: PSLITE
 
 PSLITE:
-	$(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps
+	$(MAKE) CXX="$(CXX)" DEPS_PATH="$(DEPS_PATH)" -C $(PS_PATH) ps
 
 $(DMLC_CORE)/libdmlc.a: DMLCCORE
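
The quotes added to the PSLITE recipe above matter as soon as CXX expands to
more than one word (for instance a ccache-prefixed compiler). A shell-level
illustration of the difference -- not the actual recipe, just the
word-splitting behaviour:

```bash
CXX="ccache g++"
make CXX=$CXX  ps   # splits: sub-make sees CXX=ccache plus a bogus 'g++' target
make CXX="$CXX" ps  # sub-make receives the full command string 'ccache g++'
```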
 
diff --git a/ci/README.md b/ci/README.md
index 1c59a3af7c8..ca46434a30f 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -54,7 +54,7 @@ The artifacts are located in the build/ directory in the project root. In case
 
 ## Add a platform
 
-To add a platform, you should add the appropiate dockerfile in
+To add a platform, you should add the appropriate dockerfile in
 docker/Dockerfile.build.<platform> and add a shell function named
 build_<platform> to the file docker/runtime_functions.sh with build
 instructions for that platform.
@@ -63,3 +63,9 @@ instructions for that platform.
 Due to current limitations of the CMake build system creating artifacts in the
 source 3rdparty folder of the parent mxnet sources concurrent builds of
 different platforms is NOT SUPPORTED.
+
+## ccache
+For all builds, a directory from the host system is mapped into the container, where
+ccache stores cached object files (the default is /tmp/ci_ccache). This speeds up
+rebuilds significantly. You can set this directory explicitly via the CCACHE_DIR
+environment variable. All ccache instances are currently capped at 10 GB.
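
To make the knobs above concrete, a minimal sketch of driving the cache from
the host, assuming ci/build.py is run from the repository root and armv7 is
the platform you care about:

```bash
# Persist the cache somewhere durable instead of the default /tmp/ci_ccache
# (the path is illustrative; any writable host directory works):
export CCACHE_DIR=$HOME/.cache/mxnet_ci_ccache
./ci/build.py -p armv7    # first build populates the cache
./ci/build.py -p armv7    # rebuild hits the cache and is much faster

# Equivalently, pass the directory explicitly:
./ci/build.py -p armv7 --ccache-dir "$HOME/.cache/mxnet_ci_ccache"
```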
diff --git a/ci/build.py b/ci/build.py
index e52fa794bc9..f4c1a3e8d99 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -33,13 +33,15 @@
 import shutil
 import subprocess
 import sys
+import tempfile
 from copy import deepcopy
 from itertools import chain
 from subprocess import call, check_call
 from typing import *
 
+CCACHE_MAXSIZE = '10G'
 
-def get_platforms(path: Optional[str]="docker"):
+def get_platforms(path: Optional[str] = "docker"):
     """Get a list of architectures given our dockerfiles"""
     dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
     dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
@@ -72,11 +74,11 @@ def build_docker(platform: str, docker_binary: str, registry: str) -> None:
     tag = get_docker_tag(platform=platform, registry=registry)
     logging.info("Building container tagged '%s' with %s", tag, docker_binary)
     cmd = [docker_binary, "build",
-        "-f", get_dockerfile(platform),
-        "--build-arg", "USER_ID={}".format(os.getuid()),
-        "--cache-from", tag,
-        "-t", tag,
-        "docker"]
+           "-f", get_dockerfile(platform),
+           "--build-arg", "USER_ID={}".format(os.getuid()),
+           #"--cache-from", tag,
+           "-t", tag,
+           "docker"]
     logging.info("Running command: '%s'", ' '.join(cmd))
     check_call(cmd)
 
@@ -102,8 +104,10 @@ def _get_local_image_id(docker_binary, docker_tag):
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
+
     def is_mxnet_root(path: str) -> bool:
         return os.path.exists(os.path.join(path, ".mxnet_root"))
+
     while not is_mxnet_root(curpath):
         parent = os.path.abspath(os.path.join(curpath, os.pardir))
         if parent == curpath:
@@ -116,10 +120,20 @@ def buildir() -> str:
     return os.path.join(get_mxnet_root(), "build")
 
 
+def default_ccache_dir() -> str:
+    if 'CCACHE_DIR' in os.environ:
+        ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
+        os.makedirs(ccache_dir, exist_ok=True)
+        return ccache_dir
+    # Share ccache across containers
+    return os.path.join(tempfile.gettempdir(), "ci_ccache")
+
+
 def container_run(platform: str,
                   docker_binary: str,
                   docker_registry: str,
                   shared_memory_size: str,
+                  local_ccache_dir: str,
                   command: List[str],
                   dry_run: bool = False,
                   into_container: bool = False) -> str:
@@ -128,12 +142,17 @@ def container_run(platform: str,
     local_build_folder = buildir()
     # We need to create it first, otherwise it will be created by the docker daemon with root only permissions
     os.makedirs(local_build_folder, exist_ok=True)
+    os.makedirs(local_ccache_dir, exist_ok=True)
+    logging.info("Using ccache directory: %s", local_ccache_dir)
     runlist = [docker_binary, 'run', '--rm', '-t',
-        '--shm-size={}'.format(shared_memory_size),
-        '-v', "{}:/work/mxnet".format(mx_root), # mount mxnet root
-        '-v', "{}:/work/build".format(local_build_folder), # mount mxnet/build for storing build artifacts
-        '-u', '{}:{}'.format(os.getuid(), os.getgid()),
-        tag]
+               '--shm-size={}'.format(shared_memory_size),
+               '-v', "{}:/work/mxnet".format(mx_root),  # mount mxnet root
+               '-v', "{}:/work/build".format(local_build_folder),  # mount mxnet/build for storing build artifacts
+               '-v', "{}:/work/ccache".format(local_ccache_dir),
+               '-u', '{}:{}'.format(os.getuid(), os.getgid()),
+               '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
+               '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the container as /work/ccache is mounted
+               tag]
     runlist.extend(command)
     cmd = ' '.join(runlist)
     if not dry_run and not into_container:
@@ -160,19 +179,17 @@ def container_run(platform: str,
 def list_platforms() -> str:
     print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
 
-
 def load_docker_cache(tag, docker_registry) -> None:
     if docker_registry:
         try:
             import docker_cache
-            logging.info('Docker cache download is enabled')
+            logging.info('Docker cache download is enabled from registry %s', docker_registry)
             docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag)
         except Exception:
             logging.exception('Unable to retrieve Docker cache. Continue without...')
     else:
         logging.info('Distributed docker cache disabled')
 
-
 def main() -> int:
     # We need to be in the same directory than the script so the commands in the dockerfiles work as
     # expected. But the script can be invoked from a different path
@@ -187,7 +204,7 @@ def script_name() -> str:
     logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))
 
     parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
-    containers""",epilog="")
+    containers""", epilog="")
     parser.add_argument("-p", "--platform",
                         help="platform",
                         type=str)
@@ -221,63 +238,75 @@ def script_name() -> str:
                         help="go in a shell inside the container",
                         action='store_true')
 
-    parser.add_argument("--docker-registry",
-                        help="Dockerhub registry name to retrieve cache from",
+    parser.add_argument("-d", "--docker-registry",
+                        help="Dockerhub registry name to retrieve cache from. Default is 'mxnetci'",
                         default='mxnetci',
                         type=str)
 
+    parser.add_argument("-c", "--cache", action="store_true",
+                        help="Enable docker registry cache")
+
     parser.add_argument("command",
                         help="command to run in the container",
                         nargs='*', action='append', type=str)
 
+    parser.add_argument("--ccache-dir",
+                        default=default_ccache_dir(),
+                        help="Ccache directory",
+                        type=str)
+
     args = parser.parse_args()
-    docker_registry = args.docker_registry
+    def use_cache():
+        return args.cache or 'JOB_NAME' in os.environ  # JOB_NAME is set when running under Jenkins
+
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
     shared_memory_size = args.shared_memory_size
 
-    print("into container: {}".format(args.into_container))
     if args.list:
         list_platforms()
     elif args.platform:
         platform = args.platform
-        tag = get_docker_tag(platform=platform, registry=docker_registry)
-        load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=docker_registry)
+        tag = get_docker_tag(platform=platform, registry=args.docker_registry)
+        if use_cache():
+            load_docker_cache(tag=tag, docker_registry=args.docker_registry)
+        build_docker(platform, docker_binary, registry=args.docker_registry)
         if args.build_only:
             logging.warning("Container was just built. Exiting due to build-only.")
             return 0
 
         if command:
             container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=command, docker_registry=docker_registry)
+                          command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
         elif args.print_docker_run:
             print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                                command=[], dry_run=True, docker_registry=docker_registry))
+                                command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
         elif args.into_container:
             container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=[], dry_run=False, into_container=True, docker_registry=docker_registry)
+                          command=[], dry_run=False, into_container=True, docker_registry=args.docker_registry,
+                          local_ccache_dir=args.ccache_dir)
         else:
             cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
             logging.info("No command specified, trying default build: %s", ' '.join(cmd))
             container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=docker_registry)
+                          command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
 
     elif args.all:
         platforms = get_platforms()
         logging.info("Building for all architectures: {}".format(platforms))
         logging.info("Artifacts will be produced in the build/ directory.")
         for platform in platforms:
-            tag = get_docker_tag(platform=platform, registry=docker_registry)
-            load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary)
+            tag = get_docker_tag(platform=platform, registry=args.docker_registry)
+            if use_cache():
+                load_docker_cache(tag=tag, docker_registry=args.docker_registry)
+            build_docker(platform, docker_binary, args.docker_registry)
             if args.build_only:
                 continue
             build_platform = "build_{}".format(platform)
             cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
             shutil.rmtree(buildir(), ignore_errors=True)
             container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=docker_registry)
+                          command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
             plat_buildir = os.path.join(get_mxnet_root(), build_platform)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
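
For reference, the runlist assembled by container_run above boils down to a
docker invocation along these lines (the tag, paths, and shm size are
illustrative):

```bash
docker run --rm -t \
    --shm-size=500m \
    -v "$PWD":/work/mxnet \
    -v "$PWD/build":/work/build \
    -v /tmp/ci_ccache:/work/ccache \
    -u "$(id -u):$(id -g)" \
    -e CCACHE_MAXSIZE=10G \
    -e CCACHE_DIR=/work/ccache \
    mxnetci/build.armv7 \
    /work/mxnet/ci/docker/runtime_functions.sh build_armv7
```

Mounting /work/ccache from the host is what lets object files survive the
--rm'd container between runs.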
diff --git a/ci/docker/Dockerfile.build.android_arm64 b/ci/docker/Dockerfile.build.android_arm64
index d7687514d7a..3934858d655 100755
--- a/ci/docker/Dockerfile.build.android_arm64
+++ b/ci/docker/Dockerfile.build.android_arm64
@@ -18,13 +18,21 @@
 #
 # Dockerfile to build MXNet for Android ARM64/ARMv8
 
+#FROM ubuntu:16.04 as ccachebuilder
+
+#COPY install/ubuntu_core.sh /work/
+#RUN /work/ubuntu_core.sh
+#COPY install/ubuntu_ccache.sh /work/
+#RUN /work/ubuntu_ccache.sh
+
 FROM dockcross/base:latest
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
+# extract ccache binary into latest context
+#COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+
 # The cross-compiling emulator
 RUN apt-get update && apt-get install -y \
-  qemu-user \
-  qemu-user-static \
   unzip
 
 ENV CROSS_TRIPLE=aarch64-linux-android
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index c22e000cad1..70bb9fb0eae 100755
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -18,9 +18,19 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
+FROM ubuntu:16.04 as ccachebuilder
+
+COPY install/ubuntu_core.sh /work/
+RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
+
 FROM dockcross/base:latest
 MAINTAINER Pedro Larroy "pllarroy@amazon.com"
 
+# extract ccache binary into latest context
+COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+
 # The cross-compiling emulator
 RUN apt-get update && apt-get install -y \
   qemu-user \
diff --git a/ci/docker/Dockerfile.build.arm64 b/ci/docker/Dockerfile.build.arm64
index ec949600f73..d88b3805fb2 100755
--- a/ci/docker/Dockerfile.build.arm64
+++ b/ci/docker/Dockerfile.build.arm64
@@ -18,22 +18,35 @@
 #
 # Dockerfile to build MXNet for ARM64/ARMv8
 
+FROM ubuntu:16.04 as ccachebuilder
+
+COPY install/ubuntu_core.sh /work/
+RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
+
 # Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
 #FROM dockcross/linux-arm64
 FROM mxnetci/dockcross-linux-arm64:05082018
 
+# extract ccache binary into latest context
+COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+
 ENV ARCH aarch64
 ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index 20739dabe2e..b97d2782c78 100755
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -18,8 +18,21 @@
 #
 # Dockerfile to build MXNet for ARMv6
 
+FROM ubuntu:16.04 as ccachebuilder
+
+COPY install/ubuntu_core.sh /work/
+RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
+
 FROM dockcross/linux-armv6
 
+# extract ccache binary into latest context
+COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+
+RUN apt-get update
+RUN apt-get install -y unzip
+
 ENV ARCH armv6l
 ENV FC=/usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
@@ -27,11 +40,14 @@ ENV TARGET ARMV6
 
 WORKDIR /work/deps
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    make PREFIX=$CROSS_ROOT install
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
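
Several of these images share the same trick: ccache is compiled once in a
throwaway ubuntu:16.04 stage and only the finished binary is copied into the
dockcross-based final stage, leaving the cross-toolchain image otherwise
untouched. A quick post-build sanity check that the copy worked (the image
tag is hypothetical):

```bash
docker run --rm mxnetci/build.armv6 /usr/local/bin/ccache --version
```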
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index c2493063518..30c0ea5f459 100755
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -18,15 +18,30 @@
 #
 # Dockerfile to build MXNet for Android ARMv7
 
+#FROM ubuntu:16.04 as ccachebuilder
+
+#COPY install/ubuntu_core.sh /work/
+#RUN /work/ubuntu_core.sh
+#COPY install/ubuntu_ccache.sh /work/
+#RUN /work/ubuntu_ccache.sh
+
 FROM dockcross/linux-armv7
 
-ENV ARCH armv71
-ENV CC /usr/bin/arm-linux-gnueabihf-gcc
-ENV CXX /usr/bin/arm-linux-gnueabihf-g++
+ENV ARCH armv7l
+ENV HOSTCC gcc
+ENV TARGET ARMV7
+ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
 
-RUN apt-get update && \
-    apt-get install -y libopenblas-dev:armhf && \
-    rm -rf /var/lib/apt/lists/*
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 COPY runtime_functions.sh /work/
-WORKDIR /work/build
+WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.centos7_cpu b/ci/docker/Dockerfile.build.centos7_cpu
index 92314faf121..076ef5df911 100755
--- a/ci/docker/Dockerfile.build.centos7_cpu
+++ b/ci/docker/Dockerfile.build.centos7_cpu
@@ -24,6 +24,8 @@ WORKDIR /work/deps
 
 COPY install/centos7_core.sh /work/
 RUN /work/centos7_core.sh
+COPY install/centos7_ccache.sh /work/
+RUN /work/centos7_ccache.sh
 COPY install/centos7_python.sh /work/
 RUN /work/centos7_python.sh
 COPY install/ubuntu_mklml.sh /work/
diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu
index 2d28170f11b..8bf2442731a 100755
--- a/ci/docker/Dockerfile.build.centos7_gpu
+++ b/ci/docker/Dockerfile.build.centos7_gpu
@@ -24,6 +24,8 @@ WORKDIR /work/deps
 
 COPY install/centos7_core.sh /work/
 RUN /work/centos7_core.sh
+COPY install/centos7_ccache.sh /work/
+RUN /work/centos7_ccache.sh
 COPY install/centos7_python.sh /work/
 RUN /work/centos7_python.sh
 
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index c358edb1fb0..4867bb5421d 100755
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -22,22 +22,38 @@
 
 FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder
 
+FROM ubuntu:16.04 as ccachebuilder
+
+COPY install/ubuntu_core.sh /work/
+RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
+
 # Temporary fix due to https://github.com/apache/incubator-mxnet/issues/10837
 # FROM dockcross/linux-arm64
 FROM mxnetci/dockcross-linux-arm64:05082018
 
+# extract ccache binary into latest context
+COPY --from=ccachebuilder /usr/local/bin/ccache /usr/local/bin/ccache
+
 ENV ARCH aarch64
 ENV FC /usr/bin/${CROSS_TRIPLE}-gfortran
 ENV HOSTCC gcc
 ENV TARGET ARMV8
 
-WORKDIR /work
+WORKDIR /work/deps
+
+COPY install/ubuntu_arm.sh /work/
+RUN /work/ubuntu_arm.sh
+
+COPY install/arm_openblas.sh /work/
+RUN /work/arm_openblas.sh
+
+ENV OpenBLAS_HOME=${CROSS_ROOT}
+ENV OpenBLAS_DIR=${CROSS_ROOT}
 
-# Build OpenBLAS
-RUN git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git && \
-    cd OpenBLAS && \
-    make -j$(nproc) && \
-    PREFIX=${CROSS_ROOT} make install
 
 # Setup CUDA build env (including configuring and copying nvcc)
 COPY --from=cudabuilder /usr/local/cuda /usr/local/cuda
diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda
index 4d3c4664363..a87651a4f5a 100755
--- a/ci/docker/Dockerfile.build.ubuntu_build_cuda
+++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda
@@ -27,6 +27,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu
index 2dc7ef13f21..f82ee75c482 100755
--- a/ci/docker/Dockerfile.build.ubuntu_cpu
+++ b/ci/docker/Dockerfile.build.ubuntu_cpu
@@ -24,6 +24,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu
index 10971724aaa..5f4bfc5a87d 100755
--- a/ci/docker/Dockerfile.build.ubuntu_gpu
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu
@@ -24,6 +24,8 @@ WORKDIR /work/deps
 
 COPY install/ubuntu_core.sh /work/
 RUN /work/ubuntu_core.sh
+COPY install/ubuntu_ccache.sh /work/
+RUN /work/ubuntu_ccache.sh
 COPY install/ubuntu_python.sh /work/
 RUN /work/ubuntu_python.sh
 COPY install/ubuntu_scala.sh /work/
diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/install/arm_openblas.sh
new file mode 100755
index 00000000000..fa2e5cae9cb
--- /dev/null
+++ b/ci/docker/install/arm_openblas.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git
+
+cd OpenBLAS
+make -j$(nproc)
+PREFIX=${CROSS_ROOT} make install
+
+cd ..
+
+rm -rf OpenBLAS
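
The ARM Dockerfiles pair this script with ENV OpenBLAS_HOME/OpenBLAS_DIR set
to ${CROSS_ROOT}, which is how the CMake build later finds the cross-built
library. A sketch of verifying the install from inside such a container
(exact paths depend on the dockcross toolchain):

```bash
echo "$CROSS_ROOT"                 # sysroot the script installed into
ls "$CROSS_ROOT"/lib/libopenblas*  # the cross-built library artifacts
ls "$CROSS_ROOT"/include/cblas.h   # headers installed alongside
```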
diff --git a/ci/docker/install/centos7_ccache.sh b/ci/docker/install/centos7_ccache.sh
new file mode 100755
index 00000000000..846a407001b
--- /dev/null
+++ b/ci/docker/install/centos7_ccache.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script to build ccache for centos7 based images
+
+set -ex
+
+pushd .
+
+yum -y install autoconf
+yum -y install asciidoc
+
+mkdir -p /work/deps
+cd /work/deps
+
+git clone --recursive -b v3.4.2 https://github.com/ccache/ccache.git
+
+cd ccache
+
+./autogen.sh
+./configure
+make -j$(nproc)
+make install
+
+cd /work/deps
+rm -rf /work/deps/ccache
+
+popd
+
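
This mirrors ubuntu_ccache.sh below: both pin ccache v3.4.2 and build it from
source. At run time the cache is steered entirely by the environment that
ci/build.py injects; a quick check from inside a running container (expected
values match the defaults wired up in ci/build.py):

```bash
echo "$CCACHE_DIR"       # /work/ccache -- the host-mounted cache directory
echo "$CCACHE_MAXSIZE"   # 10G
ccache --version         # should report 3.4.2
ccache -s                # hit/miss statistics for the mounted cache
```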
diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/ubuntu_arm.sh
new file mode 100755
index 00000000000..becb012bd18
--- /dev/null
+++ b/ci/docker/install/ubuntu_arm.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+apt update
+apt install -y \
+    unzip
diff --git a/ci/docker/install/ubuntu_ccache.sh b/ci/docker/install/ubuntu_ccache.sh
new file mode 100755
index 00000000000..f5102e3a273
--- /dev/null
+++ b/ci/docker/install/ubuntu_ccache.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Script to build ccache for ubuntu based images
+
+set -ex
+
+pushd .
+
+apt update
+apt install -y --no-install-recommends \
+    autoconf \
+    asciidoc \
+    xsltproc
+
+mkdir -p /work/deps
+cd /work/deps
+
+git clone --recursive -b v3.4.2 https://github.com/ccache/ccache.git
+
+cd ccache
+
+./autogen.sh
+./configure
+
+# Don't build documentation #11214
+#perl -pi -e 's!\s+\Q$(installcmd) -d $(DESTDIR)$(mandir)/man1\E!!g' Makefile
+#perl -pi -e 's!\s+\Q-$(installcmd) -m 644 ccache.1 $(DESTDIR)$(mandir)/man1/\E!!g' Makefile
+make -j$(nproc)
+make install
+
+cd /work/deps
+rm -rf /work/deps/ccache
+
+popd
+
diff --git a/ci/docker/install/ubuntu_r.sh b/ci/docker/install/ubuntu_r.sh
index e04e94d6486..4bf1a897e08 100755
--- a/ci/docker/install/ubuntu_r.sh
+++ b/ci/docker/install/ubuntu_r.sh
@@ -27,4 +27,4 @@ gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9
 gpg -a --export E084DAB9 | apt-key add -
 
 apt-get update
-apt-get install -y r-base r-base-dev libxml2-dev libssl-dev libxt-dev
\ No newline at end of file
+apt-get install -y --allow-unauthenticated r-base r-base-dev libxml2-dev libssl-dev libxt-dev
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index fa9de6112ff..83a686f5a29 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -31,31 +31,80 @@ clean_repo() {
     git submodule update --init --recursive
 }
 
+build_ccache_wrappers() {
+    set -ex
 
-# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
-# function here with the same suffix:
+    rm -f cc
+    rm -f cxx
+
+    touch cc
+    touch cxx
+
+    if [ -z ${CC+x} ]; then
+        echo "No \$CC set, defaulting to gcc";
+        export CC=gcc
+    fi
+
+    if [ -z ${CXX+x} ]; then
+       echo "No \$CXX set, defaulting to g++";
+       export CXX=g++
+    fi
+
+    # This function is necessary for CUDA-enabled make-based builds, since nvcc needs a single executable for -ccbin
+
+    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CC} \"\$@\"\n" >> cc
+    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CXX} \"\$@\"\n" >> cxx
+
+    chmod +x cc
+    chmod +x cxx
+
+    export CC=`pwd`/cc
+    export CXX=`pwd`/cxx
+}
+
+build_wheel() {
 
-build_jetson() {
     set -ex
     pushd .
-    mv make/crosscompile.jetson.mk make/config.mk
-    make -j$(nproc)
 
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
+    PYTHON_DIR=${1:-/work/mxnet/python}
+    BUILD_DIR=${2:-/work/build}
+
+    # build
+
+    export MXNET_LIBRARY_PATH=${BUILD_DIR}/libmxnet.so
+
+    cd ${PYTHON_DIR}
     python setup.py bdist_wheel --universal
 
+    # repackage
+
     # Fix pathing issues in the wheel.  We need to move libmxnet.so from the data folder to the
     # mxnet folder, then repackage the wheel.
     WHEEL=`readlink -f dist/*.whl`
     TMPDIR=`mktemp -d`
-    unzip -d $TMPDIR $WHEEL
-    rm $WHEEL
-    cd $TMPDIR
+    unzip -d ${TMPDIR} ${WHEEL}
+    rm ${WHEEL}
+    cd ${TMPDIR}
     mv *.data/data/mxnet/libmxnet.so mxnet
-    zip -r $WHEEL .
-    cp $WHEEL /work/build
-    rm -rf $TMPDIR
+    zip -r ${WHEEL} .
+    cp ${WHEEL} ${BUILD_DIR}
+    rm -rf ${TMPDIR}
+
+    popd
+}
+
+# Build commands: Every platform in docker/Dockerfile.build.<platform> should have a corresponding
+# function here with the same suffix:
+
+build_jetson() {
+    set -ex
+    pushd .
+
+    cp make/crosscompile.jetson.mk ./config.mk
+    make -j$(nproc)
+
+    build_wheel /work/mxnet/python /work/mxnet/lib
     popd
 }
 
@@ -72,42 +121,56 @@ build_armv6() {
     # We do not need OpenMP, since most armv6 systems have only 1 core
 
     cmake \
-        -DCMAKE_TOOLCHAIN_FILE=$CROSS_ROOT/Toolchain.cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DUSE_CUDA=OFF \
         -DUSE_OPENCV=OFF \
         -DUSE_OPENMP=OFF \
         -DUSE_SIGNAL_HANDLER=ON \
-        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_TYPE=Release\
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_LAPACK=OFF \
         -DBUILD_CPP_EXAMPLES=OFF \
         -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
+
     ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+    build_wheel
     popd
 }
 
 build_armv7() {
     set -ex
     pushd .
+
+    # uncomment for make based build
+    # cp make/crosscompile.armv7.mk ./config.mk
+    # make -j$(nproc)
+
+    # build_wheel /work/mxnet/python /work/mxnet/lib
+
     cd /work/build
-    cmake\
-        -DUSE_CUDA=OFF\
-        -DUSE_OPENCV=OFF\
-        -DUSE_OPENMP=OFF\
-        -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
-        -DUSE_MKL_IF_AVAILABLE=OFF\
+
+    # Lapack functionality will be included and statically linked to openblas.
+    # But USE_LAPACK needs to be set to OFF, otherwise the main CMakeLists.txt
+    # file tries to add -llapack. Lapack functionality though, requires -lgfortran
+    # to be linked additionally.
+
+    cmake \
+        -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
+        -DCMAKE_CROSSCOMPILING=ON \
+        -DUSE_CUDA=OFF \
+        -DUSE_OPENCV=OFF \
+        -DUSE_OPENMP=ON \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_LAPACK=OFF \
+        -DBUILD_CPP_EXAMPLES=OFF \
+        -Dmxnet_LINKER_LIBS=-lgfortran \
         -G Ninja /work/mxnet
+
     ninja
-    export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
-    cd /work/mxnet/python
-    python setup.py bdist_wheel --universal
-    cp dist/*.whl /work/build
+    build_wheel
     popd
 }
 
@@ -130,13 +193,14 @@ build_amzn_linux_cpu() {
 build_arm64() {
     cmake\
         -DUSE_CUDA=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
         -DUSE_SIGNAL_HANDLER=ON\
-        -DCMAKE_BUILD_TYPE=RelWithDebInfo\
+        -DCMAKE_BUILD_TYPE=Release\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
@@ -149,6 +213,7 @@ build_android_arm64() {
     cmake\
         -DUSE_CUDA=OFF\
         -DUSE_SSE=OFF\
+        -DSUPPORT_F16C=OFF\
         -DUSE_LAPACK=OFF\
         -DUSE_OPENCV=OFF\
         -DUSE_OPENMP=OFF\
@@ -156,7 +221,7 @@ build_android_arm64() {
         -DCMAKE_BUILD_TYPE=RelWithDebInfo\
         -DUSE_MKL_IF_AVAILABLE=OFF\
         -G Ninja /work/mxnet
-    ninja
+    ninja -v -j1
     export MXNET_LIBRARY_PATH=`pwd`/libmxnet.so
     cd /work/mxnet/python
     python setup.py bdist_wheel --universal
@@ -270,6 +335,10 @@ build_ubuntu_cpu_mkldnn() {
         -j$(nproc)
 }
 
+build_ubuntu_gpu() {
+    build_ubuntu_gpu_cuda91_cudnn7
+}
+
 build_ubuntu_gpu_mkldnn() {
     set -ex
     make  \
@@ -381,7 +450,7 @@ unittest_ubuntu_python3_cpu() {
 
 unittest_ubuntu_python3_cpu_mkldnn() {
     set -ex
-    export PYTHONPATH=./python/ 
+    export PYTHONPATH=./python/
     # MXNET_MKLDNN_DEBUG is buggy and produces false positives
     # https://github.com/apache/incubator-mxnet/issues/10026
     #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
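
Tying the pieces together: build_ccache_wrappers (near the top of this file)
writes ./cc and ./cxx shims and re-points CC/CXX at them, which is what lets
make-based CUDA builds hand nvcc a single executable via -ccbin. A
hypothetical build function using it -- a sketch, not one of the functions in
this PR:

```bash
build_example_platform() {
    set -ex
    build_ccache_wrappers    # afterwards CC=$PWD/cc, CXX=$PWD/cxx
    make -j"$(nproc)" CC="${CC}" CXX="${CXX}"
    # each shim simply runs: /usr/local/bin/ccache <real compiler> "$@"
}
```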
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index 8fe74ee1904..d2d44b07848 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -1,10 +1,47 @@
-//$('.burger-link').on('click', function(e) { e.stopPropagation() });
-//$('.burger-link').on('touchstart', function(e) { e.stopPropagation() });
+var versionSelect   = defaultVersion = 'v1.2.0';
+var deviceSelect    = 'Linux';
+var languageSelect  = 'Python';
+var processorSelect = 'CPU';
+var environSelect   = 'Pip';
 
 $(document).ready(function () {
     function label(lbl) {
         return lbl.replace(/[ .]/g, '-').toLowerCase();
     }
+
+    function setSelects(){
+        let urlParams = new URLSearchParams(window.location.search);
+        if (urlParams.get('version'))
+            versionSelect = urlParams.get('version');
+        $('li a:contains(' + versionSelect + ')').parent().siblings().removeClass('active');
+        $('li a:contains(' + versionSelect + ')').parent().addClass('active');
+        $('.current-version').html( versionSelect + ' <span class="caret"></span></button>' );
+        if (urlParams.get('device'))
+            deviceSelect = urlParams.get('device');
+        $('button:contains(' + deviceSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + deviceSelect + ')').addClass('active');
+        if (urlParams.get('language'))
+            languageSelect = urlParams.get('language');
+        $('button:contains(' + languageSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + languageSelect + ')').addClass('active');
+        if (urlParams.get('processor'))
+            processorSelect = urlParams.get('processor');
+        $('button:contains(' + processorSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + processorSelect + ')').addClass('active');
+        if (urlParams.get('environ'))
+            environSelect = urlParams.get('environ');
+        $('button:contains(' + environSelect + ')').siblings().removeClass('active');
+        $('button:contains(' + environSelect + ')').addClass('active');
+        showContent();
+        if (window.location.href.includes("/install/index.html")) {
+            if (versionSelect.includes(defaultVersion)) {
+                history.pushState(null, null, '/install/index.html?device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+            } else {
+                history.pushState(null, null, '/install/index.html?version=' + versionSelect + '&device=' + deviceSelect + '&language=' + languageSelect + '&processor=' + processorSelect);
+            }
+        } 
+    }
+
     function showContent() {
         $('.opt-group .opt').each(function(){
             $('.'+label($(this).text())).hide();
@@ -16,11 +53,35 @@ $(document).ready(function () {
         });
     }
     showContent();
+    setSelects();
     function setContent() {
         var el = $(this);
+        let urlParams = new URLSearchParams(window.location.search);
         el.siblings().removeClass('active');
         el.addClass('active');
+        if ($(this).hasClass("versions")) {
+            $('.current-version').html( $(this).text() + ' <span class="caret"></span></button>' );
+            if (!$(this).text().includes(defaultVersion)) {
+                if (!window.location.search.includes("version")) {
+                    history.pushState(null, null, '/install/index.html' + window.location.search.concat( '&version=' + $(this).text() ));
+                } else {
+                    history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('version'), $(this).text() ));
+                }
+            } else if (window.location.search.includes("version")) {
+                  history.pushState(null, null, '/install/index.html' + window.location.search.replace( 'version', 'prev' ));
+              }
+        }
+        else if ($(this).hasClass("Devices")) {
+            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('device'), $(this).text() ));
+        }
+        else if ($(this).hasClass("languages")) {
+            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('language'), $(this).text() ));
+        }
+        else if ($(this).hasClass("processors")) {
+            history.pushState(null, null, '/install/index.html' + window.location.search.replace( urlParams.get('processor'), $(this).text() ));
+        }
         showContent();
+        //window.location.search = window.location.search.replace( urlParams.get('version'), $(this).text() );
     }
     $('.opt-group').on('click', '.opt', setContent);
 });
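
The upshot of setSelects/setContent above is that the installer page's
selection now round-trips through the query string, so configurations become
linkable. Illustrative URLs (the default version is deliberately omitted, per
the versionSelect.includes(defaultVersion) branch):

```
/install/index.html?device=Linux&language=Python&processor=CPU
/install/index.html?version=v1.1.0&device=Linux&language=Python&processor=CPU
```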
diff --git a/docs/install/index.md b/docs/install/index.md
index b6a9a4e9ed2..4b966b62067 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -2,43 +2,57 @@
 
 Indicate your preferred configuration. Then, follow the customized commands to install *MXNet*.
 
+  <div class="dropdown">
+    <button class="btn current-version btn-primary dropdown-toggle" type="button" data-toggle="dropdown">v1.2.0
+    <span class="caret"></span></button>
+    <ul class="dropdown-menu opt-group">
+      <li class="opt active versions"><a href="#">v1.2.0</a></li>
+      <li class="opt versions"><a href="#">v1.1.0</a></li>
+      <li class="opt versions"><a href="#">v1.0.0</a></li>
+      <li class="opt versions"><a href="#">v0.12.1</a></li>
+      <li class="opt versions"><a href="#">v0.11.0</a></li>
+      <li class="opt versions"><a href="#">master</a></li>
+    </ul>
+  </div>
+
 <script type="text/javascript" src='../_static/js/options.js'></script>
 
 <!-- START - OS Menu -->
 
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Linux</button>
-  <button type="button" class="btn btn-default opt">MacOS</button>
-  <button type="button" class="btn btn-default opt">Windows</button>
-  <button type="button" class="btn btn-default opt">Cloud</button>
-  <button type="button" class="btn btn-default opt">Devices</button>
+  <button type="button" class="btn btn-default opt active Devices">Linux</button>
+  <button type="button" class="btn btn-default opt Devices">MacOS</button>
+  <button type="button" class="btn btn-default opt Devices">Windows</button>
+  <button type="button" class="btn btn-default opt Devices">Cloud</button>
+  <button type="button" class="btn btn-default opt Devices">Devices</button>
 </div>
 
 <!-- START - Language Menu -->
 
 <div class="linux macos windows">
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Python</button>
-  <button type="button" class="btn btn-default opt">Scala</button>
-  <button type="button" class="btn btn-default opt">R</button>
-  <button type="button" class="btn btn-default opt">Julia</button>
-  <button type="button" class="btn btn-default opt">Perl</button>
+  <button type="button" class="btn btn-default opt active languages">Python</button>
+  <button type="button" class="btn btn-default opt languages">Scala</button>
+  <button type="button" class="btn btn-default opt languages">R</button>
+  <button type="button" class="btn btn-default opt languages">Julia</button>
+  <button type="button" class="btn btn-default opt languages">Perl</button>
+  <button type="button" class="btn btn-default opt languages">Cpp</button>
 </div>
 </div>
 
 <!-- No CPU GPU for other Devices -->
 <div class="linux macos windows cloud">
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">CPU</button>
-  <button type="button" class="btn btn-default opt">GPU</button>
+  <button type="button" class="btn btn-default processors opt active">CPU</button>
+  <button type="button" class="btn btn-default processors opt">GPU</button>
 </div>
 </div>
 
 <!-- other devices -->
 <div class="devices">
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Raspberry Pi</button>
-  <button type="button" class="btn btn-default opt">NVIDIA Jetson TX2</button>
+  <button type="button" class="btn btn-default iots opt active">Raspberry Pi</button>
+  <button type="button" class="btn btn-default iots opt">NVIDIA Jetson TX2</button>
 </div>
 </div>
 
@@ -48,10 +62,10 @@ Indicate your preferred configuration. Then, follow the customized commands to i
 <div class="python">
 <div class="cpu gpu">
 <div class="btn-group opt-group" role="group">
-  <button type="button" class="btn btn-default opt active">Pip</button>
-  <button type="button" class="btn btn-default opt">Virtualenv</button>
-  <button type="button" class="btn btn-default opt">Docker</button>
-  <button type="button" class="btn btn-default opt">Build from Source</button>
+  <button type="button" class="btn btn-default environs opt active">Pip</button>
+  <button type="button" class="btn btn-default environs opt">Virtualenv</button>
+  <button type="button" class="btn btn-default environs opt">Docker</button>
+  <button type="button" class="btn btn-default environs opt">Build from Source</button>
 </div>
 </div>
 </div>
@@ -81,6 +95,8 @@ $ sudo apt-get install -y wget python gcc
 $ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
 ```
 
+<div class="v1-2-0">
+
 **Step 2** Install MXNet with OpenBLAS acceleration.
 
 ```bash
@@ -100,6 +116,139 @@ pip install graphviz
 $ pip install mxnet-mkl
 ```
 
+</div> <!-- End of v1-2-0 -->
+
+<div class="v1-1-0">
+
+**Step 2** Install MXNet with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==1.1.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-mkl==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+<div class="v1-0-0">
+
+**Step 2** Install MXNet with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==1.0.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-mkl==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+
+**Step 2** Install MXNet with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==0.12.1
+```
+
+For MXNet 0.12.0 -
+
+```bash
+$ pip install mxnet==0.12.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-mkl==0.12.1
+```
+
+For MXNet 0.12.0 -
+
+```bash
+$ pip install mxnet-mkl==0.12.0
+```
+
+</div> <!-- End of v0-12-1-->
+
+<div class="v0-11-0">
+
+
+**Step 2** Install MXNet with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==0.11.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-mkl==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+
+**Step 2** Install MXNet with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet --pre
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-mkl --pre
+```
+
+</div> <!-- End of master-->
+
 </div> <!-- End of pip -->
 
 <div class="virtualenv">
@@ -140,12 +289,74 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 $ pip install --upgrade pip
 ```
 
+<div class="v1-2-0">
+
 Install *MXNet* with OpenBLAS acceleration.
 
 ```bash
 $ pip install mxnet
 ```
 
+</div> <!-- End of v1-2-0-->
+
+<div class="v1-1-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+<div class="v1-0-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==0.12.1
+```
+
+For *MXNet* 0.12.0 -
+
+```bash
+$ pip install mxnet==0.12.0
+```
+
+</div> <!-- End of v0-12-1-->
+
+<div class="v0-11-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+$ pip install mxnet --pre
+```
+
+</div> <!-- End of master-->
+
+
 **Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -193,7 +404,6 @@ mxnet/python        latest              00d026968b3c        3 weeks ago
 
 </div> <!-- END of docker -->
 
-
 <div class="build-from-source">
 <br/>
 
@@ -301,6 +511,8 @@ $ sudo apt-get install -y wget python
 $ wget https://bootstrap.pypa.io/get-pip.py && sudo python get-pip.py
 ```
 
+<div class="v1-2-0">
+
 **Step 2**  Install *MXNet* with GPU support using CUDA 9.0
 
 ```bash
@@ -320,6 +532,117 @@ pip install graphviz
 $ pip install mxnet-cu90mkl
 ```
 
+</div> <!-- End of v1-2-0-->
+
+
+<div class="v1-1-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+
+```bash
+$ pip install mxnet-cu90==1.1.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-cu90mkl==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+
+<div class="v1-0-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+
+```bash
+$ pip install mxnet-cu90==1.0.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-cu90mkl==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+
+```bash
+$ pip install mxnet-cu90==0.12.1
+```
+
+For *MXNet* 0.12.0 -
+
+```bash
+$ pip install mxnet-cu90==0.12.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-cu90mkl==0.12.1
+```
+
+For *MXNet* 0.12.0 -
+
+```bash
+$ pip install mxnet-cu90mkl==0.12.0
+```
+
+</div> <!-- End of v0-12-1-->
+
+
+<div class="v0-11-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0
+
+```bash
+$ pip install mxnet-cu90==0.11.0
+```
+
+**Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
+```bash
+sudo apt-get install graphviz
+pip install graphviz
+```
+
+**Step 4**  Validate the installation by running simple MXNet code described [here](#validate-mxnet-installation).
+
+**Experimental Choice** If you would like to install MXNet with Intel MKL, try the experimental pip package with MKL:
+```bash
+$ pip install mxnet-cu90mkl==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+
 </div> <!-- END of pip -->
 
 <div class="virtualenv">
@@ -361,12 +684,77 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 (mxnet)$ pip install --upgrade pip
 ```
 
+
+<div class="v1-2-0">
+
 Install *MXNet* with GPU support using CUDA 9.0.
 
 ```bash
 (mxnet)$ pip install mxnet-cu90
 ```
 
+</div> <!-- End of v1-2-0-->
+
+
+<div class="v1-1-0">
+
+Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+
+<div class="v1-0-0">
+
+Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90==0.12.1
+```
+
+For *MXNet* 0.12.0 with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90==0.12.0
+``` 
+
+</div> <!-- End of v0-12-1-->
+
+
+<div class="v0-11-0">
+
+Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+(mxnet)$ pip install mxnet-cu90 --pre
+```
+
+</div> <!-- End of master-->
+
 **Step 4**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 sudo apt-get install graphviz
@@ -633,6 +1021,12 @@ Follow the installation instructions [in this guide](./ubuntu_setup.md) to set u
 
 </div> <!-- End of cpu gpu -->
 </div> <!-- End of scala julia perl -->
+<div class="cpp">
+<div class="cpu gpu">
+<p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
+<br/>
+</div> <!-- End of cpu gpu -->
+</div> <!-- END - C++-->
 </div> <!-- END - Linux -->
 
 
@@ -668,9 +1062,74 @@ $ pip install --upgrade pip
 $ pip install --upgrade setuptools
 ```
 
+<div class="v1-2-0">
+
+Then use pip to install MXNet:
+
 ```bash
 $ pip install mxnet
 ```
+</div> <!-- End of v1-2-0 -->
+
+
+<div class="v1-1-0">
+
+Then use pip to install MXNet:
+
+```bash
+$ pip install mxnet==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+
+<div class="v1-0-0">
+
+Then use pip to install MXNet:
+
+```bash
+$ pip install mxnet==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+<div class="v0-12-1">
+
+Then use pip to install MXNet:
+
+```bash
+$ pip install mxnet==0.12.1
+```
+
+For MXNet 0.12.0 -
+
+```bash
+$ pip install mxnet==0.12.0
+```
+
+
+</div> <!-- End of v0-12-1-->
+
+
+<div class="v0-11-0">
+
+Then use pip to install MXNet:
+
+```bash
+$ pip install mxnet==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+Then use pip to install MXNet:
+
+```bash
+$ pip install mxnet --pre
+```
+
+</div> <!-- End of master-->
 
 **Step 3**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
@@ -732,12 +1191,75 @@ Installing *MXNet* with pip requires a latest version of `pip`. Install the late
 (mxnet)$ pip install --upgrade setuptools
 ```
 
+<div class="v1-2-0">
+
 Install *MXNet* with OpenBLAS acceleration.
 
 ```bash
 (mxnet)$ pip install mxnet
 ```
 
+</div> <!-- End of v1-2-0-->
+
+<div class="v1-1-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+(mxnet)$ pip install mxnet==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+<div class="v1-0-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+(mxnet)$ pip install mxnet==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+(mxnet)$ pip install mxnet==0.12.1
+```
+
+For *MXNet* 0.12.0 -
+
+```bash
+(mxnet)$ pip install mxnet==0.12.0
+```
+
+
+</div> <!-- End of v0-12-1-->
+
+<div class="v0-11-0">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+(mxnet)$ pip install mxnet==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+Install *MXNet* with OpenBLAS acceleration.
+
+```bash
+(mxnet)$ pip install mxnet --pre
+```
+
+</div> <!-- End of master-->
+
+
 **Step 5**  Install [Graphviz](http://www.graphviz.org/). (Optional, needed for graph visualization using `mxnet.viz` package).
 ```bash
 $ brew install graphviz
@@ -932,6 +1454,10 @@ Follow the installation instructions [in this guide](./osx_setup.md) to set up M
 
 </div> <!-- End of cpu gpu -->
 </div> <!-- End of scala julia perl -->
+<div class="cpp">
+<p>To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>.</p>
+<br/>
+</div>
 </div> <!-- END - Mac OS -->
 
 
@@ -953,12 +1479,76 @@ Follow the installation instructions [in this guide](./osx_setup.md) to set up M
 
 [Anaconda](https://www.anaconda.com/download/) is recommended.
 
+<div class="v1-2-0">
+
 **Step 2**  Install *MXNet*.
 
 ```bash
 $ pip install mxnet
 ```
 
+</div> <!-- End of v1-2-0-->
+
+<div class="v1-1-0">
+
+**Step 2**  Install *MXNet*.
+
+```bash
+$ pip install mxnet==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+<div class="v1-0-0">
+
+**Step 2**  Install *MXNet*.
+
+```bash
+$ pip install mxnet==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+
+<div class="v0-12-1">
+
+**Step 2**  Install *MXNet*.
+
+```bash
+$ pip install mxnet==0.12.1
+```
+
+For *MXNet* 0.12.0 -
+
+```bash
+$ pip install mxnet==0.12.0
+```
+
+
+</div> <!-- End of v0-12-1-->
+
+<div class="v0-11-0">
+
+**Step 2**  Install *MXNet*.
+
+```bash
+$ pip install mxnet==0.11.0
+```
+
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+**Step 2**  Install *MXNet*.
+
+```bash
+$ pip install mxnet --pre
+```
+
+</div> <!-- End of master-->
+
+
 </div> <!-- End of pip -->
 
 
@@ -979,12 +1569,73 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 
 [Anaconda](https://www.anaconda.com/download/) is recommended.
 
+
+<div class="v1-2-0">
+
 **Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
 
 ```bash
 $ pip install mxnet-cu90
 ```
 
+</div> <!-- End of v1-2-0-->
+
+<div class="v1-1-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90==1.1.0
+```
+
+</div> <!-- End of v1-1-0-->
+
+<div class="v1-0-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90==1.0.0
+```
+
+</div> <!-- End of v1-0-0-->
+
+<div class="v0-12-1">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90==0.12.1
+```
+
+Install *MXNet* 0.12.0 with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90==0.12.0
+```
+
+</div> <!-- End of v0-12-1-->
+
+<div class="v0-11-0">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90==0.11.0
+```
+
+</div> <!-- End of v0-11-0-->
+
+<div class="master">
+
+**Step 2**  Install *MXNet* with GPU support using CUDA 9.0.
+
+```bash
+$ pip install mxnet-cu90 --pre
+```
+
+</div> <!-- End of master-->
+
 Refer to [#8671](https://github.com/apache/incubator-mxnet/issues/8671) for status on CUDA 9.1 support.
 
 </div>
@@ -1122,6 +1773,12 @@ Follow the installation instructions [in this guide](./windows_setup.md) to set
 
 </div> <!-- End of cpu gpu -->
 </div> <!-- End of scala julia perl -->
+<div class="cpp">
+<div class="cpu gpu">
+<p> To build the C++ package, please refer to <a href="build_from_source.html#build-the-c-package">this guide</a>. </p>
+<br/>
+</div> <!-- End of cpu gpu -->
+</div> <!-- End of C++ -->
 </div> <!-- End of Windows -->
 
 
@@ -1684,7 +2341,7 @@ b
 
 
 <div class="linux">
-  <div class="scala julia perl">
+  <div class="scala julia perl cpp">
     <div class="cpu gpu">
 
 Will be available soon.
@@ -1694,7 +2351,7 @@ Will be available soon.
 </div>
 
 <div class="macos">
-  <div class="scala julia perl">
+  <div class="scala julia perl cpp">
     <div class="cpu gpu">
 
 Will be available soon.
@@ -1716,7 +2373,7 @@ Will be available soon.
 </div>
 </div>
 
-<div class="scala julia perl">
+<div class="scala julia perl cpp">
 <div class="cpu gpu">
 
 Will be available soon.
diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py
index f5427feae2f..3f37ad3ac59 100755
--- a/example/image-classification/common/fit.py
+++ b/example/image-classification/common/fit.py
@@ -23,13 +23,13 @@
 import math
 import mxnet as mx
 
+def get_epoch_size(args, kv):
+    return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size)
 
 def _get_lr_scheduler(args, kv):
     if 'lr_factor' not in args or args.lr_factor >= 1:
         return (args.lr, None)
-    epoch_size = args.num_examples / args.batch_size
-    if 'dist' in args.kv_store:
-        epoch_size /= kv.num_workers
+    epoch_size = get_epoch_size(args, kv)
     begin_epoch = args.load_epoch if args.load_epoch else 0
     if 'pow' in args.lr_step_epochs:
         lr = args.lr
@@ -48,8 +48,10 @@ def _get_lr_scheduler(args, kv):
 
     steps = [epoch_size * (x - begin_epoch)
              for x in step_epochs if x - begin_epoch > 0]
-    return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
-
+    if steps:
+        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
+    else:
+        return (lr, None)
 
 def _load_model(args, rank=0):
     if 'load_epoch' not in args or args.load_epoch is None:
@@ -153,9 +155,17 @@ def fit(args, network, data_loader, **kwargs):
     head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
     logging.basicConfig(level=logging.DEBUG, format=head)
     logging.info('start with arguments %s', args)
+
+    epoch_size = get_epoch_size(args, kv)
 
     # data iterators
     (train, val) = data_loader(args, kv)
+    if 'dist' in args.kv_store and 'async' not in args.kv_store:
+        logging.info('Resizing training data to %d batches per machine', epoch_size)
+        # resize train iter to ensure each machine has same number of batches per epoch
+        # if not, dist_sync can hang at the end with one machine waiting for other machines
+        train = mx.io.ResizeIter(train, epoch_size)
+
     if args.test_io:
         tic = time.time()
         for i, batch in enumerate(train):
@@ -211,11 +221,7 @@ def fit(args, network, data_loader, **kwargs):
     # A limited number of optimizers have a warmup period
     has_warmup = {'lbsgd', 'lbnag'}
     if args.optimizer in has_warmup:
-        if 'dist' in args.kv_store:
-            nworkers = kv.num_workers
-        else:
-            nworkers = 1
-        epoch_size = args.num_examples / args.batch_size / nworkers
+        nworkers = kv.num_workers
         if epoch_size < 1:
             epoch_size = 1
         macrobatch_size = args.macrobatch_size
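
For illustration, the behavior this fit.py change introduces can be sketched in isolation: every worker derives the same batch count per epoch, then the training iterator is resized to exactly that count so `dist_sync` workers stay in lockstep. A minimal sketch with stand-in shapes and values (`num_examples`, `num_workers`, and `batch_size` below are assumptions, not values from the example):

```python
import math
import mxnet as mx

def get_epoch_size(num_examples, num_workers, batch_size):
    # same arithmetic as the new get_epoch_size(args, kv) helper above
    return math.ceil(int(num_examples / num_workers) / batch_size)

num_examples, num_workers, batch_size = 1000, 4, 32
epoch_size = get_epoch_size(num_examples, num_workers, batch_size)  # 8

# each worker sees roughly num_examples / num_workers samples
data = mx.nd.zeros((num_examples // num_workers, 3, 32, 32))
train = mx.io.NDArrayIter(data, batch_size=batch_size)

# pad/truncate so every machine yields exactly epoch_size batches per epoch;
# without this, dist_sync can hang with one machine waiting on the others
train = mx.io.ResizeIter(train, epoch_size)
```
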
diff --git a/example/image-classification/predict-cpp/CMakeLists.txt b/example/image-classification/predict-cpp/CMakeLists.txt
index a2f52b9df3a..c42d1917b76 100644
--- a/example/image-classification/predict-cpp/CMakeLists.txt
+++ b/example/image-classification/predict-cpp/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Check OpenCV
-if(NOT USE_OPENCV OR NOT OpenCV_FOUND)
+if(NOT USE_OPENCV OR NOT OpenCV_FOUND OR OpenCV_VERSION_MAJOR LESS 3)
   message(WARNING "\
-OpenCV should be enabled and found to build image classification example, skipping...")
+OpenCV version >= 3 should be enabled and found to build image classification example, skipping...")
   return()
 endif()
 
diff --git a/example/ssd/README.md b/example/ssd/README.md
index 55387c5fd2d..cc034689c7b 100644
--- a/example/ssd/README.md
+++ b/example/ssd/README.md
@@ -17,7 +17,8 @@ remarkable traits of MXNet.
 Due to the permission issue, this example is maintained in this [repository](https://github.com/zhreshold/mxnet-ssd) separately. You can use the link regarding specific per example [issues](https://github.com/zhreshold/mxnet-ssd/issues).
 
 ### What's new
-* Added live camera capture and detection display (run with --camera flag)
+* Added live camera capture and detection display (run with --camera flag). Example:
+    `./demo.py --camera --cpu --frame-resize 0.5`
 * Added multiple trained models.
 * Added a much simpler way to compose network from mainstream classification networks (resnet, inception...) and [Guide](symbol/README.md).
 * Update to the latest version according to caffe version, with 5% mAP increase.
diff --git a/example/ssd/dataset/cv2Iterator.py b/example/ssd/dataset/cv2Iterator.py
index 469faeac828..0af8c3272fa 100644
--- a/example/ssd/dataset/cv2Iterator.py
+++ b/example/ssd/dataset/cv2Iterator.py
@@ -26,10 +26,19 @@ class CameraIterator():
     """
     def __init__(self, capture=cv2.VideoCapture(0), frame_resize=None):
         self._capture = capture
-        self._frame_resize = frame_resize
+        self._frame_resize = None
         if frame_resize:
-            assert isinstance(frame_resize, tuple) and (len(tuple) == 2), "frame_resize should be a tuple of (x,y)"
-            self._frame_shape = (1, 3, frame_resize[0], frame_resize[1])
+            if isinstance(frame_resize, (tuple, list)) and (len(frame_resize) == 2):
+                self._frame_resize = tuple(map(int, frame_resize))
+                self._frame_shape = (1, 3, self._frame_resize[0], self._frame_resize[1])
+            elif isinstance(frame_resize, float):
+                width = int(self._capture.get(cv2.CAP_PROP_FRAME_WIDTH)*frame_resize)
+                height = int(self._capture.get(cv2.CAP_PROP_FRAME_HEIGHT)*frame_resize)
+                self._frame_shape = (1, 3, width, height)
+                self._frame_resize = (width, height)
+            else:
+                assert False, ("frame_resize should be a tuple of (x,y) pixels "
+                               "or a float setting the scaling factor")
         else:
             self._frame_shape = (1, 3,
                 int(self._capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
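
The normalization above accepts either an explicit `(x, y)` size or a float scaling factor. A standalone sketch of that logic, runnable without a camera (the 640x480 defaults are assumptions standing in for the `cv2.CAP_PROP_FRAME_WIDTH`/`HEIGHT` queries):

```python
def normalize_frame_resize(frame_resize, capture_w=640, capture_h=480):
    # mirrors CameraIterator.__init__: tuple/list -> exact pixels, float -> scale
    if frame_resize is None:
        return (capture_w, capture_h)
    if isinstance(frame_resize, (tuple, list)) and len(frame_resize) == 2:
        return tuple(map(int, frame_resize))
    if isinstance(frame_resize, float):
        return (int(capture_w * frame_resize), int(capture_h * frame_resize))
    raise ValueError("frame_resize should be a tuple of (x,y) pixels "
                     "or a float setting the scaling factor")

print(normalize_frame_resize(0.5))        # (320, 240)
print(normalize_frame_resize((320, 240))) # (320, 240)
```
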
diff --git a/example/ssd/demo.py b/example/ssd/demo.py
index 4ae8b350742..e8194ab8ead 100755
--- a/example/ssd/demo.py
+++ b/example/ssd/demo.py
@@ -109,6 +109,8 @@ def parse_args():
                         help='string of comma separated names, or text filename')
     parser.add_argument('--camera', action='store_true',
                         help="use camera for image capturing")
+    parser.add_argument('--frame-resize', type=str, default=None,
+                        help="resize camera frame to x,y pixels or a float scaling factor")
     args = parser.parse_args()
     return args
 
@@ -127,6 +129,15 @@ def parse_class_names(class_names):
         raise RuntimeError("No valid class_name provided...")
     return class_names
 
+def parse_frame_resize(x):
+    if not x:
+        return x
+    x = list(map(float, x.strip().split(',')))
+    assert len(x) >= 1 and len(x) <= 2, "frame_resize should be a float scaling factor or a tuple of w,h pixels"
+    if len(x) == 1:
+        x = x[0]
+    return x
+
 def parse_data_shape(data_shape_str):
     """Parse string to tuple or int"""
     ds = data_shape_str.strip().split(',')
@@ -160,7 +171,7 @@ def network_path(prefix, network, data_shape):
 def run_camera(args,ctx):
     assert args.batch_size == 1, "only batch size of 1 is supported"
     logging.info("Detection threshold is {}".format(args.thresh))
-    iter = CameraIterator()
+    iter = CameraIterator(frame_resize=parse_frame_resize(args.frame_resize))
     class_names = parse_class_names(args.class_names)
     mean_pixels = (args.mean_r, args.mean_g, args.mean_b)
     data_shape = int(args.data_shape)
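
A quick check of how the new `--frame-resize` flag is parsed before being handed to `CameraIterator` (a self-contained sketch; the function body is the one added above):

```python
def parse_frame_resize(x):
    # "0.5" -> 0.5 (scaling factor); "320,240" -> [320.0, 240.0] (pixels)
    if not x:
        return x
    x = list(map(float, x.strip().split(',')))
    assert 1 <= len(x) <= 2, \
        "frame_resize should be a float scaling factor or a tuple of w,h pixels"
    return x[0] if len(x) == 1 else x

print(parse_frame_resize("0.5"))      # 0.5
print(parse_frame_resize("320,240"))  # [320.0, 240.0]
```
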
diff --git a/make/config.mk b/make/config.mk
index dd67c33cc9e..b65f77c605f 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -37,9 +37,15 @@
 # choice of compiler
 #--------------------
 
+ifndef CC
 export CC = gcc
+endif
+ifndef CXX
 export CXX = g++
+endif
+ifndef NVCC
 export NVCC = nvcc
+endif
 
 # whether compile with options for MXNet developer
 DEV = 0
diff --git a/make/crosscompile.armv7.mk b/make/crosscompile.armv7.mk
new file mode 100644
index 00000000000..47402451d8d
--- /dev/null
+++ b/make/crosscompile.armv7.mk
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet
+#
+#  If you want to change the configuration, please use the following
+#  steps. Assume you are on the root directory of mxnet. First copy this
+#  file so that any local changes will be ignored by git
+#
+#  $ cp make/config.mk .
+#
+#  Next modify the relevant entries, and then compile by
+#
+#  $ make
+#
+#  or build in parallel with 8 threads
+#
+#  $ make -j8
+#-------------------------------------------------------------------------------
+
+#---------------------
+# For cross compilation, we only explicitly set a compiler when one is not already present.
+#--------------------
+
+ifndef CC
+export CC = gcc
+endif
+ifndef CXX
+export CXX = g++
+endif
+ifndef NVCC
+export NVCC = nvcc
+endif
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to turn on segfault signal handler to log the stack trace
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS =
+
+# the additional compile flags you want to add
+ADD_CFLAGS =
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+USE_CUDA_PATH = NONE
+
+# whether to enable CUDA runtime compilation
+ENABLE_CUDA_RTC = 0
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+#whether to use NCCL library
+USE_NCCL = 0
+#add the path to NCCL library
+USE_NCCL_PATH = NONE
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# the imbin iterator
+USE_OPENCV = 0
+
+#whether use libjpeg-turbo for image decode without OpenCV wrapper
+USE_LIBJPEG_TURBO = 0
+#add the path to libjpeg-turbo library
+USE_LIBJPEG_TURBO_PATH = NONE
+
+# use openmp for parallelization
+USE_OPENMP = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+UNAME_S := $(shell uname -s)
+USE_BLAS = openblas
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH =
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL only for BLAS, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+USE_SSE = 0
+
+# Turn off F16C instruction set support
+USE_F16C = 0
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 0
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM = NONE
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 0
+
+#----------------------------
+# performance settings
+#----------------------------
+# Use operator tuning
+USE_OPERATOR_TUNING = 1
+
+# Use gperftools if found
+USE_GPERFTOOLS = 1
+
+# Use JEMalloc if found, and not using gperftools
+USE_JEMALLOC = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+#----------------------------
+# other features
+#----------------------------
+
+# Create C++ interface package
+USE_CPP_PACKAGE = 0
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires building sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index acc9c4a5a8a..5bb4961bf01 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -57,10 +57,10 @@ DEBUG = 0
 USE_SIGNAL_HANDLER = 1
 
 # the additional link flags you want to add
-ADD_LDFLAGS =
+ADD_LDFLAGS = -L${CROSS_ROOT}/lib
 
 # the additional compile flags you want to add
-ADD_CFLAGS =
+ADD_CFLAGS = -I${CROSS_ROOT}/include
 
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU
diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py
index 818aa3d2a3b..06b91fadcee 100644
--- a/python/mxnet/gluon/utils.py
+++ b/python/mxnet/gluon/utils.py
@@ -171,7 +171,7 @@ def check_sha1(filename, sha1_hash):
     return sha1.hexdigest() == sha1_hash
 
 
-def download(url, path=None, overwrite=False, sha1_hash=None):
+def download(url, path=None, overwrite=False, sha1_hash=None, retries=5):
     """Download an given URL
 
     Parameters
@@ -186,6 +186,8 @@ def download(url, path=None, overwrite=False, sha1_hash=None):
     sha1_hash : str, optional
         Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
         but doesn't match.
+    retries : integer, default 5
+        The number of times to attempt the download in case of failure or non-200 return codes.
 
     Returns
     -------
@@ -200,26 +202,37 @@ def download(url, path=None, overwrite=False, sha1_hash=None):
             fname = os.path.join(path, url.split('/')[-1])
         else:
             fname = path
+    assert retries >= 0, "Number of retries should be at least 0"
 
     if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
         dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
         if not os.path.exists(dirname):
             os.makedirs(dirname)
-
-        print('Downloading %s from %s...'%(fname, url))
-        r = requests.get(url, stream=True)
-        if r.status_code != 200:
-            raise RuntimeError("Failed downloading url %s"%url)
-        with open(fname, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=1024):
-                if chunk: # filter out keep-alive new chunks
-                    f.write(chunk)
-
-        if sha1_hash and not check_sha1(fname, sha1_hash):
-            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
-                              'The repo may be outdated or download may be incomplete. ' \
-                              'If the "repo_url" is overridden, consider switching to ' \
-                              'the default repo.'.format(fname))
+        while retries+1 > 0:
+            # Disable pylint too-broad-exception warning
+            # pylint: disable=W0703
+            try:
+                print('Downloading %s from %s...'%(fname, url))
+                r = requests.get(url, stream=True)
+                if r.status_code != 200:
+                    raise RuntimeError("Failed downloading url %s"%url)
+                with open(fname, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=1024):
+                        if chunk: # filter out keep-alive new chunks
+                            f.write(chunk)
+                if sha1_hash and not check_sha1(fname, sha1_hash):
+                    raise UserWarning('File {} is downloaded but the content hash does not match.'\
+                                      ' The repo may be outdated or download may be incomplete. '\
+                                      'If the "repo_url" is overridden, consider switching to '\
+                                      'the default repo.'.format(fname))
+                break
+            except Exception as e:
+                retries -= 1
+                if retries <= 0:
+                    raise e
+                else:
+                    print("download failed, retrying, {} attempt{} left"
+                          .format(retries, 's' if retries > 1 else ''))
 
     return fname
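
Usage stays the same from the caller's side; the new `retries` keyword bounds the total number of download attempts, with the GET re-attempted on exceptions and non-200 status codes. A small example using the same URL the new unit test downloads:

```python
import mxnet as mx

fname = mx.gluon.utils.download(
    "https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md",
    path="README.md", retries=5)
```
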
 
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index bcdcc9c6408..686802d3c48 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1367,7 +1367,7 @@ def list_gpus():
             pass
     return range(len([i for i in re.split('\n') if 'GPU' in i]))
 
-def download(url, fname=None, dirname=None, overwrite=False):
+def download(url, fname=None, dirname=None, overwrite=False, retries=5):
     """Download an given URL
 
     Parameters
@@ -1385,12 +1385,17 @@ def download(url, fname=None, dirname=None, overwrite=False):
         Default is false, which means skipping download if the local file
         exists. If true, then download the url to overwrite the local file if
         exists.
+    retries : integer, default 5
+        The number of times to attempt the download in case of failure or non-200 return codes.
 
     Returns
     -------
     str
         The filename of the downloaded file
     """
+
+    assert retries >= 0, "Number of retries should be at least 0"
+
     if fname is None:
         fname = url.split('/')[-1]
 
@@ -1411,12 +1416,24 @@ def download(url, fname=None, dirname=None, overwrite=False):
         logging.info("%s exists, skipping download", fname)
         return fname
 
-    r = requests.get(url, stream=True)
-    assert r.status_code == 200, "failed to open %s" % url
-    with open(fname, 'wb') as f:
-        for chunk in r.iter_content(chunk_size=1024):
-            if chunk: # filter out keep-alive new chunks
-                f.write(chunk)
+    while retries+1 > 0:
+        # Disable pylint too-broad-exception warning
+        # pylint: disable=W0703
+        try:
+            r = requests.get(url, stream=True)
+            assert r.status_code == 200, "failed to open %s" % url
+            with open(fname, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk: # filter out keep-alive new chunks
+                        f.write(chunk)
+                break
+        except Exception as e:
+            retries -= 1
+            if retries <= 0:
+                raise e
+            else:
+                print("download failed, retrying, {} attempt{} left"
+                      .format(retries, 's' if retries > 1 else ''))
     logging.info("downloaded %s into %s successfully", url, fname)
     return fname
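
The `mx.test_utils.download` counterpart gains the same retry loop; note it takes `fname`/`dirname` rather than `path` (example mirroring the new test):

```python
import mxnet as mx

fname = mx.test_utils.download(
    "https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md",
    fname="README.md", retries=5)
```
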
 
diff --git a/src/operator/contrib/bounding_box-inl.cuh b/src/operator/contrib/bounding_box-inl.cuh
new file mode 100644
index 00000000000..fb1dacc11f4
--- /dev/null
+++ b/src/operator/contrib/bounding_box-inl.cuh
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file bounding_box-inl.cuh
+ * \brief bounding box CUDA operators
+ * \author Joshua Zhang
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
+#define MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
+#include <mxnet/operator_util.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename DType>
+struct valid_score {
+  DType thresh;
+  explicit valid_score(DType _thresh) : thresh(_thresh) {}
+  __host__ __device__ bool operator()(const DType x) {
+    return x > thresh;
+  }
+};
+
+template<typename DType>
+int FilterScores(mshadow::Tensor<gpu, 1, DType> out_scores,
+                 mshadow::Tensor<gpu, 1, DType> out_sorted_index,
+                 mshadow::Tensor<gpu, 1, DType> scores,
+                 mshadow::Tensor<gpu, 1, DType> sorted_index,
+                 float valid_thresh) {
+  valid_score<DType> pred(static_cast<DType>(valid_thresh));
+  DType * end_scores = thrust::copy_if(thrust::device, scores.dptr_, scores.dptr_ + scores.MSize(),
+                                       out_scores.dptr_, pred);
+  thrust::copy_if(thrust::device, sorted_index.dptr_, sorted_index.dptr_ + sorted_index.MSize(),
+                  scores.dptr_, out_sorted_index.dptr_, pred);
+  return end_scores - out_scores.dptr_;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
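
The two `thrust::copy_if` calls above compact the scores and their sort indices with the same predicate, so both outputs stay aligned. In NumPy terms (an illustrative sketch of the semantics, not the GPU code path):

```python
import numpy as np

def filter_scores(scores, sorted_index, valid_thresh):
    # keep entries whose score exceeds valid_thresh, preserving order
    keep = scores > valid_thresh
    return scores[keep], sorted_index[keep], int(keep.sum())

scores = np.array([0.9, 0.2, 0.7, 0.1])
idx = np.arange(4, dtype=np.float32)
print(filter_scores(scores, idx, 0.5))
# (array([0.9, 0.7]), array([0., 2.], dtype=float32), 2)
```
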
diff --git a/src/operator/contrib/bounding_box-inl.h b/src/operator/contrib/bounding_box-inl.h
index 40dbdd81669..f739dbc8a52 100644
--- a/src/operator/contrib/bounding_box-inl.h
+++ b/src/operator/contrib/bounding_box-inl.h
@@ -49,6 +49,7 @@ enum BoxNMSOpResource {kTempSpace};
 
 struct BoxNMSParam : public dmlc::Parameter<BoxNMSParam> {
   float overlap_thresh;
+  float valid_thresh;
   int topk;
   int coord_start;
   int score_index;
@@ -59,6 +60,8 @@ struct BoxNMSParam : public dmlc::Parameter<BoxNMSParam> {
   DMLC_DECLARE_PARAMETER(BoxNMSParam) {
     DMLC_DECLARE_FIELD(overlap_thresh).set_default(0.5)
     .describe("Overlapping(IoU) threshold to suppress object with smaller score.");
+    DMLC_DECLARE_FIELD(valid_thresh).set_default(0)
+    .describe("Filter input boxes to those whose scores greater than valid_thresh.");
     DMLC_DECLARE_FIELD(topk).set_default(-1)
     .describe("Apply nms to topk boxes with descending scores, -1 to no restriction.");
     DMLC_DECLARE_FIELD(coord_start).set_default(2)
@@ -145,6 +148,33 @@ inline uint32_t BoxNMSNumVisibleOutputs(const NodeAttrs& attrs) {
   return static_cast<uint32_t>(1);
 }
 
+template<typename DType>
+int FilterScores(mshadow::Tensor<cpu, 1, DType> out_scores,
+                 mshadow::Tensor<cpu, 1, DType> out_sorted_index,
+                 mshadow::Tensor<cpu, 1, DType> scores,
+                 mshadow::Tensor<cpu, 1, DType> sorted_index,
+                 float valid_thresh) {
+  index_t j = 0;
+  for (index_t i = 0; i < scores.size(0); i++) {
+    if (scores[i] > valid_thresh) {
+      out_scores[j] = scores[i];
+      out_sorted_index[j] = sorted_index[i];
+      j++;
+    }
+  }
+  return j;
+}
+
+namespace mshadow_op {
+struct less_than : public mxnet_op::tunable {
+  // a is x, b is sigma
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    return static_cast<DType>(a < b);
+  }
+};  // struct less_than
+}   // namespace mshadow_op
+
 struct corner_to_center {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType *data, int stride) {
@@ -198,15 +228,28 @@ MSHADOW_XINLINE DType BoxArea(const DType *box, int encode) {
   }
 }
 
-// compute areas specialized for nms to reduce computation
+/*!
+ * \brief compute areas specialized for nms to reduce computation
+ * 
+ * \param i the launched thread index (num_batch * topk threads in total)
+ * \param out 1d array for areas (size num_batch * num_elem)
+ * \param in 1st coordinate of 1st box (buffer + coord_start)
+ * \param indices index to areas and in buffer (sorted_index)
+ * \param batch_start map (b, k) to compact index by indices[batch_start[b] + k]
+ * \param topk effective batch size of boxes, to be mapped to real index
+ * \param stride should be width_elem (e.g. 6 including cls and scores)
+ * \param encode passed to BoxArea to compute area
+ */
 struct compute_area {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
-                                  const DType *indices, int topk, int num_elem,
-                                  int stride, int encode) {
+                                  const DType *indices, const DType *batch_start,
+                                  int topk, int num_elem, int stride, int encode) {
     int b = i / topk;
     int k = i % topk;
-    int index = static_cast<int>(indices[b * num_elem + k]);
+    int pos = static_cast<int>(batch_start[b]) + k;
+    if (pos >= static_cast<int>(batch_start[b + 1])) return;
+    int index = static_cast<int>(indices[pos]);
     int in_index = index * stride;
     out[index] = BoxArea(in + in_index, encode);
   }
@@ -243,6 +286,7 @@ MSHADOW_XINLINE DType Intersect(const DType *a, const DType *b, int encode) {
    *
    * \param i the launched thread index
    * \param index sorted index in descending order
+   * \param batch_start map (b, k) to compact index by indices[batch_start[b] + k]
    * \param input the input of nms op
    * \param areas pre-computed box areas
    * \param k nms topk number
@@ -254,20 +298,25 @@ MSHADOW_XINLINE DType Intersect(const DType *a, const DType *b, int encode) {
    * \param force force suppress regardless of class id
    * \param offset_id class id offset, used when force == false, usually 0
    * \param encode box encoding type, corner(0) or center(1)
-   * \tparam DType the data type
+   * \tparam DType the data type
    */
 struct nms_impl {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *index, const DType *input,
-                                  const DType *areas, int k, int ref, int num,
+  MSHADOW_XINLINE static void Map(int i, DType *index, const DType *batch_start,
+                                  const DType *input, const DType *areas,
+                                  int k, int ref, int num,
                                   int stride, int offset_box, int offset_id,
                                   float thresh, bool force, int encode) {
     int b = i / k;  // batch
     int pos = i % k + ref + 1;  // position
-    if (index[b * num + ref] < 0) return;  // reference has been suppressed
-    if (index[b * num + pos] < 0) return;  // self been suppressed
-    int ref_offset = static_cast<int>(index[b * num + ref]) * stride + offset_box;
-    int pos_offset = static_cast<int>(index[b * num + pos]) * stride + offset_box;
+    ref = static_cast<int>(batch_start[b]) + ref;
+    pos = static_cast<int>(batch_start[b]) + pos;
+    if (ref >= static_cast<int>(batch_start[b + 1])) return;
+    if (pos >= static_cast<int>(batch_start[b + 1])) return;
+    if (index[ref] < 0) return;  // reference has been suppressed
+    if (index[pos] < 0) return;  // self been suppressed
+    int ref_offset = static_cast<int>(index[ref]) * stride + offset_box;
+    int pos_offset = static_cast<int>(index[pos]) * stride + offset_box;
     if (!force && offset_id >=0) {
       int ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
       int pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
@@ -275,23 +324,38 @@ struct nms_impl {
     }
     DType intersect = Intersect(input + ref_offset, input + pos_offset, encode);
     intersect *= Intersect(input + ref_offset + 1, input + pos_offset + 1, encode);
-    int ref_area_offset = static_cast<int>(index[b * num + ref]);
-    int pos_area_offset = static_cast<int>(index[b * num + pos]);
+    int ref_area_offset = static_cast<int>(index[ref]);
+    int pos_area_offset = static_cast<int>(index[pos]);
     DType iou = intersect / (areas[ref_area_offset] + areas[pos_area_offset] -
       intersect);
     if (iou > thresh) {
-      index[b * num + pos] = -1;
+      index[pos] = -1;
     }
   }
 };
 
+/*!
+   * \brief Assign output of nms by indexing input
+   * 
+   * \param i the launched thread index (num_batch threads in total)
+   * \param out output array [cls, conf, b0, b1, b2, b3]
+   * \param record bookkeeping of the selected indices for backward
+   * \param index compact sorted_index, use batch_start to access
+   * \param batch_start map(b, k) to compact index by index[batch_start[b] + k]
+   * \param k nms topk number
+   * \param num number of input boxes in each batch
+   * \param stride input stride, usually 6 (id-score-x1-y1-x2-y2)
+   */
 struct nms_assign {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType *out, DType *record, const DType *input,
-                                  const DType *index, int k, int num, int stride) {
+                                  const DType *index, const DType *batch_start,
+                                  int k, int num, int stride) {
     int count = 0;
     for (int j = 0; j < k; ++j) {
-      int location = static_cast<int>(index[i * num + j]);
+      int pos = static_cast<int>(batch_start[i]) + j;
+      if (pos >= static_cast<int>(batch_start[i + 1])) return;
+      int location = static_cast<int>(index[pos]);
       if (location >= 0) {
         // copy to output
         int out_location = (i * num + count) * stride;
@@ -352,6 +416,8 @@ void BoxNMSForward(const nnvm::NodeAttrs& attrs,
     Shape<1> sort_index_shape = Shape1(num_batch * num_elem);
     Shape<3> buffer_shape = Shape3(num_batch, num_elem, width_elem);
     index_t workspace_size = 4 * sort_index_shape.Size();
+    Shape<1> batch_start_shape = Shape1(num_batch + 1);
+    workspace_size += batch_start_shape.Size();
     if (req[0] == kWriteInplace) {
       workspace_size += buffer_shape.Size();
     }
@@ -363,10 +429,11 @@ void BoxNMSForward(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 1, DType> batch_id(scores.dptr_ + scores.MSize(), sort_index_shape,
       s);
     Tensor<xpu, 1, DType> areas(batch_id.dptr_ + batch_id.MSize(), sort_index_shape, s);
+    Tensor<xpu, 1, DType> batch_start(areas.dptr_ + areas.MSize(), batch_start_shape, s);
     Tensor<xpu, 3, DType> buffer = data;
     if (req[0] == kWriteInplace) {
       // make copy
-      buffer = Tensor<xpu, 3, DType>(areas.dptr_ + areas.MSize(), buffer_shape, s);
+      buffer = Tensor<xpu, 3, DType>(batch_start.dptr_ + batch_start.MSize(), buffer_shape, s);
       buffer = F<mshadow_op::identity>(data);
     }
 
@@ -382,19 +449,51 @@ void BoxNMSForward(const nnvm::NodeAttrs& attrs,
       record = reshape(range<DType>(0, num_batch * num_elem), record.shape_);
       return;
     }
-    scores = reshape(slice<2>(buffer, score_index, score_index + 1), scores.shape_);
-    sorted_index = range<DType>(0, num_batch * num_elem);
-    mxnet::op::SortByKey(scores, sorted_index, false);
-    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(num_elem));
-    mxnet::op::SortByKey(batch_id, scores, true);
-    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(num_elem));
-    mxnet::op::SortByKey(batch_id, sorted_index, true);
+
+    // use batch_id and areas as temporary storage
+    Tensor<xpu, 1, DType> all_scores = batch_id;
+    Tensor<xpu, 1, DType> all_sorted_index = areas;
+    all_scores = reshape(slice<2>(buffer, score_index, score_index + 1), all_scores.shape_);
+    all_sorted_index = range<DType>(0, num_batch * num_elem);
+
+    // filter scores but keep original sorted_index value
+    // move valid score and index to the front, return valid size
+    int num_valid = mxnet::op::FilterScores(scores, sorted_index, all_scores, all_sorted_index,
+                                            param.valid_thresh);
+    // if everything is filtered, output -1
+    if (num_valid == 0) {
+      record = -1;
+      out = -1;
+      return;
+    }
+    // mark the invalid boxes before nms
+    if (num_valid < num_batch * num_elem) {
+      slice<0>(sorted_index, num_valid, num_batch * num_elem) = -1;
+    }
+
+    // only sort the valid scores and batch_id
+    Shape<1> valid_score_shape = Shape1(num_valid);
+    Tensor<xpu, 1, DType> valid_scores(scores.dptr_, valid_score_shape, s);
+    Tensor<xpu, 1, DType> valid_sorted_index(sorted_index.dptr_, valid_score_shape, s);
+    Tensor<xpu, 1, DType> valid_batch_id(batch_id.dptr_, valid_score_shape, s);
+
+    // sort index by batch_id then score (stable sort)
+    mxnet::op::SortByKey(valid_scores, valid_sorted_index, false);
+    valid_batch_id = F<mshadow_op::floor>(valid_sorted_index / ScalarExp<DType>(num_elem));
+    mxnet::op::SortByKey(valid_batch_id, valid_sorted_index, true);
+
+    // calculate batch_start: accumulated sum to denote 1st sorted_index for a given batch_index
+    valid_batch_id = F<mshadow_op::floor>(valid_sorted_index / ScalarExp<DType>(num_elem));
+    for (int b = 0; b < num_batch + 1; b++) {
+      slice<0>(batch_start, b, b + 1) = reduce_keepdim<red::sum, false>(
+        F<mshadow_op::less_than>(valid_batch_id, ScalarExp<DType>(b)), 0);
+    }
 
     // pre-compute areas of candidates
     areas = 0;
-    Kernel<compute_area, xpu>::Launch(s, num_batch * topk, areas.dptr_,
-     buffer.dptr_ + coord_start, sorted_index.dptr_, topk, num_elem, width_elem,
-     param.in_format);
+    Kernel<compute_area, xpu>::Launch(s, num_batch * topk,
+     areas.dptr_, buffer.dptr_ + coord_start, sorted_index.dptr_, batch_start.dptr_,
+     topk, num_elem, width_elem, param.in_format);
 
     // apply nms
     // go through each box as reference, suppress if overlap > threshold
@@ -402,16 +501,19 @@ void BoxNMSForward(const nnvm::NodeAttrs& attrs,
     for (int ref = 0; ref < topk; ++ref) {
       int num_worker = topk - ref - 1;
       if (num_worker < 1) continue;
-      Kernel<nms_impl, xpu>::Launch(s, num_batch * num_worker, sorted_index.dptr_,
-        buffer.dptr_, areas.dptr_, num_worker, ref, num_elem, width_elem,
-        coord_start, id_index, param.overlap_thresh, param.force_suppress, param.in_format);
+      Kernel<nms_impl, xpu>::Launch(s, num_batch * num_worker,
+        sorted_index.dptr_, batch_start.dptr_, buffer.dptr_, areas.dptr_,
+        num_worker, ref, num_elem,
+        width_elem, coord_start, id_index,
+        param.overlap_thresh, param.force_suppress, param.in_format);
     }
 
     // store the results to output, keep a record for backward
     record = -1;
     out = -1;
-    Kernel<nms_assign, xpu>::Launch(s, num_batch, out.dptr_, record.dptr_,
-      buffer.dptr_, sorted_index.dptr_, topk, num_elem, width_elem);
+    Kernel<nms_assign, xpu>::Launch(s, num_batch,
+      out.dptr_, record.dptr_, buffer.dptr_, sorted_index.dptr_, batch_start.dptr_,
+      topk, num_elem, width_elem);
 
     // convert encoding
     if (param.in_format != param.out_format) {
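
Put together, `valid_thresh` pre-filters low-scoring boxes before the (optionally `topk`-limited) suppression runs. A minimal sketch of the resulting Python-facing behavior after this change, using the same `[cls, score, x1, y1, x2, y2]` box layout as the new unit test:

```python
import mxnet as mx

boxes = mx.nd.array([[[1, 1.0, 0, 0, 10, 10],
                      [1, 0.4, 0, 0, 10, 10],
                      [1, 0.3, 0, 0, 10, 10]]])
# boxes scoring <= 0.5 are dropped up front; survivors go through IoU NMS
out = mx.contrib.nd.box_nms(boxes, overlap_thresh=0.5, valid_thresh=0.5,
                            topk=2, coord_start=2, score_index=1, id_index=0)
print(out)  # only the top box survives; suppressed slots are filled with -1
```
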
diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu
index 6662d932700..2677d2f7947 100644
--- a/src/operator/contrib/bounding_box.cu
+++ b/src/operator/contrib/bounding_box.cu
@@ -24,6 +24,7 @@
   * \author Joshua Zhang
   */
 
+#include "./bounding_box-inl.cuh"
 #include "./bounding_box-inl.h"
 #include "../elemwise_op_common.h"
 
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index 65a320ded16..9f61212d5c7 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -89,8 +89,11 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   int dtype = inputs[conv::kData].type_flag_;
 
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
+#if CUDNN_MAJOR < 5
+  if (param.layout.value() != mshadow::kNCW &&
+      param.layout.value() != mshadow::kNCHW &&
+      param.layout.value() != mshadow::kNCDHW) {
+    // Need CuDNN >= 5.0 for layout support; otherwise use the MXNet implementation
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
       ConvolutionOp<gpu, DType> op;
       op.Init(param);
@@ -98,6 +101,8 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
     })
     return;
   }
+#endif
+
 #if MXNET_USE_CUDNN == 0 || CUDNN_MAJOR < 7
   if (param.num_filter == param.num_group &&
       param.layout.value() == mshadow::kNCHW &&
@@ -162,8 +167,11 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
   const std::vector<TBlob> &in_grad = outputs;
   int dtype = out_grad.type_flag_;
 
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
+#if CUDNN_MAJOR < 5
+  if (param.layout.value() != mshadow::kNCW &&
+      param.layout.value() != mshadow::kNCHW &&
+      param.layout.value() != mshadow::kNCDHW) {
+    // Need CuDNN >= 5.0 for layout support; otherwise use the MXNet implementation
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
       ConvolutionOp<gpu, DType> op;
       op.Init(param);
@@ -171,6 +179,7 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
     })
     return;
   }
+#endif
 #if MXNET_USE_CUDNN == 0 || CUDNN_MAJOR < 7
   if (param.num_filter == param.num_group &&
       param.layout.value() == mshadow::kNCHW &&
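
The fallback being gated here is MXNet's own convolution path, previously taken for every 1-D kernel and now only required when CuDNN is older than 5.0 and the layout is not one of NCW/NCHW/NCDHW. For reference, a 1-D convolution in NCW layout from Python (shapes are illustrative):

```python
import mxnet as mx

data = mx.nd.random.uniform(shape=(1, 3, 16))   # (batch, channel, width)
weight = mx.nd.random.uniform(shape=(8, 3, 3))  # (num_filter, channel, kernel)
bias = mx.nd.zeros((8,))
out = mx.nd.Convolution(data=data, weight=weight, bias=bias,
                        kernel=(3,), num_filter=8, layout='NCW')
print(out.shape)  # (1, 8, 14)
```
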
diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py
index 5618e11a040..a220f08d20d 100644
--- a/tests/python/unittest/test_contrib_operator.py
+++ b/tests/python/unittest/test_contrib_operator.py
@@ -26,20 +26,20 @@
 import unittest
 
 def test_box_nms_op():
-    def test_box_nms_forward(data, expected, thresh=0.5, topk=-1, coord=2, score=1, cid=0,
+    def test_box_nms_forward(data, expected, thresh=0.5, valid=0, topk=-1, coord=2, score=1, cid=0,
                          force=False, in_format='corner', out_format='corner'):
         data = mx.nd.array(data)
-        out = mx.contrib.nd.box_nms(data, overlap_thresh=thresh, topk=topk,
+        out = mx.contrib.nd.box_nms(data, overlap_thresh=thresh, valid_thresh=valid, topk=topk,
                                 coord_start=coord, score_index=score, id_index=cid,
                                 force_suppress=force, in_format=in_format, out_format=out_format)
         assert_almost_equal(out.asnumpy(), expected)
 
-    def test_box_nms_backward(data, grad, expected, thresh=0.5, topk=-1, coord=2, score=1,
+    def test_box_nms_backward(data, grad, expected, thresh=0.5, valid=0, topk=-1, coord=2, score=1,
                           cid=0, force=False, in_format='corner', out_format='corner'):
         in_var = mx.sym.Variable('data')
         arr_data = mx.nd.array(data)
         arr_grad = mx.nd.empty(arr_data.shape)
-        op = mx.contrib.sym.box_nms(in_var, overlap_thresh=thresh, topk=topk,
+        op = mx.contrib.sym.box_nms(in_var, overlap_thresh=thresh, valid_thresh=valid, topk=topk,
                                 coord_start=coord, score_index=score, id_index=cid,
                                 force_suppress=force, in_format=in_format, out_format=out_format)
         exe = op.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad])
@@ -158,6 +158,23 @@ def swap_position(data, expected, coord=2, score=1, cid=0, new_col=0):
     thresh = 0.5
     test_box_nms_forward(np.array(boxes), np.array(expected), force=force, thresh=thresh, cid=-1)
 
+    # case8: multi-batch thresh + topk
+    boxes8 = [[[1, 1, 0, 0, 10, 10], [1, 0.4, 0, 0, 10, 10], [1, 0.3, 0, 0, 10, 10]],
+              [[2, 1, 0, 0, 10, 10], [2, 0.4, 0, 0, 10, 10], [2, 0.3, 0, 0, 10, 10]],
+              [[3, 1, 0, 0, 10, 10], [3, 0.4, 0, 0, 10, 10], [3, 0.3, 0, 0, 10, 10]]]
+    expected8 = [[[1, 1, 0, 0, 10, 10], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]],
+                 [[2, 1, 0, 0, 10, 10], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]],
+                 [[3, 1, 0, 0, 10, 10], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]
+    grad8 = np.random.rand(3, 3, 6)
+    expected_in_grad8 = np.zeros((3, 3, 6))
+    expected_in_grad8[(0, 1, 2), (0, 0, 0), :] = grad8[(0, 1, 2), (0, 0, 0), :]
+    force = False
+    thresh = 0.5
+    valid = 0.5
+    topk = 2
+    test_box_nms_forward(np.array(boxes8), np.array(expected8), force=force, thresh=thresh, valid=valid, topk=topk)
+    test_box_nms_backward(np.array(boxes8), grad8, expected_in_grad8, force=force, thresh=thresh, valid=valid, topk=topk)
+
 def test_box_iou_op():
     def numpy_box_iou(a, b, fmt='corner'):
         def area(left, top, right, bottom):
diff --git a/tests/python/unittest/test_gluon_utils.py b/tests/python/unittest/test_gluon_utils.py
new file mode 100644
index 00000000000..a5d3b1401a3
--- /dev/null
+++ b/tests/python/unittest/test_gluon_utils.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import tempfile
+
+import mxnet as mx
+from nose.tools import *
+
+
+@raises(Exception)
+def test_download_retries():
+    mx.gluon.utils.download("http://doesnotexist.notfound")
+
+def test_download_successful():
+    tmp = tempfile.mkdtemp()
+    tmpfile = os.path.join(tmp, 'README.md')
+    mx.gluon.utils.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md",
+                            path=tmpfile)
+    assert os.path.getsize(tmpfile) > 100
\ No newline at end of file
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index a0604658ee1..7c50a424b40 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -711,9 +711,8 @@ def get_values(ensure_unique):
                  k=dat_size*dat_size*dat_size*dat_size, is_ascend=False)
     assert_almost_equal(nd_ret_argsort, gt)
 
-    # test topk with a big shape
-    a = mx.nd.arange(0, 54686454, step=1, repeat=1)
-    assert_almost_equal(a.topk(k=54686454).asnumpy(), a.asnumpy()[::-1])
+    a = mx.nd.arange(0, 1024, step=1, repeat=1)
+    assert_almost_equal(a.topk(k=1024).asnumpy(), a.asnumpy()[::-1])
 
     # Repeat those tests that don't involve indices.  These should pass even with
     # duplicated input data values (over many repeated runs with different random seeds,
diff --git a/tests/python/unittest/test_test_utils.py b/tests/python/unittest/test_test_utils.py
new file mode 100644
index 00000000000..49f0b932fdd
--- /dev/null
+++ b/tests/python/unittest/test_test_utils.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import tempfile
+
+import mxnet as mx
+from nose.tools import *
+
+
+@raises(Exception)
+def test_download_retries():
+    mx.test_utils.download("http://doesnotexist.notfound")
+
+def test_download_successful():
+    tmp = tempfile.mkdtemp()
+    tmpfile = os.path.join(tmp, 'README.md')
+    mx.test_utils.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md",
+                           fname=tmpfile)
+    assert os.path.getsize(tmpfile) > 100
\ No newline at end of file


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services