Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/08/30 17:27:56 UTC

[GitHub] larroy closed pull request #12276: [Don't merge][Review] A solution to prevent zombie containers locally and in CI

URL: https://github.com/apache/incubator-mxnet/pull/12276

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/ci/build.py b/ci/build.py
index a9d6a63537f..169c9d57321 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -23,7 +23,7 @@
 """
 
 __author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy'
-__version__ = '0.1'
+__version__ = '0.3'
 
 import argparse
 import glob
@@ -37,17 +37,101 @@
 import platform
 from copy import deepcopy
 from itertools import chain
-from subprocess import call, check_call
+from subprocess import call, check_call, check_output
 from typing import *
 from util import *
-
-CCACHE_MAXSIZE = '500G'
-
-def under_ci() -> bool:
-    """:return: True if we run in Jenkins."""
-    return 'JOB_NAME' in os.environ
-
-def get_platforms(path: Optional[str] = "docker"):
+import pprint
+import requests
+import docker
+import docker.models
+import docker.errors
+import signal
+import atexit
+
+
+class Cleanup:
+    """A class to cleanup containers"""
+    def __init__(self):
+        self.containers = set()
+        self.docker_stop_timeout = 3
+
+    def add_container(self, container: docker.models.containers.Container):
+        assert isinstance(container, docker.models.containers.Container)
+        self.containers.add(container)
+
+    def remove_container(self, container: docker.models.containers.Container):
+        assert isinstance(container, docker.models.containers.Container)
+        self.containers.remove(container)
+
+    def _cleanup_containers(self):
+        if self.containers:
+            logging.warning("Cleaning up containers")
+        else:
+            return
+        try:
+            stop_timeout = int(os.environ.get("DOCKER_STOP_TIMEOUT", self.docker_stop_timeout))
+        except Exception:
+            stop_timeout = 3
+        for container in self.containers:
+            try:
+                container.stop(timeout=stop_timeout)
+                logging.info("☠: stopped container %s", trim_container_id(container.id))
+                container.remove()
+                logging.info("🚽: removed container %s", trim_container_id(container.id))
+            except Exception as e:
+                logging.exception(e)
+        self.containers.clear()
+        logging.info("Cleaning up containers finished.")
+
+    def __call__(self):
+        """Perform cleanup"""
+        self._cleanup_containers()
+
+
+def retry(ExceptionToCheck, tries=4, delay_s=1, backoff=2):
+    """Retry calling the decorated function using an exponential backoff.
+
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+    :param ExceptionToCheck: the exception to check. may be a tuple of
+        exceptions to check
+    :type ExceptionToCheck: Exception or tuple
+    :param tries: number of times to try (not retry) before giving up
+    :type tries: int
+    :param delay_s: initial delay between retries in seconds
+    :type delay_s: int
+    :param backoff: backoff multiplier e.g. value of 2 will double the delay
+        each retry
+    :type backoff: int
+    """
+    import time
+    from functools import wraps
+    def decorated_retry(f):
+        @wraps(f)
+        def f_retry(*args, **kwargs):
+            mtries, mdelay = tries, delay_s
+            while mtries > 1:
+                try:
+                    return f(*args, **kwargs)
+                except ExceptionToCheck as e:
+                    logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay)
+                    time.sleep(mdelay)
+                    mtries -= 1
+                    mdelay *= backoff
+            return f(*args, **kwargs)
+
+        return f_retry  # true decorator
+
+    return decorated_retry
+
+def get_dockerfiles_path() -> str:
+    return "docker"
+
+
+def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
     """Get a list of architectures given our dockerfiles"""
     dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
     dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
@@ -57,10 +141,11 @@ def get_platforms(path: Optional[str] = "docker"):
 
 
 def get_docker_tag(platform: str, registry: str) -> str:
+    """:return: docker tag to be used for the container"""
     return "{0}/build.{1}".format(registry, platform)
 
 
-def get_dockerfile(platform: str, path="docker") -> str:
+def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
     return os.path.join(path, "Dockerfile.build.{0}".format(platform))
 
 
@@ -68,18 +153,18 @@ def get_docker_binary(use_nvidia_docker: bool) -> str:
     return "nvidia-docker" if use_nvidia_docker else "docker"
 
 
-def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int) -> None:
+def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, use_cache: bool) -> str:
     """
     Build a container for the given platform
     :param platform: Platform
     :param docker_binary: docker binary to use (docker/nvidia-docker)
     :param registry: Dockerhub registry name
     :param num_retries: Number of retries to build the docker image
+    :param use_cache: will pass cache_from to docker to use the previously pulled tag
     :return: Id of the top level image
     """
-
     tag = get_docker_tag(platform=platform, registry=registry)
-    logging.info("Building container tagged '%s' with %s", tag, docker_binary)
+    logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
     #
     # We add a user with the same group as the executing non-root user so files created in the
     # container match permissions of the local user. Same for the group.
@@ -90,41 +175,29 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
     # cache-from is needed so we use the cached images tagged from the remote via
     # docker pull see: docker_cache.load_docker_cache
     #
+    # This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
+    # To use local caching instead, omit cache-from by passing the --no-dockerhub-cache argument
+    # to this script.
+    #
     # This doesn't work with multi head docker files.
-    # 
-
-    for i in range(num_retries):
-        logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries)
-
-        cmd = [docker_binary, "build",
-               "-f", get_dockerfile(platform),
-               "--build-arg", "USER_ID={}".format(os.getuid()),
-               "--build-arg", "GROUP_ID={}".format(os.getgid()),
-               "--cache-from", tag,
-               "-t", tag,
-               "docker"]
+    #
+    cmd = [docker_binary, "build",
+           "-f", get_dockerfile(platform),
+           "--build-arg", "USER_ID={}".format(os.getuid()),
+           "--build-arg", "GROUP_ID={}".format(os.getgid())]
+    if use_cache:
+        cmd.extend(["--cache-from", tag])
+    cmd.extend(["-t", tag, get_dockerfiles_path()])
+
+    @retry(subprocess.CalledProcessError, tries=num_retries)
+    def run_cmd():
         logging.info("Running command: '%s'", ' '.join(cmd))
-        try:
-            check_call(cmd)
-            # Docker build was successful. Call break to break out of the retry mechanism
-            break
-        except subprocess.CalledProcessError as e:
-            saved_exception = e
-            logging.error('Failed to build docker image')
-            # Building the docker image failed. Call continue to trigger the retry mechanism
-            continue
-    else:
-        # Num retries exceeded
-        logging.exception('Exception during build of docker image', saved_exception)
-        logging.fatal('Failed to build the docker image, aborting...')
-        sys.exit(1)
+        check_call(cmd)
 
+    run_cmd()
     # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
     # check_call would have failed
-    image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
-    if not image_id:
-        raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
-    return image_id
+    return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
 
 
 def _get_local_image_id(docker_binary, docker_tag):
@@ -134,14 +207,17 @@ def _get_local_image_id(docker_binary, docker_tag):
     :return: Image id as string or None if tag does not exist
     """
     cmd = [docker_binary, "images", "-q", docker_tag]
-    image_id_b = subprocess.check_output(cmd)
+    image_id_b = check_output(cmd)
     image_id = image_id_b.decode('utf-8').strip()
+    if not image_id:
+        raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag))
     return image_id
 
 
 def buildir() -> str:
     return os.path.join(get_mxnet_root(), "build")
 
+
 def default_ccache_dir() -> str:
     # Share ccache across containers
     if 'CCACHE_DIR' in os.environ:
@@ -158,15 +234,36 @@ def default_ccache_dir() -> str:
         return ccache_dir
     return os.path.join(tempfile.gettempdir(), "ci_ccache")
 
+def trim_container_id(cid):
+    return cid[:12]
 
 def container_run(platform: str,
-                  docker_binary: str,
+                  nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
-                  local_ccache_dir: str,
                   command: List[str],
+                  local_ccache_dir: str,
+                  cleanup: Cleanup,
                   dry_run: bool = False,
-                  interactive: bool = False) -> str:
+                  interactive: bool = False) -> int:
+    CONTAINER_WAIT_S = 600
+    #
+    # Environment setup
+    #
+    environment = {
+        'CCACHE_MAXSIZE': '500G',
+        'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not shared
+        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is mounted
+        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache verification.
+    }
+    # These variables are passed to the container so the process tree killer can find runaway processes inside the container.
+    # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
+    # https://github.com/jenkinsci/jenkins/blob/578d6bacb33a5e99f149de504c80275796f0b231/core/src/main/java/hudson/model/Run.java#L2393
+    #
+    JENKINS_ENV_VARS = ['BUILD_NUMBER', 'BUILD_ID', 'BUILD_TAG']
+    environment.update({k: os.environ[k] for k in JENKINS_ENV_VARS if k in os.environ})
+    environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
+
     tag = get_docker_tag(platform=platform, registry=docker_registry)
     mx_root = get_mxnet_root()
     local_build_folder = buildir()
@@ -174,46 +271,105 @@ def container_run(platform: str,
     os.makedirs(local_build_folder, exist_ok=True)
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
-    runlist = [docker_binary, 'run', '--rm', '-t',
+    docker_client = docker.from_env()
+    # Equivalent command line, logged so the run can be reproduced manually
+    docker_cmd_list = [get_docker_binary(nvidia_runtime), 'run',
+               '--rm',
                '--shm-size={}'.format(shared_memory_size),
                '-v', "{}:/work/mxnet".format(mx_root),  # mount mxnet root
                '-v', "{}:/work/build".format(local_build_folder),  # mount mxnet/build for storing build artifacts
                '-v', "{}:/work/ccache".format(local_ccache_dir),
                '-u', '{}:{}'.format(os.getuid(), os.getgid()),
-               '-e', 'CCACHE_MAXSIZE={}'.format(CCACHE_MAXSIZE),
-               '-e', 'CCACHE_TEMPDIR=/tmp/ccache',  # temp dir should be local and not shared
-               '-e', "CCACHE_DIR=/work/ccache",  # this path is inside the container as /work/ccache is mounted
-               '-e', "CCACHE_LOGFILE=/tmp/ccache.log",  # a container-scoped log, useful for ccache verification.
+               '-e', 'CCACHE_MAXSIZE={}'.format(environment['CCACHE_MAXSIZE']),
+               '-e', 'CCACHE_TEMPDIR={}'.format(environment['CCACHE_TEMPDIR']),  # temp dir should be local and not shared
+               '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),  # this path is inside the container as /work/ccache is mounted
+               '-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),  # a container-scoped log, useful for ccache verification.
+               '-ti',
                tag]
-    runlist.extend(command)
-    cmd = '\\\n\t'.join(runlist)
-    ret = 0
-    if not dry_run and not interactive:
-        logging.info("Running %s in container %s", command, tag)
-        logging.info("Executing:\n%s\n", cmd)
-        ret = call(runlist)
-
-    docker_run_cmd = ' '.join(runlist)
-    if not dry_run and interactive:
-        into_cmd = deepcopy(runlist)
-        # -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument
-        idx = into_cmd.index('-u') + 2
-        into_cmd[idx:idx] = ['-ti']
-        cmd = '\\\n\t'.join(into_cmd)
-        logging.info("Executing:\n%s\n", cmd)
-        docker_run_cmd = ' '.join(into_cmd)
-        ret = call(into_cmd)
-
-    if not dry_run and not interactive and ret != 0:
-        logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd)
-        logging.error("You can get into the container by adding the -i option")
-        raise subprocess.CalledProcessError(ret, cmd)
-
-    return docker_run_cmd
+    docker_cmd_list.extend(command)
+    docker_cmd = ' \\\n\t'.join(docker_cmd_list)
+    logging.info("Running %s in container %s", command, tag)
+    logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
+    ret = 0 # return code of the command inside docker
+    if not dry_run:
+        #############################
+        #
+        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
+        if nvidia_runtime:
+            runtime = 'nvidia'
+        else:
+            # runc is the default (docker info | grep -i runtime)
+            runtime = None
+
+        container = docker_client.containers.run(
+            tag,
+            runtime=runtime,
+            detach=True,
+            command=command,
+            # auto_remove is not used: the container is stopped and removed explicitly below
+            shm_size=shared_memory_size,
+            user='{}:{}'.format(os.getuid(), os.getgid()),
+            volumes={
+                mx_root:
+                    {'bind': '/work/mxnet', 'mode': 'rw'},
+                local_build_folder:
+                    {'bind': '/work/build', 'mode': 'rw'},
+                local_ccache_dir:
+                    {'bind': '/work/ccache', 'mode': 'rw'},
+            },
+            environment=environment)
+        logging.info("Started container: %s", trim_container_id(container.id))
+        # Race condition:
+        # If the previous call is interrupted, it's possible that the container is not cleaned up.
+        # We avoid this by masking the signals temporarily.
+        cleanup.add_container(container)
+        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
+        #
+        #############################
+
+        stream = container.logs(stream=True, stdout=True, stderr=True)
+        sys.stdout.flush()
+        for chunk in stream:
+            sys.stdout.buffer.write(chunk)
+            sys.stdout.buffer.flush()
+        sys.stdout.flush()
+        stream.close()
+        try:
+            logging.info("Waiting for status of container %s for %d s.", trim_container_id(container.id), CONTAINER_WAIT_S)
+            wait_result = container.wait(timeout=CONTAINER_WAIT_S)
+            logging.info("Container exit status: %s", wait_result)
+            ret = wait_result.get('StatusCode', 200)
+        except Exception as e:
+            logging.exception(e)
+            ret = 150
+
+        # Stop
+        try:
+            logging.info("Stopping container: %s", trim_container_id(container.id))
+            container.stop()
+        except Exception as e:
+            logging.exception(e)
+            ret = 151
+
+        # Remove
+        try:
+            logging.info("Removing container: %s", trim_container_id(container.id))
+            container.remove()
+        except Exception as e:
+            logging.exception(e)
+            ret = 152
+        cleanup.remove_container(container)
+        containers = docker_client.containers.list()
+        if containers:
+            logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+    return ret
 
 
 def list_platforms() -> str:
-    print("\nSupported platforms:\n{}".format('\n'.join(get_platforms())))
+    return "\nSupported platforms:\n{}".format('\n'.join(get_platforms()))
+
 
 def load_docker_cache(tag, docker_registry) -> None:
     if docker_registry:
@@ -221,24 +377,34 @@ def load_docker_cache(tag, docker_registry) -> None:
             import docker_cache
             logging.info('Docker cache download is enabled from registry %s', docker_registry)
             docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag)
+        # noinspection PyBroadException
         except Exception:
             logging.exception('Unable to retrieve Docker cache. Continue without...')
     else:
         logging.info('Distributed docker cache disabled')
 
-def main() -> int:
-    # We need to be in the same directory than the script so the commands in the dockerfiles work as
-    # expected. But the script can be invoked from a different path
-    base = os.path.split(os.path.realpath(__file__))[0]
-    os.chdir(base)
 
-    logging.getLogger().setLevel(logging.INFO)
+def log_environment():
+    instance_id = ec2_instance_id_hostname()
+    if instance_id:
+        logging.info("EC2 Instance id: %s", instance_id)
+    pp = pprint.PrettyPrinter(indent=4)
+    logging.debug("Build environment: %s", pp.pformat(dict(os.environ)))
+
+
+def script_name() -> str:
+    return os.path.split(sys.argv[0])[1]
 
-    def script_name() -> str:
-        return os.path.split(sys.argv[0])[1]
 
+def main() -> int:
+    logging.getLogger().setLevel(logging.INFO)
+    logging.getLogger("requests").setLevel(logging.WARNING)
     logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name()))
 
+    logging.info("MXNet container-based build tool.")
+    log_environment()
+    chdir_to_script_directory()
+
     parser = argparse.ArgumentParser(description="""Utility for building and testing MXNet on docker
     containers""", epilog="")
     parser.add_argument("-p", "--platform",
@@ -284,8 +450,10 @@ def script_name() -> str:
                         default=1,
                         type=int)
 
-    parser.add_argument("-c", "--cache", action="store_true",
-                        help="Enable docker registry cache")
+    parser.add_argument("-c", "--no-dockerhub-cache", action="store_true",
+                        help="Disables use of the --cache-from option during docker build, allowing"
+                        " docker to use local layers for caching. If absent, the cache from"
+                        " DockerHub is used, which is the default.")
 
     parser.add_argument("command",
                         help="command to run in the container",
@@ -297,46 +465,65 @@ def script_name() -> str:
                         type=str)
 
     args = parser.parse_args()
+
     def use_cache():
-        return args.cache or under_ci()
+        return not args.no_dockerhub_cache or under_ci()
 
     command = list(chain(*args.command))
     docker_binary = get_docker_binary(args.nvidiadocker)
-    shared_memory_size = args.shared_memory_size
-    num_docker_build_retires = args.docker_build_retries
+
+    # Cleanup on signals and exit
+    cleanup = Cleanup()
+    def signal_handler(signum, _):
+        signal.pthread_sigmask(signal.SIG_BLOCK, {signum})
+        logging.warning("Signal %d received, cleaning up...", signum)
+        cleanup()
+        logging.warning("done. Exiting with error.")
+        sys.exit(1)
+
+    atexit.register(cleanup)
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
 
     if args.list:
-        list_platforms()
+        print(list_platforms())
     elif args.platform:
         platform = args.platform
         tag = get_docker_tag(platform=platform, registry=args.docker_registry)
         if use_cache():
             load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-        build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires)
+        build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
+                     num_retries=args.docker_build_retries, use_cache=use_cache())
         if args.build_only:
             logging.warning("Container was just built. Exiting due to build-only.")
             return 0
 
+        ret = 0
         if command:
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=command, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir, interactive=args.interactive)
+            ret = container_run(platform=platform, nvidia_runtime=args.nvidiadocker,
+                shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
+                local_ccache_dir=args.ccache_dir, interactive=args.interactive, cleanup=cleanup)
         elif args.print_docker_run:
-            print(container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                                command=[], dry_run=True, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir))
+            ret = container_run(platform=platform, nvidia_runtime=args.nvidiadocker,
+                                shared_memory_size=args.shared_memory_size, command=[], dry_run=True,
+                                docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir,
+                                cleanup=cleanup)
+            command = []
         elif args.interactive:
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
+            ret = container_run(platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size,
                           command=command, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir, interactive=args.interactive)
-
+                          local_ccache_dir=args.ccache_dir, interactive=args.interactive, cleanup=cleanup)
         else:
             # With no commands, execute a build function for the target platform
             assert not args.interactive, "when running with -i must provide a command"
-            cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
-            logging.info("No command specified, trying default build: %s", ' '.join(cmd))
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=args.docker_registry,
-                          local_ccache_dir=args.ccache_dir)
+
+            command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
+            logging.info("No command specified, trying default build: %s", ' '.join(command))
+            ret = container_run(platform=platform, nvidia_runtime=args.nvidiadocker, shared_memory_size=args.shared_memory_size,
+                command=command, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir, cleanup=cleanup)
+
+        if ret != 0:
+            logging.critical("Execution of %s failed with status: %d", command, ret)
+            return ret
 
     elif args.all:
         platforms = get_platforms()
@@ -346,15 +533,21 @@ def use_cache():
             tag = get_docker_tag(platform=platform, registry=args.docker_registry)
             if use_cache():
                 load_docker_cache(tag=tag, docker_registry=args.docker_registry)
-            build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires)
+            build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries,
+                         use_cache=use_cache())
             if args.build_only:
                 continue
-            build_platform = "build_{}".format(platform)
-            cmd = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
             shutil.rmtree(buildir(), ignore_errors=True)
-            container_run(platform=platform, docker_binary=docker_binary, shared_memory_size=shared_memory_size,
-                          command=cmd, docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir)
-            plat_buildir = os.path.join(get_mxnet_root(), build_platform)
+            build_platform = "build_{}".format(platform)
+            plat_buildir = os.path.abspath(os.path.join(get_mxnet_root(), '..',
+                                                        "mxnet_{}".format(build_platform)))
+            if os.path.exists(plat_buildir):
+                logging.warning("%s already exists, skipping", plat_buildir)
+                continue
+            command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
+            # BUILD in docker
+            container_run(platform=platform, nvidia_runtime=args.nvidiadocker,
+                          shared_memory_size=args.shared_memory_size, command=command,
+                          docker_registry=args.docker_registry, local_ccache_dir=args.ccache_dir,
+                          cleanup=cleanup)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
 
@@ -383,7 +576,7 @@ def use_cache():
 
 ./build.py -a
 
-    Builds for all platforms and leaves artifacts in build_<platform>
+    Builds for all platforms and leaves artifacts in build_<platform>.
 
     """)
 
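For reviewers, the container lifecycle that the new build.py enforces can be
condensed into a standalone sketch (illustrative only: it assumes the docker
Python SDK used in the diff above, and image/command are placeholder values,
not names from this PR):

import atexit
import signal
import sys

import docker

_containers = set()

def _cleanup():
    # Stop and remove every container we started; swallow per-container
    # errors so one failure doesn't leak the rest.
    for container in list(_containers):
        try:
            container.stop(timeout=3)
            container.remove()
        except Exception:
            pass
    _containers.clear()

def _signal_handler(signum, _frame):
    _cleanup()
    sys.exit(1)

atexit.register(_cleanup)
signal.signal(signal.SIGTERM, _signal_handler)
signal.signal(signal.SIGINT, _signal_handler)

def run_in_container(image: str, command) -> int:
    client = docker.from_env()
    # Mask SIGINT/SIGTERM so an interrupt can't land between starting the
    # container and registering it for cleanup (the race noted in the diff).
    signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
    container = client.containers.run(image, command, detach=True)
    _containers.add(container)
    signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
    status = container.wait()
    container.remove()
    _containers.discard(container)
    return status.get('StatusCode', 1)
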
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1e38ec48e6c..d3c1411b438 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -567,6 +567,9 @@ build_ubuntu_gpu_cmake() {
     ninja -v
 }
 
+build_ubuntu_blc() {
+    echo "pass"
+}
 
 # Testing
 
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index 7a6d1106d38..bebcb25fb8f 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -30,6 +30,7 @@
 import sys
 import subprocess
 import json
+from typing import *
 import build as build_util
 
 
@@ -59,7 +60,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
     return 1 if is_error else 0
 
 
-def _build_save_container(platform, registry, load_cache) -> str:
+def _build_save_container(platform, registry, load_cache) -> Optional[str]:
     """
     Build image for passed platform and upload the cache to the specified S3 bucket
     :param platform: Platform
@@ -77,7 +78,7 @@ def _build_save_container(platform, registry, load_cache) -> str:
     logging.debug('Building %s as %s', platform, docker_tag)
     try:
         # Increase the number of retries for building the cache.
-        image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10)
+        image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, use_cache=True)
         logging.info('Built %s as %s', docker_tag, image_id)
 
         # Push cache to registry
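
To make the num_retries=10 above concrete: with the @retry decorator that
build_docker now uses, num_retries is the total number of tries, not the
number of retries after the first failure. A small worked example of the
resulting backoff schedule (not code from this PR):

# tries=10, delay_s=1, backoff=2: there are 9 sleeps between the 10
# attempts, and the final attempt's exception propagates to the caller.
delays, d = [], 1
for _ in range(10 - 1):
    delays.append(d)
    d *= 2
print(delays)  # [1, 2, 4, 8, 16, 32, 64, 128, 256]
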
diff --git a/ci/util.py b/ci/util.py
index 22631f30435..98605bedf76 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -17,6 +17,7 @@
 
 import os
 import contextlib
+import logging
+import requests
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -41,3 +42,73 @@ def remember_cwd():
     finally: os.chdir(curdir)
 
 
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+    """Retry calling the decorated function using an exponential backoff.
+
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+    :param target_exception: the exception to check. may be a tuple of
+        exceptions to check
+    :type target_exception: Exception or tuple
+    :param tries: number of times to try (not retry) before giving up
+    :type tries: int
+    :param delay_s: initial delay between retries in seconds
+    :type delay_s: int
+    :param backoff: backoff multiplier e.g. value of 2 will double the delay
+        each retry
+    :type backoff: int
+    """
+    import time
+    from functools import wraps
+
+    def decorated_retry(f):
+        @wraps(f)
+        def f_retry(*args, **kwargs):
+            mtries, mdelay = tries, delay_s
+            while mtries > 1:
+                try:
+                    return f(*args, **kwargs)
+                except target_exception as e:
+                    logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay)
+                    time.sleep(mdelay)
+                    mtries -= 1
+                    mdelay *= backoff
+            return f(*args, **kwargs)
+
+        return f_retry  # true decorator
+
+    return decorated_retry
+
+
+# noinspection SyntaxError
+def under_ci() -> bool:
+    """:return: True if we run in Jenkins."""
+    return 'JOB_NAME' in os.environ
+
+
+def ec2_instance_id_hostname() -> str:
+    if under_ci():
+        result = []
+        try:
+            r = requests.get("http://instance-data/latest/meta-data/instance-id")
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            r = requests.get("http://instance-data/latest/meta-data/public-hostname")
+            if r.status_code == 200:
+                result.append(r.content.decode())
+            return ' '.join(result)
+        except requests.exceptions.ConnectionError:
+            pass
+        return '?'
+    else:
+        return ''
+
+
+def chdir_to_script_directory():
+    # We need to be in the same directory as the script so the commands in the dockerfiles work
+    # as expected, but the script can be invoked from a different path.
+    base = os.path.split(os.path.realpath(__file__))[0]
+    os.chdir(base)
+
+
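
A hypothetical usage sketch tying the new util.py helpers together (the
function name instance_banner is illustrative, not part of the PR): the
EC2 metadata lookup is exactly the kind of flaky network call the retry
decorator is meant for.

import logging
from util import retry, ec2_instance_id_hostname, under_ci

@retry(Exception, tries=3, delay_s=1, backoff=2)
def instance_banner() -> str:
    # '<instance-id> <public-hostname>' under CI, '' outside CI.
    return ec2_instance_id_hostname()

if under_ci():
    logging.info("Running on EC2 instance: %s", instance_banner())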
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services