You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ma...@apache.org on 2018/11/07 09:14:13 UTC

[incubator-mxnet] branch master updated: Fix docker cleanup race condition (#13092)

This is an automated email from the ASF dual-hosted git repository.

marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new f8052e4  Fix docker cleanup race condition (#13092)
f8052e4 is described below

commit f8052e4261238ff6c93465b3f0d0f22457f127ce
Author: Chance Bair <ch...@gmail.com>
AuthorDate: Wed Nov 7 10:13:50 2018 +0100

    Fix docker cleanup race condition (#13092)
---
 ci/build.py | 92 +++++++++++++++++++++++++++++++------------------------------
 1 file changed, 47 insertions(+), 45 deletions(-)

diff --git a/ci/build.py b/ci/build.py
index e2554d9..8f3fe2d 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -281,7 +281,6 @@ def container_run(platform: str,
             # noinspection PyShadowingNames
             # runc is default (docker info | grep -i runtime)
             runtime = 'nvidia'
-
         container = docker_client.containers.run(
             tag,
             runtime=runtime,
@@ -299,52 +298,55 @@ def container_run(platform: str,
                     {'bind': '/work/ccache', 'mode': 'rw'},
             },
             environment=environment)
-        logging.info("Started container: %s", trim_container_id(container.id))
-        # Race condition:
-        # If the previous call is interrupted then it's possible that the container is not cleaned up
-        # We avoid by masking the signals temporarily
-        cleanup.add_container(container)
-        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
-        #
-        #############################
-
-        stream = container.logs(stream=True, stdout=True, stderr=True)
-        sys.stdout.flush()
-        for chunk in stream:
-            sys.stdout.buffer.write(chunk)
-            sys.stdout.buffer.flush()
-        sys.stdout.flush()
-        stream.close()
-        try:
-            logging.info("Waiting for status of container %s for %d s.",
-                         trim_container_id(container.id),
-                         container_wait_s)
-            wait_result = container.wait(timeout=container_wait_s)
-            logging.info("Container exit status: %s", wait_result)
-            ret = wait_result.get('StatusCode', 200)
-        except Exception as e:
-            logging.exception(e)
-            ret = 150
-
-        # Stop
         try:
-            logging.info("Stopping container: %s", trim_container_id(container.id))
-            container.stop()
-        except Exception as e:
-            logging.exception(e)
-            ret = 151
+            logging.info("Started container: %s", trim_container_id(container.id))
+            # Race condition:
+            # If the previous call is interrupted then it's possible that the container is not cleaned up
+            # We avoid by masking the signals temporarily
+            cleanup.add_container(container)
+            signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
+            #
+            #############################
+
+            stream = container.logs(stream=True, stdout=True, stderr=True)
+            sys.stdout.flush()
+            for chunk in stream:
+                sys.stdout.buffer.write(chunk)
+                sys.stdout.buffer.flush()
+            sys.stdout.flush()
+            stream.close()
+            try:
+                logging.info("Waiting for status of container %s for %d s.",
+                            trim_container_id(container.id),
+                            container_wait_s)
+                wait_result = container.wait(timeout=container_wait_s)
+                logging.info("Container exit status: %s", wait_result)
+                ret = wait_result.get('StatusCode', 200)
+            except Exception as e:
+                logging.exception(e)
+                ret = 150
 
-        # Remove
-        try:
-            logging.info("Removing container: %s", trim_container_id(container.id))
-            container.remove()
-        except Exception as e:
-            logging.exception(e)
-            ret = 152
-        cleanup.remove_container(container)
-        containers = docker_client.containers.list()
-        if containers:
-            logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+            # Stop
+            try:
+                logging.info("Stopping container: %s", trim_container_id(container.id))
+                container.stop()
+            except Exception as e:
+                logging.exception(e)
+                ret = 151
+
+            # Remove
+            try:
+                logging.info("Removing container: %s", trim_container_id(container.id))
+                container.remove()
+            except Exception as e:
+                logging.exception(e)
+                ret = 152
+            cleanup.remove_container(container)
+            containers = docker_client.containers.list()
+            if containers:
+                logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+        except docker.errors.NotFound as e:
+            logging.info("Container was stopped before cleanup started: %s", e)
     return ret