You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/11/07 09:13:52 UTC

[GitHub] marcoabreu closed pull request #13092: [MXNET-1193] Fix Docker Cleanup Race Condition

marcoabreu closed pull request #13092: [MXNET-1193] Fix Docker Cleanup Race Condition
URL: https://github.com/apache/incubator-mxnet/pull/13092
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (GitHub does not otherwise display it once the pull request is merged):

diff --git a/ci/build.py b/ci/build.py
index e2554d9b8ce..8f3fe2d1244 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -281,7 +281,6 @@ def container_run(platform: str,
             # noinspection PyShadowingNames
             # runc is default (docker info | grep -i runtime)
             runtime = 'nvidia'
-
         container = docker_client.containers.run(
             tag,
             runtime=runtime,
@@ -299,52 +298,55 @@ def container_run(platform: str,
                     {'bind': '/work/ccache', 'mode': 'rw'},
             },
             environment=environment)
-        logging.info("Started container: %s", trim_container_id(container.id))
-        # Race condition:
-        # If the previous call is interrupted then it's possible that the container is not cleaned up
-        # We avoid by masking the signals temporarily
-        cleanup.add_container(container)
-        signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
-        #
-        #############################
-
-        stream = container.logs(stream=True, stdout=True, stderr=True)
-        sys.stdout.flush()
-        for chunk in stream:
-            sys.stdout.buffer.write(chunk)
-            sys.stdout.buffer.flush()
-        sys.stdout.flush()
-        stream.close()
-        try:
-            logging.info("Waiting for status of container %s for %d s.",
-                         trim_container_id(container.id),
-                         container_wait_s)
-            wait_result = container.wait(timeout=container_wait_s)
-            logging.info("Container exit status: %s", wait_result)
-            ret = wait_result.get('StatusCode', 200)
-        except Exception as e:
-            logging.exception(e)
-            ret = 150
-
-        # Stop
         try:
-            logging.info("Stopping container: %s", trim_container_id(container.id))
-            container.stop()
-        except Exception as e:
-            logging.exception(e)
-            ret = 151
+            logging.info("Started container: %s", trim_container_id(container.id))
+            # Race condition:
+            # If the previous call is interrupted then it's possible that the container is not cleaned up
+            # We avoid by masking the signals temporarily
+            cleanup.add_container(container)
+            signal.pthread_sigmask(signal.SIG_UNBLOCK, {signal.SIGINT, signal.SIGTERM})
+            #
+            #############################
+
+            stream = container.logs(stream=True, stdout=True, stderr=True)
+            sys.stdout.flush()
+            for chunk in stream:
+                sys.stdout.buffer.write(chunk)
+                sys.stdout.buffer.flush()
+            sys.stdout.flush()
+            stream.close()
+            try:
+                logging.info("Waiting for status of container %s for %d s.",
+                            trim_container_id(container.id),
+                            container_wait_s)
+                wait_result = container.wait(timeout=container_wait_s)
+                logging.info("Container exit status: %s", wait_result)
+                ret = wait_result.get('StatusCode', 200)
+            except Exception as e:
+                logging.exception(e)
+                ret = 150
 
-        # Remove
-        try:
-            logging.info("Removing container: %s", trim_container_id(container.id))
-            container.remove()
-        except Exception as e:
-            logging.exception(e)
-            ret = 152
-        cleanup.remove_container(container)
-        containers = docker_client.containers.list()
-        if containers:
-            logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+            # Stop
+            try:
+                logging.info("Stopping container: %s", trim_container_id(container.id))
+                container.stop()
+            except Exception as e:
+                logging.exception(e)
+                ret = 151
+
+            # Remove
+            try:
+                logging.info("Removing container: %s", trim_container_id(container.id))
+                container.remove()
+            except Exception as e:
+                logging.exception(e)
+                ret = 152
+            cleanup.remove_container(container)
+            containers = docker_client.containers.list()
+            if containers:
+                logging.info("Other running containers: %s", [trim_container_id(x.id) for x in containers])
+        except docker.errors.NotFound as e:
+            logging.info("Container was stopped before cleanup started: %s", e)
     return ret
 
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services