You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/03/07 09:17:16 UTC

[12/12] mesos git commit: Added inspect retries to the Docker executor.

Added inspect retries to the Docker executor.

This patch adds retries for `inspect` command to workaround docker
daemon hangs. We assume that the docker daemon can be temporarily
unresponsive. If it's unresponsive, then any started docker cli
command hangs. To address the issue, we retry `inspect` in the loop.

Review: https://reviews.apache.org/r/65759/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8d6a5f98
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8d6a5f98
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8d6a5f98

Branch: refs/heads/1.3.x
Commit: 8d6a5f98597eabce6fbeca4ec4aa12de567a42da
Parents: ad74b04
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:39:05 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:15:00 2018 -0800

----------------------------------------------------------------------
 src/docker/executor.cpp | 46 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/8d6a5f98/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index aadbf5f..44d8212 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,8 +22,10 @@
 #include <mesos/executor.hpp>
 #include <mesos/mesos.hpp>
 
+#include <process/collect.hpp>
 #include <process/delay.hpp>
 #include <process/id.hpp>
+#include <process/loop.hpp>
 #include <process/owned.hpp>
 #include <process/process.hpp>
 #include <process/protobuf.hpp>
@@ -200,13 +202,46 @@ public:
 
     run->onAny(defer(self(), &Self::reaped, lambda::_1));
 
+    // Since the Docker daemon might hang, we have to retry the inspect command.
+    auto inspectLoop = loop(
+        self(),
+        [=]() {
+          return await(
+              docker->inspect(containerName, DOCKER_INSPECT_DELAY)
+                .after(
+                    DOCKER_INSPECT_TIMEOUT,
+                    [=](Future<Docker::Container> future) {
+                      LOG(WARNING) << "Docker inspect timed out after "
+                                   << DOCKER_INSPECT_TIMEOUT
+                                   << " for container "
+                                   << "'" << containerName << "'";
+
+                      // We need to clean up the hanging Docker CLI process.
+                      // Discarding the inspect future triggers a callback in
+                      // the Docker library that kills the subprocess and
+                      // transitions the future.
+                      future.discard();
+                      return future;
+                    }));
+        },
+        [](const Future<Docker::Container>& future)
+            -> Future<ControlFlow<Docker::Container>> {
+          if (future.isReady()) {
+            return Break(future.get());
+          }
+          if (future.isFailed()) {
+            return Failure(future.failure());
+          }
+          return Continue();
+        });
+
     // Delay sending TASK_RUNNING status update until we receive
     // inspect output. Note that we store a future that completes
     // after the sending of the running update. This allows us to
     // ensure that the terminal update is sent after the running
     // update (see `reaped()`).
-    inspect = docker->inspect(containerName, DOCKER_INSPECT_DELAY)
-      .then(defer(self(), [=](const Docker::Container& container) {
+    inspect =
+      inspectLoop.then(defer(self(), [=](const Docker::Container& container) {
         if (!killed) {
           containerPid = container.pid;
 
@@ -293,13 +328,6 @@ public:
         return Nothing();
       }));
 
-    inspect
-      .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) {
-        LOG(WARNING) << "Docker inspect has not finished after "
-                     << DOCKER_INSPECT_TIMEOUT;
-        return inspect;
-      });
-
     inspect.onFailed(defer(self(), [=](const string& failure) {
       cerr << "Failed to inspect container '" << containerName << "'"
            << ": " << failure << endl;