You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/03/07 09:10:07 UTC

[10/10] mesos git commit: Added inspect retries to the docker containerizer in `update` method.

Added inspect retries to the docker containerizer in `update` method.

This patch fixes the bug when a terminal status update is never sent
after termination of the docker executor, when the Docker daemon hangs
for `inspect` command. A terminal status update is postponed until
containerizer `update` completes that uses `inspect` command to get
a PID of the docker container. To address the issue, we retry `inspect`
in the loop.

Review: https://reviews.apache.org/r/65868/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c8dd080a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c8dd080a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c8dd080a

Branch: refs/heads/1.5.x
Commit: c8dd080af92274eeab4a0d1937a9ff985000d61c
Parents: 811c5aa
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:39:09 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Wed Mar 7 01:08:13 2018 -0800

----------------------------------------------------------------------
 src/slave/constants.hpp            |  3 +++
 src/slave/containerizer/docker.cpp | 42 ++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c8dd080a/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 030fb05..b79c084 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -135,6 +135,9 @@ constexpr Duration DOCKER_REMOVE_DELAY = Hours(6);
 // container.
 constexpr Duration DOCKER_INSPECT_DELAY = Seconds(1);
 
+// Default duration to wait for `inspect` command completion.
+constexpr Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+
 // Default maximum number of docker inspect calls docker ps will invoke
 // in parallel to prevent hitting system's open file descriptor limit.
 constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;

http://git-wip-us.apache.org/repos/asf/mesos/blob/c8dd080a/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index e589c7e..9d425e0 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1711,7 +1711,47 @@ Future<Nothing> DockerContainerizerProcess::update(
     return __update(containerId, _resources, container->pid.get());
   }
 
-  return docker->inspect(containers_.at(containerId)->containerName)
+  string containerName = containers_.at(containerId)->containerName;
+
+  // Since the Docker daemon might hang, we have to retry the inspect command.
+  //
+  // NOTE: This code is duplicated from the built-in docker executor, but
+  // the retry interval is not passed to `inspect`, because the container might
+  // be terminated.
+  // TODO(abudnik): Consider using a class helper for retrying docker commands.
+  auto inspectLoop = loop(
+      self(),
+      [=]() {
+        return await(
+            docker->inspect(containerName)
+              .after(
+                  slave::DOCKER_INSPECT_TIMEOUT,
+                  [=](Future<Docker::Container> future) {
+                    LOG(WARNING) << "Docker inspect timed out after "
+                                 << slave::DOCKER_INSPECT_TIMEOUT
+                                 << " for container "
+                                 << "'" << containerName << "'";
+
+                    // We need to clean up the hanging Docker CLI process.
+                    // Discarding the inspect future triggers a callback in
+                    // the Docker library that kills the subprocess and
+                    // transitions the future.
+                    future.discard();
+                    return future;
+                  }));
+      },
+      [](const Future<Docker::Container>& future)
+          -> Future<ControlFlow<Docker::Container>> {
+        if (future.isReady()) {
+          return Break(future.get());
+        }
+        if (future.isFailed()) {
+          return Failure(future.failure());
+        }
+        return Continue();
+      });
+
+  return inspectLoop
     .then(defer(self(), &Self::_update, containerId, _resources, lambda::_1));
 #else
   return Nothing();