You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/03/02 23:56:44 UTC
[4/4] mesos git commit: Added inspect retries to the docker
containerizer in `update` method.
Added inspect retries to the docker containerizer in `update` method.
This patch fixes the bug when a terminal status update is never sent
after termination of the docker executor, when the Docker daemon hangs
for `inspect` command. A terminal status update is postponed until
containerizer `update` completes that uses `inspect` command to get
a PID of the docker container. To address the issue, we retry `inspect`
in the loop.
Review: https://reviews.apache.org/r/65868/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/76c38f9d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/76c38f9d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/76c38f9d
Branch: refs/heads/master
Commit: 76c38f9d03ee6854e6bcd00a959d697472e0ea58
Parents: 84c3b4c
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:39:09 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Fri Mar 2 15:40:31 2018 -0800
----------------------------------------------------------------------
src/slave/constants.hpp | 3 +++
src/slave/containerizer/docker.cpp | 42 ++++++++++++++++++++++++++++++++-
2 files changed, 44 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/76c38f9d/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 030fb05..b79c084 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -135,6 +135,9 @@ constexpr Duration DOCKER_REMOVE_DELAY = Hours(6);
// container.
constexpr Duration DOCKER_INSPECT_DELAY = Seconds(1);
+// Default duration to wait for `inspect` command completion.
+constexpr Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+
// Default maximum number of docker inspect calls docker ps will invoke
// in parallel to prevent hitting system's open file descriptor limit.
constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;
http://git-wip-us.apache.org/repos/asf/mesos/blob/76c38f9d/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 1f4eeb4..31d64a7 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1711,7 +1711,47 @@ Future<Nothing> DockerContainerizerProcess::update(
return __update(containerId, _resources, container->pid.get());
}
- return docker->inspect(containers_.at(containerId)->containerName)
+ string containerName = containers_.at(containerId)->containerName;
+
+ // Since the Docker daemon might hang, we have to retry the inspect command.
+ //
+ // NOTE: This code is duplicated from the built-in docker executor, but
+ // the retry interval is not passed to `inspect`, because the container might
+ // be terminated.
+ // TODO(abudnik): Consider using a class helper for retrying docker commands.
+ auto inspectLoop = loop(
+ self(),
+ [=]() {
+ return await(
+ docker->inspect(containerName)
+ .after(
+ slave::DOCKER_INSPECT_TIMEOUT,
+ [=](Future<Docker::Container> future) {
+ LOG(WARNING) << "Docker inspect timed out after "
+ << slave::DOCKER_INSPECT_TIMEOUT
+ << " for container "
+ << "'" << containerName << "'";
+
+ // We need to clean up the hanging Docker CLI process.
+ // Discarding the inspect future triggers a callback in
+ // the Docker library that kills the subprocess and
+ // transitions the future.
+ future.discard();
+ return future;
+ }));
+ },
+ [](const Future<Docker::Container>& future)
+ -> Future<ControlFlow<Docker::Container>> {
+ if (future.isReady()) {
+ return Break(future.get());
+ }
+ if (future.isFailed()) {
+ return Failure(future.failure());
+ }
+ return Continue();
+ });
+
+ return inspectLoop
.then(defer(self(), &Self::_update, containerId, _resources, lambda::_1));
#else
return Nothing();