You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jo...@apache.org on 2016/04/08 21:25:51 UTC
mesos git commit: Added timeout for destroying Docker containers.
Repository: mesos
Updated Branches:
refs/heads/master 48292302a -> d01a6f8d4
Added timeout for destroying Docker containers.
Commands issued to the Docker daemon can hang, causing problems within
Mesos. For example a hanging 'docker stop' can result in an unresponsive
executor, causing the Mesos agent to issue an a 'docker stop' itself
which might result in an unresponsive agent (see MESOS-4673).
Adding a timeout can be used as a workaround.
Review: https://reviews.apache.org/r/44571/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d01a6f8d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d01a6f8d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d01a6f8d
Branch: refs/heads/master
Commit: d01a6f8d4bae38c7a1f56f87254346f7af3efc87
Parents: 4829230
Author: Jan Schlicht <ja...@mesosphere.io>
Authored: Fri Apr 8 14:43:06 2016 -0400
Committer: Joris Van Remoortere <jo...@gmail.com>
Committed: Fri Apr 8 15:25:39 2016 -0400
----------------------------------------------------------------------
src/slave/constants.hpp | 5 +++++
src/slave/containerizer/docker.cpp | 39 +++++++++++++++++++++++++++++++++
src/slave/containerizer/docker.hpp | 4 ++++
3 files changed, 48 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 449c8cd..9978c11 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -106,6 +106,11 @@ constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;
// TODO(tnachen): Make this a flag.
constexpr Duration DOCKER_VERSION_WAIT_TIMEOUT = Seconds(5);
+// Additional duration that docker containerizer will wait beyond the
+// configured `docker_stop_timeout` for docker stop to succeed, before
+// trying to kill the process by itself.
+constexpr Duration DOCKER_FORCE_KILL_TIMEOUT = Seconds(1);
+
// Name of the default, CRAM-MD5 authenticatee.
constexpr char DEFAULT_AUTHENTICATEE[] = "crammd5";
http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 5755eff..9c24227 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1850,7 +1850,12 @@ void DockerContainerizerProcess::_destroy(
// a containerizer responsibility; it is the responsibility of the agent
// in co-operation with the executor. Once `destroy()` is called, the
// container should be destroyed forcefully.
+ // The `after` fallback should remain as a precaution against the docker
+ // stop command hanging.
docker->stop(container->name(), flags.docker_stop_timeout)
+ .after(
+ flags.docker_stop_timeout + DOCKER_FORCE_KILL_TIMEOUT,
+ defer(self(), &Self::destroyTimeout, containerId, lambda::_1))
.onAny(defer(self(), &Self::__destroy, containerId, killed, lambda::_1));
} else {
__destroy(containerId, killed, Nothing());
@@ -1943,6 +1948,40 @@ void DockerContainerizerProcess::___destroy(
}
+Future<Nothing> DockerContainerizerProcess::destroyTimeout(
+ const ContainerID& containerId,
+ Future<Nothing> future)
+{
+ CHECK(containers_.contains(containerId));
+
+ LOG(WARNING) << "Docker stop timed out for "
+ << "container '" << containerId << "'";
+
+ Container* container = containers_[containerId];
+
+ // A hanging `docker stop` could be a problem with docker or even a kernel
+ // bug. Assuming that this is a docker problem, circumventing docker and
+ // killing the process run by it ourselves might help here.
+ if (container->pid.isSome()) {
+ LOG(WARNING) << "Sending SIGKILL to process with pid "
+ << container->pid.get();
+
+ Try<list<os::ProcessTree>> kill =
+ os::killtree(container->pid.get(), SIGKILL);
+
+ if (kill.isError()) {
+ // Ignoring the error from killing process as it can already
+ // have exited.
+ VLOG(1) << "Ignoring error when killing process pid "
+ << container->executorPid.get() << " in destroy, error: "
+ << kill.error();
+ }
+ }
+
+ return future;
+}
+
+
Future<hashset<ContainerID>> DockerContainerizerProcess::containers()
{
return containers_.keys();
http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/containerizer/docker.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.hpp b/src/slave/containerizer/docker.hpp
index 3567321..18c15a5 100644
--- a/src/slave/containerizer/docker.hpp
+++ b/src/slave/containerizer/docker.hpp
@@ -216,6 +216,10 @@ private:
bool killed,
const process::Future<Option<int>>& status);
+ process::Future<Nothing> destroyTimeout(
+ const ContainerID& containerId,
+ process::Future<Nothing> future);
+
process::Future<Nothing> _update(
const ContainerID& containerId,
const Resources& resources,