You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jo...@apache.org on 2016/04/08 21:25:51 UTC

mesos git commit: Added timeout for destroying Docker containers.

Repository: mesos
Updated Branches:
  refs/heads/master 48292302a -> d01a6f8d4


Added timeout for destroying Docker containers.

Commands issued to the Docker daemon can hang, causing problems within
Mesos. For example a hanging 'docker stop' can result in an unresponsive
executor, causing the Mesos agent to issue an a 'docker stop' itself
which might result in an unresponsive agent (see MESOS-4673).
Adding a timeout can be used as a workaround.

Review: https://reviews.apache.org/r/44571/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d01a6f8d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d01a6f8d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d01a6f8d

Branch: refs/heads/master
Commit: d01a6f8d4bae38c7a1f56f87254346f7af3efc87
Parents: 4829230
Author: Jan Schlicht <ja...@mesosphere.io>
Authored: Fri Apr 8 14:43:06 2016 -0400
Committer: Joris Van Remoortere <jo...@gmail.com>
Committed: Fri Apr 8 15:25:39 2016 -0400

----------------------------------------------------------------------
 src/slave/constants.hpp            |  5 +++++
 src/slave/containerizer/docker.cpp | 39 +++++++++++++++++++++++++++++++++
 src/slave/containerizer/docker.hpp |  4 ++++
 3 files changed, 48 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index 449c8cd..9978c11 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -106,6 +106,11 @@ constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;
 // TODO(tnachen): Make this a flag.
 constexpr Duration DOCKER_VERSION_WAIT_TIMEOUT = Seconds(5);
 
+// Additional duration that docker containerizer will wait beyond the
+// configured `docker_stop_timeout` for docker stop to succeed, before
+// trying to kill the process by itself.
+constexpr Duration DOCKER_FORCE_KILL_TIMEOUT = Seconds(1);
+
 // Name of the default, CRAM-MD5 authenticatee.
 constexpr char DEFAULT_AUTHENTICATEE[] = "crammd5";
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 5755eff..9c24227 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1850,7 +1850,12 @@ void DockerContainerizerProcess::_destroy(
     // a containerizer responsibility; it is the responsibility of the agent
     // in co-operation with the executor. Once `destroy()` is called, the
     // container should be destroyed forcefully.
+    // The `after` fallback should remain as a precaution against the docker
+    // stop command hanging.
     docker->stop(container->name(), flags.docker_stop_timeout)
+      .after(
+          flags.docker_stop_timeout + DOCKER_FORCE_KILL_TIMEOUT,
+          defer(self(), &Self::destroyTimeout, containerId, lambda::_1))
       .onAny(defer(self(), &Self::__destroy, containerId, killed, lambda::_1));
   } else {
     __destroy(containerId, killed, Nothing());
@@ -1943,6 +1948,40 @@ void DockerContainerizerProcess::___destroy(
 }
 
 
+Future<Nothing> DockerContainerizerProcess::destroyTimeout(
+    const ContainerID& containerId,
+    Future<Nothing> future)
+{
+  CHECK(containers_.contains(containerId));
+
+  LOG(WARNING) << "Docker stop timed out for "
+               << "container '" << containerId << "'";
+
+  Container* container = containers_[containerId];
+
+  // A hanging `docker stop` could be a problem with docker or even a kernel
+  // bug. Assuming that this is a docker problem, circumventing docker and
+  // killing the process run by it ourselves might help here.
+  if (container->pid.isSome()) {
+    LOG(WARNING) << "Sending SIGKILL to process with pid "
+                 << container->pid.get();
+
+    Try<list<os::ProcessTree>> kill =
+      os::killtree(container->pid.get(), SIGKILL);
+
+    if (kill.isError()) {
+      // Ignoring the error from killing process as it can already
+      // have exited.
+      VLOG(1) << "Ignoring error when killing process pid "
+              << container->executorPid.get() << " in destroy, error: "
+              << kill.error();
+    }
+  }
+
+  return future;
+}
+
+
 Future<hashset<ContainerID>> DockerContainerizerProcess::containers()
 {
   return containers_.keys();

http://git-wip-us.apache.org/repos/asf/mesos/blob/d01a6f8d/src/slave/containerizer/docker.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.hpp b/src/slave/containerizer/docker.hpp
index 3567321..18c15a5 100644
--- a/src/slave/containerizer/docker.hpp
+++ b/src/slave/containerizer/docker.hpp
@@ -216,6 +216,10 @@ private:
       bool killed,
       const process::Future<Option<int>>& status);
 
+  process::Future<Nothing> destroyTimeout(
+      const ContainerID& containerId,
+      process::Future<Nothing> future);
+
   process::Future<Nothing> _update(
       const ContainerID& containerId,
       const Resources& resources,