You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by al...@apache.org on 2017/08/10 23:26:17 UTC
[2/3] mesos git commit: Enabled retries for `killTask` in docker
executor.
Enabled retries for `killTask` in docker executor.
Previously, after `docker stop` command failure, docker executor
neither allowed a scheduler to retry `killTask` command, nor retried
`killTask` when task kill was triggered by a failed health check.
Review: https://reviews.apache.org/r/61530/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/67723f01
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/67723f01
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/67723f01
Branch: refs/heads/1.3.x
Commit: 67723f01b5ca3583725011c691e8ffd6b89a02e5
Parents: 5852756
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Thu Aug 10 18:53:03 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 00:09:04 2017 +0200
----------------------------------------------------------------------
src/docker/executor.cpp | 80 ++++++++++++++++++++++++++++++++------------
1 file changed, 59 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/67723f01/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 875b43c..c6f50fa 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,6 +22,7 @@
#include <mesos/executor.hpp>
#include <mesos/mesos.hpp>
+#include <process/delay.hpp>
#include <process/id.hpp>
#include <process/owned.hpp>
#include <process/process.hpp>
@@ -70,6 +71,7 @@ namespace docker {
const Duration DOCKER_INSPECT_DELAY = Milliseconds(500);
const Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+const Duration KILL_RETRY_INTERVAL = Seconds(5);
// Executor that is responsible to execute a docker container and
// redirect log output to configured stdout and stderr files. Similar
@@ -90,8 +92,9 @@ public:
bool cgroupsEnableCfs)
: ProcessBase(ID::generate("docker-executor")),
killed(false),
- killedByHealthCheck(false),
terminated(false),
+ killedByHealthCheck(false),
+ killingInProgress(false),
launcherDir(launcherDir),
docker(docker),
containerName(containerName),
@@ -355,8 +358,8 @@ private:
// the grace period if a new one is provided.
// Issue the kill signal if the container is running
- // and we haven't killed it yet.
- if (run.isSome() && !killed) {
+ // and kill attempt is not in progress.
+ if (run.isSome() && !killingInProgress) {
// We have to issue the kill after 'docker inspect' has
// completed, otherwise we may race with 'docker run'
// and docker may not know about the container. Note
@@ -375,7 +378,15 @@ private:
CHECK_SOME(taskId);
CHECK_EQ(taskId_, taskId.get());
- if (!terminated && !killed) {
+ if (!terminated && !killingInProgress) {
+ killingInProgress = true;
+
+ // Once the task has been transitioned to `killed`,
+ // there is no way back, even if the kill attempt
+ // failed. This also allows us to send TASK_KILLING
+ // only once, regardless of how many kill attempts
+ // have been made.
+ //
// Because we rely on `killed` to determine whether
// to send TASK_KILLED, we set `killed` only once the
// kill is issued. If we set it earlier we're more
@@ -383,23 +394,25 @@ private:
// signaled the container. Note that in general it's
// a race between signaling and the container
// terminating with a non-zero exit status.
- killed = true;
-
- // Send TASK_KILLING if the framework can handle it.
- if (protobuf::frameworkHasCapability(
- frameworkInfo.get(),
- FrameworkInfo::Capability::TASK_KILLING_STATE)) {
- // TODO(alexr): Use `protobuf::createTaskStatus()`
- // instead of manually setting fields.
- TaskStatus status;
- status.mutable_task_id()->CopyFrom(taskId.get());
- status.set_state(TASK_KILLING);
- driver.get()->sendStatusUpdate(status);
- }
+ if (!killed) {
+ killed = true;
- // Stop health checking the task.
- if (checker.get() != nullptr) {
- checker->pause();
+ // Send TASK_KILLING if the framework can handle it.
+ if (protobuf::frameworkHasCapability(
+ frameworkInfo.get(),
+ FrameworkInfo::Capability::TASK_KILLING_STATE)) {
+ // TODO(alexr): Use `protobuf::createTaskStatus()`
+ // instead of manually setting fields.
+ TaskStatus status;
+ status.mutable_task_id()->CopyFrom(taskId.get());
+ status.set_state(TASK_KILLING);
+ driver.get()->sendStatusUpdate(status);
+ }
+
+ // Stop health checking the task.
+ if (checker.get() != nullptr) {
+ checker->pause();
+ }
}
// TODO(bmahler): Replace this with 'docker kill' so
@@ -407,9 +420,32 @@ private:
// a `KillPolicy` override.
stop = docker->stop(containerName, gracePeriod);
+ // Invoking `docker stop` might be unsuccessful, in which case the
+ // container most probably does not receive the signal. In this case we
+ // should allow schedulers to retry the kill operation or, if the kill
+ // was initiated by a failing health check, retry ourselves. We do not
+ // bail out nor stop retrying to avoid sending a terminal status update
+ // while the container might still be running.
+ //
+ // NOTE: `docker stop` might also hang. We do not address this for now,
+ // because there is no evidence that in this case docker daemon might
+ // function properly, i.e., it's only the docker cli command that hangs,
+ // and hence there is not so much we can do. See MESOS-6743.
stop.onFailed(defer(self(), [=](const string& failure) {
LOG(ERROR) << "Failed to stop container '" << containerName << "'"
<< ": " << failure;
+
+ killingInProgress = false;
+
+ if (killedByHealthCheck) {
+ LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
+ delay(
+ KILL_RETRY_INTERVAL,
+ self(),
+ &Self::_killTask,
+ taskId_,
+ gracePeriod);
+ }
}));
}
}
@@ -597,9 +633,11 @@ private:
// TODO(alexr): Introduce a state enum and document transitions,
// see MESOS-5252.
bool killed;
- bool killedByHealthCheck;
bool terminated;
+ bool killedByHealthCheck;
+ bool killingInProgress; // Guard against simultaneous kill attempts.
+
string launcherDir;
Owned<Docker> docker;
string containerName;