You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by al...@apache.org on 2017/08/10 23:26:16 UTC
[1/3] mesos git commit: Added logging in docker executor on `docker
stop` failure.
Repository: mesos
Updated Branches:
refs/heads/1.3.x f38bfbf34 -> f2deb04a5
Added logging in docker executor on `docker stop` failure.
Review: https://reviews.apache.org/r/61435/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5852756b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5852756b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5852756b
Branch: refs/heads/1.3.x
Commit: 5852756ba6faa9f9704c677901a773f4c98109d8
Parents: f38bfbf
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Thu Aug 10 18:52:51 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 00:08:53 2017 +0200
----------------------------------------------------------------------
src/docker/executor.cpp | 5 +++++
1 file changed, 5 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/5852756b/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 82ae9bd..875b43c 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -406,6 +406,11 @@ private:
// that we can adjust the grace period in the case of
// a `KillPolicy` override.
stop = docker->stop(containerName, gracePeriod);
+
+ stop.onFailed(defer(self(), [=](const string& failure) {
+ LOG(ERROR) << "Failed to stop container '" << containerName << "'"
+ << ": " << failure;
+ }));
}
}
[3/3] mesos git commit: Added MESOS-6743 to 1.3.2 CHANGELOG.
Posted by al...@apache.org.
Added MESOS-6743 to 1.3.2 CHANGELOG.
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f2deb04a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f2deb04a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f2deb04a
Branch: refs/heads/1.3.x
Commit: f2deb04a57cc050eddb3d242f170e4fe848c5b93
Parents: 67723f0
Author: Alexander Rukletsov <al...@apache.org>
Authored: Fri Aug 11 00:14:05 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 00:14:05 2017 +0200
----------------------------------------------------------------------
CHANGELOG | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/f2deb04a/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index df8310a..5257d33 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,18 @@
-Release Notes - Mesos - Version 1.3.1 (WIP)
+Release Notes - Mesos - Version 1.3.2 (WIP)
-------------------------------------------
* This is a bug fix release.
All Issues:
** Bug
+ * [MESOS-6743] - Docker executor hangs forever if `docker stop` fails.
+
+
+Release Notes - Mesos - Version 1.3.1
+-------------------------------------
+* This is a bug fix release.
+
+All Issues:
+** Bug
* [MESOS-5187] - The filesystem/linux isolator does not set the permissions of the host_path.
* [MESOS-7252] - Need to fix resource check in long-lived framework.
* [MESOS-7429] - Allow isolators to inject task-specific environment variables.
@@ -23,7 +32,7 @@ All Issues:
Release Notes - Mesos - Version 1.3.0
--------------------------------------------
+-------------------------------------
This release contains the following new features:
* [MESOS-1763] - Support for frameworks to receive resources for multiple
[2/3] mesos git commit: Enabled retries for `killTask` in docker
executor.
Posted by al...@apache.org.
Enabled retries for `killTask` in docker executor.
Previously, after `docker stop` command failure, docker executor
neither allowed a scheduler to retry `killTask` command, nor retried
`killTask` when task kill was triggered by a failed health check.
Review: https://reviews.apache.org/r/61530/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/67723f01
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/67723f01
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/67723f01
Branch: refs/heads/1.3.x
Commit: 67723f01b5ca3583725011c691e8ffd6b89a02e5
Parents: 5852756
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Thu Aug 10 18:53:03 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 00:09:04 2017 +0200
----------------------------------------------------------------------
src/docker/executor.cpp | 80 ++++++++++++++++++++++++++++++++------------
1 file changed, 59 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/67723f01/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 875b43c..c6f50fa 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,6 +22,7 @@
#include <mesos/executor.hpp>
#include <mesos/mesos.hpp>
+#include <process/delay.hpp>
#include <process/id.hpp>
#include <process/owned.hpp>
#include <process/process.hpp>
@@ -70,6 +71,7 @@ namespace docker {
const Duration DOCKER_INSPECT_DELAY = Milliseconds(500);
const Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+const Duration KILL_RETRY_INTERVAL = Seconds(5);
// Executor that is responsible to execute a docker container and
// redirect log output to configured stdout and stderr files. Similar
@@ -90,8 +92,9 @@ public:
bool cgroupsEnableCfs)
: ProcessBase(ID::generate("docker-executor")),
killed(false),
- killedByHealthCheck(false),
terminated(false),
+ killedByHealthCheck(false),
+ killingInProgress(false),
launcherDir(launcherDir),
docker(docker),
containerName(containerName),
@@ -355,8 +358,8 @@ private:
// the grace period if a new one is provided.
// Issue the kill signal if the container is running
- // and we haven't killed it yet.
- if (run.isSome() && !killed) {
+ // and kill attempt is not in progress.
+ if (run.isSome() && !killingInProgress) {
// We have to issue the kill after 'docker inspect' has
// completed, otherwise we may race with 'docker run'
// and docker may not know about the container. Note
@@ -375,7 +378,15 @@ private:
CHECK_SOME(taskId);
CHECK_EQ(taskId_, taskId.get());
- if (!terminated && !killed) {
+ if (!terminated && !killingInProgress) {
+ killingInProgress = true;
+
+ // Once the task has been transitioned to `killed`,
+ // there is no way back, even if the kill attempt
+ // failed. This also allows us to send TASK_KILLING
+ // only once, regardless of how many kill attempts
+ // have been made.
+ //
// Because we rely on `killed` to determine whether
// to send TASK_KILLED, we set `killed` only once the
// kill is issued. If we set it earlier we're more
@@ -383,23 +394,25 @@ private:
// signaled the container. Note that in general it's
// a race between signaling and the container
// terminating with a non-zero exit status.
- killed = true;
-
- // Send TASK_KILLING if the framework can handle it.
- if (protobuf::frameworkHasCapability(
- frameworkInfo.get(),
- FrameworkInfo::Capability::TASK_KILLING_STATE)) {
- // TODO(alexr): Use `protobuf::createTaskStatus()`
- // instead of manually setting fields.
- TaskStatus status;
- status.mutable_task_id()->CopyFrom(taskId.get());
- status.set_state(TASK_KILLING);
- driver.get()->sendStatusUpdate(status);
- }
+ if (!killed) {
+ killed = true;
- // Stop health checking the task.
- if (checker.get() != nullptr) {
- checker->pause();
+ // Send TASK_KILLING if the framework can handle it.
+ if (protobuf::frameworkHasCapability(
+ frameworkInfo.get(),
+ FrameworkInfo::Capability::TASK_KILLING_STATE)) {
+ // TODO(alexr): Use `protobuf::createTaskStatus()`
+ // instead of manually setting fields.
+ TaskStatus status;
+ status.mutable_task_id()->CopyFrom(taskId.get());
+ status.set_state(TASK_KILLING);
+ driver.get()->sendStatusUpdate(status);
+ }
+
+ // Stop health checking the task.
+ if (checker.get() != nullptr) {
+ checker->pause();
+ }
}
// TODO(bmahler): Replace this with 'docker kill' so
@@ -407,9 +420,32 @@ private:
// a `KillPolicy` override.
stop = docker->stop(containerName, gracePeriod);
+ // Invoking `docker stop` might be unsuccessful, in which case the
+ // container most probably does not receive the signal. In this case we
+ // should allow schedulers to retry the kill operation or, if the kill
+ // was initiated by a failing health check, retry ourselves. We do not
+ // bail out nor stop retrying to avoid sending a terminal status update
+ // while the container might still be running.
+ //
+ // NOTE: `docker stop` might also hang. We do not address this for now,
+ // because there is no evidence that in this case docker daemon might
+ // function properly, i.e., it's only the docker cli command that hangs,
+ // and hence there is not so much we can do. See MESOS-6743.
stop.onFailed(defer(self(), [=](const string& failure) {
LOG(ERROR) << "Failed to stop container '" << containerName << "'"
<< ": " << failure;
+
+ killingInProgress = false;
+
+ if (killedByHealthCheck) {
+ LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
+ delay(
+ KILL_RETRY_INTERVAL,
+ self(),
+ &Self::_killTask,
+ taskId_,
+ gracePeriod);
+ }
}));
}
}
@@ -597,9 +633,11 @@ private:
// TODO(alexr): Introduce a state enum and document transitions,
// see MESOS-5252.
bool killed;
- bool killedByHealthCheck;
bool terminated;
+ bool killedByHealthCheck;
+ bool killingInProgress; // Guard against simultaneous kill attempts.
+
string launcherDir;
Owned<Docker> docker;
string containerName;