You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by al...@apache.org on 2017/08/10 23:30:22 UTC
[1/3] mesos git commit: Added MESOS-6743 to 1.2.3 CHANGELOG.
Repository: mesos
Updated Branches:
refs/heads/1.2.x 959125a9c -> 67598ba6e
Added MESOS-6743 to 1.2.3 CHANGELOG.
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/67598ba6
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/67598ba6
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/67598ba6
Branch: refs/heads/1.2.x
Commit: 67598ba6e0c316b557d09dc965ea7471be149b1f
Parents: 8440065
Author: Alexander Rukletsov <al...@apache.org>
Authored: Fri Aug 11 00:06:57 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 01:29:48 2017 +0200
----------------------------------------------------------------------
CHANGELOG | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/67598ba6/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index bc92b04..8989a57 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,18 @@
-Release Notes - Mesos - Version 1.2.2
+Release Notes - Mesos - Version 1.2.3 (WIP)
-------------------------------------------
* This is a bug fix release.
All Issues:
** Bug
+ * [MESOS-6743] - Docker executor hangs forever if `docker stop` fails.
+
+
+Release Notes - Mesos - Version 1.2.2
+-------------------------------------
+* This is a bug fix release.
+
+All Issues:
+** Bug
* [MESOS-5187] - The filesystem/linux isolator does not set the permissions of the host_path.
* [MESOS-7252] - Need to fix resource check in long-lived framework.
* [MESOS-7546] - WAIT_NESTED_CONTAINER sometimes returns 404.
@@ -21,8 +30,9 @@ All Issues:
** Improvement
* [MESOS-7540] - Add an agent flag for executor re-registration timeout.
+
Release Notes - Mesos - Version 1.2.1
--------------------------------------------
+-------------------------------------
* This is a bug fix release.
**NOTE**: In Mesos 1.2.1, the master will no longer allow 0.x agents to
[3/3] mesos git commit: Added logging in docker executor on `docker
stop` failure.
Posted by al...@apache.org.
Added logging in docker executor on `docker stop` failure.
Review: https://reviews.apache.org/r/61435/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/89747f4a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/89747f4a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/89747f4a
Branch: refs/heads/1.2.x
Commit: 89747f4a0ae6bfdb96d7d7db70b7537755c5d4bc
Parents: 959125a
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Thu Aug 10 18:52:51 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 01:29:48 2017 +0200
----------------------------------------------------------------------
src/docker/executor.cpp | 5 +++++
1 file changed, 5 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/89747f4a/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 6786687..e638451 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -374,6 +374,11 @@ private:
// that we can adjust the grace period in the case of
// a `KillPolicy` override.
stop = docker->stop(containerName, gracePeriod);
+
+ stop.onFailed(defer(self(), [=](const string& failure) {
+ LOG(ERROR) << "Failed to stop container '" << containerName << "'"
+ << ": " << failure;
+ }));
}
}
[2/3] mesos git commit: Enabled retries for `killTask` in docker
executor.
Posted by al...@apache.org.
Enabled retries for `killTask` in docker executor.
Previously, after a `docker stop` command failure, the docker executor
neither allowed a scheduler to retry the `killTask` command, nor retried
`killTask` itself when the task kill was triggered by a failed health check.
Review: https://reviews.apache.org/r/61530/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8440065c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8440065c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8440065c
Branch: refs/heads/1.2.x
Commit: 8440065c5c4ce1d9c026ae225bed984e855d080b
Parents: 89747f4
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Thu Aug 10 18:53:03 2017 +0200
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Aug 11 01:29:48 2017 +0200
----------------------------------------------------------------------
src/docker/executor.cpp | 76 +++++++++++++++++++++++++++++++++-----------
1 file changed, 57 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/8440065c/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index e638451..d7f2134 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,6 +22,7 @@
#include <mesos/executor.hpp>
#include <mesos/mesos.hpp>
+#include <process/delay.hpp>
#include <process/id.hpp>
#include <process/owned.hpp>
#include <process/process.hpp>
@@ -70,6 +71,7 @@ namespace docker {
const Duration DOCKER_INSPECT_DELAY = Milliseconds(500);
const Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+const Duration KILL_RETRY_INTERVAL = Seconds(5);
// Executor that is responsible to execute a docker container and
// redirect log output to configured stdout and stderr files. Similar
@@ -89,8 +91,9 @@ public:
const map<string, string>& taskEnvironment)
: ProcessBase(ID::generate("docker-executor")),
killed(false),
- killedByHealthCheck(false),
terminated(false),
+ killedByHealthCheck(false),
+ killingInProgress(false),
launcherDir(launcherDir),
docker(docker),
containerName(containerName),
@@ -325,8 +328,8 @@ private:
// the grace period if a new one is provided.
// Issue the kill signal if the container is running
- // and we haven't killed it yet.
- if (run.isSome() && !killed) {
+ // and kill attempt is not in progress.
+ if (run.isSome() && !killingInProgress) {
// We have to issue the kill after 'docker inspect' has
// completed, otherwise we may race with 'docker run'
// and docker may not know about the container. Note
@@ -345,7 +348,15 @@ private:
CHECK_SOME(taskId);
CHECK_EQ(taskId_, taskId.get());
- if (!terminated && !killed) {
+ if (!terminated && !killingInProgress) {
+ killingInProgress = true;
+
+ // Once the task has been transitioned to `killed`,
+ // there is no way back, even if the kill attempt
+ // failed. This also allows us to send TASK_KILLING
+ // only once, regardless of how many kill attempts
+ // have been made.
+ //
// Because we rely on `killed` to determine whether
// to send TASK_KILLED, we set `killed` only once the
// kill is issued. If we set it earlier we're more
@@ -353,21 +364,23 @@ private:
// signaled the container. Note that in general it's
// a race between signaling and the container
// terminating with a non-zero exit status.
- killed = true;
-
- // Send TASK_KILLING if the framework can handle it.
- if (protobuf::frameworkHasCapability(
- frameworkInfo.get(),
- FrameworkInfo::Capability::TASK_KILLING_STATE)) {
- TaskStatus status;
- status.mutable_task_id()->CopyFrom(taskId.get());
- status.set_state(TASK_KILLING);
- driver.get()->sendStatusUpdate(status);
- }
+ if (!killed) {
+ killed = true;
+
+ // Send TASK_KILLING if the framework can handle it.
+ if (protobuf::frameworkHasCapability(
+ frameworkInfo.get(),
+ FrameworkInfo::Capability::TASK_KILLING_STATE)) {
+ TaskStatus status;
+ status.mutable_task_id()->CopyFrom(taskId.get());
+ status.set_state(TASK_KILLING);
+ driver.get()->sendStatusUpdate(status);
+ }
- // Stop health checking the task.
- if (checker.get() != nullptr) {
- checker->stop();
+ // Stop health checking the task.
+ if (checker.get() != nullptr) {
+ checker->stop();
+ }
}
// TODO(bmahler): Replace this with 'docker kill' so
@@ -375,9 +388,32 @@ private:
// a `KillPolicy` override.
stop = docker->stop(containerName, gracePeriod);
+ // Invoking `docker stop` might be unsuccessful, in which case the
+ // container most probably does not receive the signal. In this case we
+ // should allow schedulers to retry the kill operation or, if the kill
+ // was initiated by a failing health check, retry ourselves. We do not
+ // bail out nor stop retrying to avoid sending a terminal status update
+ // while the container might still be running.
+ //
+ // NOTE: `docker stop` might also hang. We do not address this for now,
+ // because there is no evidence that in this case docker daemon might
+ // function properly, i.e., it's only the docker cli command that hangs,
+ // and hence there is not so much we can do. See MESOS-6743.
stop.onFailed(defer(self(), [=](const string& failure) {
LOG(ERROR) << "Failed to stop container '" << containerName << "'"
<< ": " << failure;
+
+ killingInProgress = false;
+
+ if (killedByHealthCheck) {
+ LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
+ delay(
+ KILL_RETRY_INTERVAL,
+ self(),
+ &Self::_killTask,
+ taskId_,
+ gracePeriod);
+ }
}));
}
}
@@ -552,9 +588,11 @@ private:
// TODO(alexr): Introduce a state enum and document transitions,
// see MESOS-5252.
bool killed;
- bool killedByHealthCheck;
bool terminated;
+ bool killedByHealthCheck;
+ bool killingInProgress; // Guard against simultaneous kill attempts.
+
string launcherDir;
Owned<Docker> docker;
string containerName;