You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by an...@apache.org on 2017/02/07 17:43:43 UTC
[4/5] mesos git commit: Made `kill()` not invoke `shutdown()` in the
default executor.
Made `kill()` not invoke `shutdown()` in the default executor.
This ensures that when we add support for the default executor
launch multiple task groups, kill for a particular task should
_only_ result in the killing of a task group and not the entire
executor as was the case currently.
Review: https://reviews.apache.org/r/56268/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/7458ba52
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/7458ba52
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/7458ba52
Branch: refs/heads/master
Commit: 7458ba52dee908608f452e9adabca930656d39bf
Parents: 11fb934
Author: Anand Mazumdar <an...@apache.org>
Authored: Tue Feb 7 09:42:19 2017 -0800
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Feb 7 09:42:52 2017 -0800
----------------------------------------------------------------------
src/launcher/default_executor.cpp | 39 ++++++++++++++++++++++---------
src/tests/default_executor_tests.cpp | 12 ++++++++++
2 files changed, 40 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/7458ba52/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 327510e..4b21854 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -92,6 +92,9 @@ private:
// that a container is active but a connection for sending the
// `WAIT_NESTED_CONTAINER` call has not been established yet.
Option<Connection> waiting;
+
+ // Set to true if the child container is in the process of being killed.
+ bool killing;
};
public:
@@ -408,7 +411,7 @@ protected:
tasks[taskId] = task;
containers[taskId] = Owned<Container>(
- new Container {containerId, taskId, taskGroup, None()});
+ new Container {containerId, taskId, taskGroup, None(), false});
if (task.has_health_check()) {
// TODO(anand): Add support for command health checks.
@@ -649,7 +652,7 @@ protected:
if (WSUCCEEDED(status.get())) {
taskState = TASK_FINISHED;
- } else if (shuttingDown) {
+ } else if (container->killing) {
// Send TASK_KILLED if the task was killed as a result of
// `killTask()` or `shutdown()`.
taskState = TASK_KILLED;
@@ -687,7 +690,7 @@ protected:
// The default restart policy for a task group is to kill all the
// remaining child containers if one of them terminated with a
// non-zero exit code.
- if (taskState == TASK_FAILED) {
+ if (taskState == TASK_FAILED || taskState == TASK_KILLED) {
// Kill all the other active containers.
//
// TODO(anand): Invoke `kill()` once per active container
@@ -730,14 +733,21 @@ protected:
CHECK_EQ(SUBSCRIBED, state);
- list<Future<Nothing>> killing;
- foreachkey (const TaskID& taskId, containers) {
- killing.push_back(kill(taskId));
+ list<Future<Nothing>> killResponses;
+ foreachvalue (const Owned<Container>& container, containers) {
+ // It is possible that we received a `killTask()` request
+ // from the scheduler before and are waiting on the `waited()`
+ // callback to be invoked for the child container.
+ if (container->killing) {
+ continue;
+ }
+
+ killResponses.push_back(kill(container));
}
// It is possible that the agent process can fail while we are
// killing child containers. We fail fast if this happens.
- collect(killing)
+ collect(killResponses)
.onAny(defer(
self(),
[this](const Future<list<Nothing>>& future) {
@@ -767,12 +777,12 @@ protected:
terminate(self());
}
- Future<Nothing> kill(const TaskID& taskId)
+ Future<Nothing> kill(Owned<Container> container)
{
CHECK_EQ(SUBSCRIBED, state);
- CHECK(containers.contains(taskId));
- const Owned<Container>& container = containers.at(taskId);
+ CHECK(!container->killing);
+ container->killing = true;
LOG(INFO) << "Killing child container " << container->containerId;
@@ -810,7 +820,14 @@ protected:
return;
}
- shutdown();
+ const Owned<Container>& container = containers.at(taskId);
+ if (container->killing) {
+ LOG(WARNING) << "Ignoring kill for task '" << taskId
+ << "' as it is in the process of getting killed";
+ return;
+ }
+
+ kill(container);
}
void taskHealthUpdated(const TaskHealthStatus& healthStatus)
http://git-wip-us.apache.org/repos/asf/mesos/blob/7458ba52/src/tests/default_executor_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/default_executor_tests.cpp b/src/tests/default_executor_tests.cpp
index e8e0aa2..adf5e56 100644
--- a/src/tests/default_executor_tests.cpp
+++ b/src/tests/default_executor_tests.cpp
@@ -406,6 +406,10 @@ TEST_P(DefaultExecutorTest, KillTask)
.WillOnce(FutureArg<1>(&killedUpdate1))
.WillOnce(FutureArg<1>(&killedUpdate2));
+ Future<v1::scheduler::Event::Failure> executorFailure;
+ EXPECT_CALL(*scheduler, failure(_, _))
+ .WillOnce(FutureArg<1>(&executorFailure));
+
// Now kill one task in the task group.
{
Call call;
@@ -433,6 +437,14 @@ TEST_P(DefaultExecutorTest, KillTask)
killedUpdate2->status().task_id()};
ASSERT_EQ(tasks, tasksKilled);
+
+ // The executor should commit suicide after all the tasks have been
+ // killed.
+ AWAIT_READY(executorFailure);
+
+ // Even though the tasks were killed, the executor should exit gracefully.
+ ASSERT_TRUE(executorFailure->has_status());
+ ASSERT_EQ(0, executorFailure->status());
}