You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by cf...@apache.org on 2021/10/15 20:15:07 UTC
[mesos] branch master updated: Fixed an agent crash in case of
duplicate task ID.
This is an automated email from the ASF dual-hosted git repository.
cfnatali pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
The following commit(s) were added to refs/heads/master by this push:
new c857b62 Fixed an agent crash in case of duplicate task ID.
c857b62 is described below
commit c857b6292e5b281b5aafe5e83a90b2b3c5ad01bb
Author: Charles-Francois Natali <cf...@gmail.com>
AuthorDate: Thu Sep 9 22:12:42 2021 +0100
Fixed an agent crash in case of duplicate task ID.
When using the command executor, if a task is started with the same ID
as a previous task for which the agent still has the executor, reject
the task instead of crashing.
Closes MESOS-9657.
---
src/slave/slave.cpp | 11 +++++-
src/tests/slave_tests.cpp | 95 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 105 insertions(+), 1 deletion(-)
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index a2d6eb7..3d53db4 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -3191,7 +3191,16 @@ void Slave::__run(
if (taskGroup.isNone() && task->has_command()) {
// We are dealing with command task; a new command executor will be
// launched.
- CHECK(executor == nullptr);
+ // It is possible for an executor with this ID to already exist, if the
+ // TaskID was re-used - see MESOS-9657. If this happens, we have no
+ // choice but to drop the task.
+ if (executor != nullptr) {
+ sendTaskDroppedUpdate(
+ TaskStatus::REASON_TASK_INVALID,
+ "Cannot reuse an already existing executor for a command task");
+
+ return;
+ }
} else {
// Master set the `launch_executor` flag and this is not a command task.
if (launchExecutor.get() && executor != nullptr) {
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index b46e561..b60b260 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12876,6 +12876,101 @@ TEST_F(SlaveTest, CheckpointedDrainInfo)
EXPECT_EQ(TaskStatus::REASON_SLAVE_DRAINING, statusKilled->reason());
}
+TEST_F(SlaveTest, DuplicateTaskIdCommandExecutor)
+{
+ // Start a master.
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ Owned<MasterDetector> detector = master.get()->createDetector();
+
+ // Start a slave.
+ Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+ ASSERT_SOME(slave);
+
+ FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+
+ MockScheduler sched;
+ MesosSchedulerDriver driver(
+ &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched, registered(_, _, _));
+
+ Future<vector<Offer>> offers1;
+
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(FutureArg<1>(&offers1))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ driver.start();
+
+ AWAIT_READY(offers1);
+ ASSERT_FALSE(offers1->empty());
+ Offer offer1 = offers1.get()[0];
+
+ // Start the first task, and wait for it to complete.
+ TaskInfo task1 = createTask(
+ offer1.slave_id(),
+ Resources::parse("cpus:0.1;mem:32").get(),
+ echoAuthorCommand(),
+ None(),
+ "task-1",
+ "duplicate-task-id");
+
+ Future<TaskStatus> statusStarting1;
+ Future<TaskStatus> statusRunning1;
+ Future<TaskStatus> statusFinished1;
+ EXPECT_CALL(sched, statusUpdate(_, _))
+ .WillOnce(FutureArg<1>(&statusStarting1))
+ .WillOnce(FutureArg<1>(&statusRunning1))
+ .WillOnce(FutureArg<1>(&statusFinished1));
+
+ driver.launchTasks(offer1.id(), {task1});
+
+ AWAIT_READY(statusStarting1);
+ EXPECT_EQ(TASK_STARTING, statusStarting1->state());
+
+ AWAIT_READY(statusRunning1);
+ EXPECT_EQ(TASK_RUNNING, statusRunning1->state());
+
+ AWAIT_READY(statusFinished1);
+ EXPECT_EQ(TASK_FINISHED, statusFinished1->state());
+
+ // Start second task with same TaskId.
+ Future<vector<Offer>> offers2;
+
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(FutureArg<1>(&offers2))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ AWAIT_READY(offers2);
+ ASSERT_FALSE(offers2->empty());
+ Offer offer2 = offers2.get()[0];
+
+ TaskInfo task2 = createTask(
+ offer2.slave_id(),
+ Resources::parse("cpus:0.1;mem:32").get(),
+ echoAuthorCommand(),
+ None(),
+ "task-2",
+ "duplicate-task-id");
+
+ Future<TaskStatus> status2;
+ EXPECT_CALL(sched, statusUpdate(_, _))
+ .WillOnce(FutureArg<1>(&status2));
+
+ driver.launchTasks(offer2.id(), {task2});
+
+ AWAIT_READY(status2);
+ EXPECT_EQ(TASK_LOST, status2->state());
+ EXPECT_EQ(TaskStatus::REASON_TASK_INVALID, status2->reason());
+ EXPECT_EQ(status2->message(),
+ "Cannot reuse an already existing executor for a command task");
+
+ driver.stop();
+ driver.join();
+}
+
} // namespace tests {
} // namespace internal {
} // namespace mesos {