You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by cf...@apache.org on 2021/10/15 20:15:07 UTC

[mesos] branch master updated: Fixed an agent crash in case of duplicate task ID.

This is an automated email from the ASF dual-hosted git repository.

cfnatali pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git


The following commit(s) were added to refs/heads/master by this push:
     new c857b62  Fixed an agent crash in case of duplicate task ID.
c857b62 is described below

commit c857b6292e5b281b5aafe5e83a90b2b3c5ad01bb
Author: Charles-Francois Natali <cf...@gmail.com>
AuthorDate: Thu Sep 9 22:12:42 2021 +0100

    Fixed an agent crash in case of duplicate task ID.
    
    When using the command executor, if a task is started with the same ID
    as a previous task for which the agent still has the executor, reject
    the task instead of crashing.
    
    Closes MESOS-9657.
---
 src/slave/slave.cpp       | 11 +++++-
 src/tests/slave_tests.cpp | 95 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index a2d6eb7..3d53db4 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -3191,7 +3191,16 @@ void Slave::__run(
     if (taskGroup.isNone() && task->has_command()) {
       // We are dealing with command task; a new command executor will be
       // launched.
-      CHECK(executor == nullptr);
+      // It is possible for an executor with this ID to already exist, if the
+      // TaskID was re-used - see MESOS-9657. If this happens, we have no
+      // choice but to drop the task.
+      if (executor != nullptr) {
+        sendTaskDroppedUpdate(
+            TaskStatus::REASON_TASK_INVALID,
+            "Cannot reuse an already existing executor for a command task");
+
+        return;
+      }
     } else {
       // Master set the `launch_executor` flag and this is not a command task.
       if (launchExecutor.get() && executor != nullptr) {
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index b46e561..b60b260 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -12876,6 +12876,101 @@ TEST_F(SlaveTest, CheckpointedDrainInfo)
   EXPECT_EQ(TaskStatus::REASON_SLAVE_DRAINING, statusKilled->reason());
 }
 
+TEST_F(SlaveTest, DuplicateTaskIdCommandExecutor)
+{
+  // Start a master.
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  // Start a slave.
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers1;
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers1))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers1);
+  ASSERT_FALSE(offers1->empty());
+  Offer offer1 = offers1.get()[0];
+
+  // Start the first task, and wait for it to complete.
+  TaskInfo task1 = createTask(
+      offer1.slave_id(),
+      Resources::parse("cpus:0.1;mem:32").get(),
+      echoAuthorCommand(),
+      None(),
+      "task-1",
+      "duplicate-task-id");
+
+  Future<TaskStatus> statusStarting1;
+  Future<TaskStatus> statusRunning1;
+  Future<TaskStatus> statusFinished1;
+  EXPECT_CALL(sched, statusUpdate(_, _))
+    .WillOnce(FutureArg<1>(&statusStarting1))
+    .WillOnce(FutureArg<1>(&statusRunning1))
+    .WillOnce(FutureArg<1>(&statusFinished1));
+
+  driver.launchTasks(offer1.id(), {task1});
+
+  AWAIT_READY(statusStarting1);
+  EXPECT_EQ(TASK_STARTING, statusStarting1->state());
+
+  AWAIT_READY(statusRunning1);
+  EXPECT_EQ(TASK_RUNNING, statusRunning1->state());
+
+  AWAIT_READY(statusFinished1);
+  EXPECT_EQ(TASK_FINISHED, statusFinished1->state());
+
+  // Start second task with same TaskId.
+  Future<vector<Offer>> offers2;
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers2))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  AWAIT_READY(offers2);
+  ASSERT_FALSE(offers2->empty());
+  Offer offer2 = offers2.get()[0];
+
+  TaskInfo task2 = createTask(
+      offer2.slave_id(),
+      Resources::parse("cpus:0.1;mem:32").get(),
+      echoAuthorCommand(),
+      None(),
+      "task-2",
+      "duplicate-task-id");
+
+  Future<TaskStatus> status2;
+  EXPECT_CALL(sched, statusUpdate(_, _))
+    .WillOnce(FutureArg<1>(&status2));
+
+  driver.launchTasks(offer2.id(), {task2});
+
+  AWAIT_READY(status2);
+  EXPECT_EQ(TASK_LOST, status2->state());
+  EXPECT_EQ(TaskStatus::REASON_TASK_INVALID, status2->reason());
+  EXPECT_EQ(status2->message(),
+      "Cannot reuse an already existing executor for a command task");
+
+  driver.stop();
+  driver.join();
+}
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {