You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/10/19 23:33:55 UTC

[05/10] mesos git commit: Changed scheduler driver to send TASK_DROPPED.

Changed scheduler driver to send TASK_DROPPED.

If a scheduler tries to launch a task when the scheduler driver is not
connected to the master, the scheduler driver creates a faux TASK_LOST
status update to indicate that the task launch has not succeeded. If the
framework is PARTITION_AWARE, the scheduler driver will now send
TASK_DROPPED instead.

Review: https://reviews.apache.org/r/52658/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f8a0c28b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f8a0c28b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f8a0c28b

Branch: refs/heads/master
Commit: f8a0c28b5f7a8cb86432882f65440be32a052764
Parents: 1a3e931
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:58 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:58 2016 -0700

----------------------------------------------------------------------
 src/sched/sched.cpp                 | 13 +++++--
 src/tests/fault_tolerance_tests.cpp | 63 ++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 9d1b5ce..6a44d57 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -1300,8 +1300,15 @@ protected:
     if (!connected) {
       VLOG(1) << "Ignoring accept offers message as master is disconnected";
 
-      // NOTE: Reply to the framework with TASK_LOST messages for each
-      // task launch. See details from notes in launchTasks.
+      // Reply to the framework with TASK_DROPPED messages for each
+      // task launch. If the framework is not partition-aware, we send
+      // TASK_LOST instead. See details from notes in `launchTasks`.
+      TaskState newTaskState = TASK_DROPPED;
+      if (!protobuf::frameworkHasCapability(
+              framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
+        newTaskState = TASK_LOST;
+      }
+
       foreach (const Offer::Operation& operation, operations) {
         if (operation.type() != Offer::Operation::LAUNCH) {
           continue;
@@ -1312,7 +1319,7 @@ protected:
               framework.id(),
               None(),
               task.task_id(),
-              TASK_LOST,
+              newTaskState,
               TaskStatus::SOURCE_MASTER,
               None(),
               "Master disconnected",

http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index e15bf8d..95ac98c 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -914,6 +914,69 @@ TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchLost)
 }
 
 
+// This test checks that if a partition-aware scheduler that is
+// disconnected from the master attempts to launch a task, it receives
+// a TASK_DROPPED update (REASON_MASTER_DISCONNECTED, SOURCE_MASTER).
+TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchDropped)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  StandaloneMasterDetector detector(master.get()->pid);
+  Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  TestingMesosSchedulerDriver driver(&sched, &detector, frameworkInfo);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  Future<FrameworkRegisteredMessage> message =
+    FUTURE_PROTOBUF(FrameworkRegisteredMessage(), _, _);
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size()); // offers.get()[0] is used below.
+
+  AWAIT_READY(message); // Wait for the FrameworkRegisteredMessage.
+
+  Future<Nothing> disconnected;
+  EXPECT_CALL(sched, disconnected(&driver))
+    .WillOnce(FutureSatisfy(&disconnected));
+
+  // Simulate a spurious master loss event at the scheduler.
+  detector.appoint(None());
+
+  AWAIT_READY(disconnected); // Scheduler got the disconnected() callback.
+
+  TaskInfo task = createTask(offers.get()[0], "sleep 60");
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  driver.launchTasks(offers.get()[0].id(), {task}); // Driver is disconnected.
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_DROPPED, status.get().state());
+  EXPECT_EQ(TaskStatus::REASON_MASTER_DISCONNECTED, status.get().reason());
+  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test checks that a failover scheduler gets the
 // retried status update.
 TEST_F(FaultToleranceTest, SchedulerFailoverStatusUpdate)