You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/10/19 23:33:55 UTC
[05/10] mesos git commit: Changed scheduler driver to send TASK_DROPPED.
Changed scheduler driver to send TASK_DROPPED.
If a scheduler tries to launch a task while the scheduler driver is
disconnected from the master, the driver generates a synthetic TASK_LOST
status update to tell the scheduler that the task launch did not succeed.
If the framework has the PARTITION_AWARE capability, the scheduler driver
now sends TASK_DROPPED instead; frameworks without that capability still
receive TASK_LOST.
Review: https://reviews.apache.org/r/52658/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f8a0c28b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f8a0c28b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f8a0c28b
Branch: refs/heads/master
Commit: f8a0c28b5f7a8cb86432882f65440be32a052764
Parents: 1a3e931
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:58 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:58 2016 -0700
----------------------------------------------------------------------
src/sched/sched.cpp | 13 +++++--
src/tests/fault_tolerance_tests.cpp | 63 ++++++++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 9d1b5ce..6a44d57 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -1300,8 +1300,15 @@ protected:
if (!connected) {
VLOG(1) << "Ignoring accept offers message as master is disconnected";
- // NOTE: Reply to the framework with TASK_LOST messages for each
- // task launch. See details from notes in launchTasks.
+ // Reply to the framework with TASK_DROPPED messages for each
+ // task launch. If the framework is not partition-aware, we send
+ // TASK_LOST instead. See details from notes in `launchTasks`.
+ TaskState newTaskState = TASK_DROPPED;
+ if (!protobuf::frameworkHasCapability(
+ framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
+ newTaskState = TASK_LOST;
+ }
+
foreach (const Offer::Operation& operation, operations) {
if (operation.type() != Offer::Operation::LAUNCH) {
continue;
@@ -1312,7 +1319,7 @@ protected:
framework.id(),
None(),
task.task_id(),
- TASK_LOST,
+ newTaskState,
TaskStatus::SOURCE_MASTER,
None(),
"Master disconnected",
http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index e15bf8d..95ac98c 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -914,6 +914,69 @@ TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchLost)
}
+// This test checks that if a partition-aware scheduler that is
+// disconnected from the master attempts to launch a task, it receives
+// a TASK_DROPPED status update.
+TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchDropped)
+{
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ StandaloneMasterDetector detector(master.get()->pid);
+ Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
+ ASSERT_SOME(slave);
+
+ FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+ frameworkInfo.add_capabilities()->set_type(
+ FrameworkInfo::Capability::PARTITION_AWARE);
+
+ MockScheduler sched;
+ TestingMesosSchedulerDriver driver(&sched, &detector, frameworkInfo);
+
+ EXPECT_CALL(sched, registered(&driver, _, _));
+
+ Future<vector<Offer>> offers;
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ Future<FrameworkRegisteredMessage> message =
+ FUTURE_PROTOBUF(FrameworkRegisteredMessage(), _, _);
+
+ driver.start();
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0u, offers.get().size());
+
+ AWAIT_READY(message);
+
+ Future<Nothing> disconnected;
+ EXPECT_CALL(sched, disconnected(&driver))
+ .WillOnce(FutureSatisfy(&disconnected));
+
+ // Simulate a spurious master loss event at the scheduler.
+ detector.appoint(None());
+
+ AWAIT_READY(disconnected);
+
+ TaskInfo task = createTask(offers.get()[0], "sleep 60");
+
+ Future<TaskStatus> status;
+ EXPECT_CALL(sched, statusUpdate(&driver, _))
+ .WillOnce(FutureArg<1>(&status));
+
+ driver.launchTasks(offers.get()[0].id(), {task});
+
+ AWAIT_READY(status);
+ EXPECT_EQ(TASK_DROPPED, status.get().state());
+ EXPECT_EQ(TaskStatus::REASON_MASTER_DISCONNECTED, status.get().reason());
+ EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
+
+ driver.stop();
+ driver.join();
+}
+
+
// This test checks that a failover scheduler gets the
// retried status update.
TEST_F(FaultToleranceTest, SchedulerFailoverStatusUpdate)