You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jo...@apache.org on 2015/09/14 19:58:59 UTC
[15/16] mesos git commit: Maintenance Primitives: Shutdown slave when maintenance is started.
Maintenance Primitives: Shutdown slave when maintenance is started.
Review: https://reviews.apache.org/r/37622
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/147420e3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/147420e3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/147420e3
Branch: refs/heads/master
Commit: 147420e3e591c4b2674d3f84252066bc5d4b660c
Parents: ea96190
Author: Joris Van Remoortere <jo...@gmail.com>
Authored: Tue Aug 25 18:55:25 2015 -0400
Committer: Joris Van Remoortere <jo...@gmail.com>
Committed: Mon Sep 14 13:58:37 2015 -0400
----------------------------------------------------------------------
src/master/http.cpp | 31 ++++++++
src/tests/master_maintenance_tests.cpp | 114 ++++++++++++++++++++++++++++
2 files changed, 145 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 05b590e..f7ce9aa 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -1593,6 +1593,37 @@ Future<Response> Master::Http::machineDown(const Request& request) const
// is here, and is appropriate.
CHECK(result);
+ // We currently send a `ShutdownMessage` to each slave. This terminates
+ // all the executors for all the frameworks running on that slave.
+ // We also manually remove the slave to force sending TASK_LOST updates
+ // for all the tasks that were running on the slave and `LostSlaveMessage`
+ // messages to the frameworks. This guards against the slave having dropped
+ // the `ShutdownMessage`.
+ foreach (const MachineID& machineId, ids.values()) {
+ // The machine may not be in machines. This means no slaves are
+ // currently registered on that machine so this is a no-op.
+ if (master->machines.contains(machineId)) {
+ // NOTE: Copies are needed because removeSlave modifies
+ // master->machines.
+ foreach (
+ const SlaveID& slaveId,
+ utils::copy(master->machines[machineId].slaves)) {
+ Slave* slave = master->slaves.registered.get(slaveId);
+ CHECK_NOTNULL(slave);
+
+ // Tell the slave to shut down.
+ ShutdownMessage shutdownMessage;
+ shutdownMessage.set_message("Operator initiated 'Machine DOWN'");
+ master->send(slave->pid, shutdownMessage);
+
+ // Immediately remove the slave to force sending `TASK_LOST` status
+ // updates as well as `LostSlaveMessage` messages to the frameworks.
+ // See comment above.
+ master->removeSlave(slave, "Operator initiated 'Machine DOWN'");
+ }
+ }
+ }
+
// Update the master's local state with the downed machines.
foreach (const MachineID& id, ids.values()) {
master->machines[id].info.set_mode(MachineInfo::DOWN);
http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/tests/master_maintenance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_maintenance_tests.cpp b/src/tests/master_maintenance_tests.cpp
index 4a59389..6ae502d 100644
--- a/src/tests/master_maintenance_tests.cpp
+++ b/src/tests/master_maintenance_tests.cpp
@@ -564,6 +564,120 @@ TEST_F(MasterMaintenanceTest, PreV1SchedulerSupport)
}
+// This test ensures that slaves receive a shutdown message from the master
+// when maintenance is started, and that frameworks receive a task lost message.
+TEST_F(MasterMaintenanceTest, EnterMaintenanceMode)
+{
+ Try<PID<Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ MockExecutor exec(DEFAULT_EXECUTOR_ID);
+
+ Try<PID<Slave>> slave = StartSlave(&exec);
+ ASSERT_SOME(slave);
+
+ MockScheduler sched;
+ MesosSchedulerDriver driver(
+ &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched, registered(&driver, _, _))
+ .Times(1);
+
+ // Launch a task.
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 64, "*"))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ EXPECT_CALL(exec, registered(_, _, _, _))
+ .Times(1);
+
+ EXPECT_CALL(exec, launchTask(_, _))
+ .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+ EXPECT_CALL(exec, shutdown(_))
+ .Times(AtMost(1));
+
+ EXPECT_CALL(sched, offerRescinded(&driver, _))
+ .WillRepeatedly(Return()); // Ignore rescinds.
+
+ // Collect the status updates to verify the task is running and then lost.
+ Future<TaskStatus> startStatus, lostStatus;
+ EXPECT_CALL(sched, statusUpdate(&driver, _))
+ .WillOnce(FutureArg<1>(&startStatus))
+ .WillOnce(FutureArg<1>(&lostStatus));
+
+ // Start the test.
+ driver.start();
+
+ // Wait till the task is running to schedule the maintenance.
+ AWAIT_READY(startStatus);
+ EXPECT_EQ(TASK_RUNNING, startStatus.get().state());
+
+ // Schedule this slave for maintenance.
+ MachineID machine;
+ machine.set_hostname(maintenanceHostname);
+ machine.set_ip(stringify(slave.get().address.ip));
+
+ // TODO(jmlvanre): Replace Time(0.0) with `Clock::now()` once JSON double
+ // conversion is fixed. For now using a rounded time avoids the issue.
+ const Time start = Time::create(0.0).get() + Seconds(60);
+ const Duration duration = Seconds(120);
+ const Unavailability unavailability = createUnavailability(start, duration);
+
+ // Post a valid schedule with one machine.
+ maintenance::Schedule schedule = createSchedule(
+ {createWindow({machine}, unavailability)});
+
+ // We have a few seconds between the first set of offers and the next
+ // allocation of offers. This should be enough time to perform a maintenance
+ // schedule update. This update will also trigger the rescinding of offers
+ // from the scheduled slave.
+ Future<Response> response =
+ process::http::post(
+ master.get(),
+ "maintenance/schedule",
+ headers,
+ stringify(JSON::Protobuf(schedule)));
+
+ AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+
+ // Verify that the master forces the slave to be shut down after the
+ // maintenance is started.
+ Future<ShutdownMessage> shutdownMessage =
+ FUTURE_PROTOBUF(ShutdownMessage(), master.get(), slave.get());
+
+ // Verify that the framework will be informed that the slave is lost.
+ Future<Nothing> slaveLost;
+ EXPECT_CALL(sched, slaveLost(&driver, _))
+ .WillOnce(FutureSatisfy(&slaveLost));
+
+ // Start the maintenance.
+ response =
+ process::http::post(
+ master.get(),
+ "machine/down",
+ headers,
+ stringify(JSON::Protobuf(createMachineList({machine}))));
+
+ AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+
+ // Wait for the slave to be shut down.
+ AWAIT_READY(shutdownMessage);
+
+ // Verify that we received a TASK_LOST.
+ AWAIT_READY(lostStatus);
+ EXPECT_EQ(TASK_LOST, lostStatus.get().state());
+
+ // Verify that the framework received the slave lost message.
+ AWAIT_READY(slaveLost);
+
+ driver.stop();
+ driver.join();
+
+ Shutdown(); // Must shutdown before 'containerizer' gets deallocated.
+}
+
+
// Posts valid and invalid machines to the maintenance start endpoint.
TEST_F(MasterMaintenanceTest, BringDownMachines)
{