Posted to commits@mesos.apache.org by jo...@apache.org on 2015/09/14 19:58:59 UTC

[15/16] mesos git commit: Maintenance Primitives: Shutdown slave when maintenance is started.

Maintenance Primitives: Shutdown slave when maintenance is started.

Review: https://reviews.apache.org/r/37622


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/147420e3
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/147420e3
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/147420e3

Branch: refs/heads/master
Commit: 147420e3e591c4b2674d3f84252066bc5d4b660c
Parents: ea96190
Author: Joris Van Remoortere <jo...@gmail.com>
Authored: Tue Aug 25 18:55:25 2015 -0400
Committer: Joris Van Remoortere <jo...@gmail.com>
Committed: Mon Sep 14 13:58:37 2015 -0400

----------------------------------------------------------------------
 src/master/http.cpp                    |  31 ++++++++
 src/tests/master_maintenance_tests.cpp | 114 ++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/master/http.cpp
----------------------------------------------------------------------
diff --git a/src/master/http.cpp b/src/master/http.cpp
index 05b590e..f7ce9aa 100644
--- a/src/master/http.cpp
+++ b/src/master/http.cpp
@@ -1593,6 +1593,37 @@ Future<Response> Master::Http::machineDown(const Request& request) const
       // is here, and is appropriate.
       CHECK(result);
 
+      // We currently send a `ShutdownMessage` to each slave. This terminates
+      // all the executors for all the frameworks running on that slave.
+      // We also manually remove the slave to force sending `TASK_LOST` status
+      // updates for all the tasks that were running on the slave, as well as
+      // `LostSlaveMessage` messages to the frameworks. This guards against
+      // the slave having dropped the `ShutdownMessage`.
+      foreach (const MachineID& machineId, ids.values()) {
+        // The machine may not be in `master->machines`. This means no slaves
+        // are currently registered on that machine, so this is a no-op.
+        if (master->machines.contains(machineId)) {
+          // NOTE: Copies are needed because `removeSlave` modifies
+          // `master->machines`.
+          foreach (
+              const SlaveID& slaveId,
+              utils::copy(master->machines[machineId].slaves)) {
+            Slave* slave = master->slaves.registered.get(slaveId);
+            CHECK_NOTNULL(slave);
+
+            // Tell the slave to shut down.
+            ShutdownMessage shutdownMessage;
+            shutdownMessage.set_message("Operator initiated 'Machine DOWN'");
+            master->send(slave->pid, shutdownMessage);
+
+            // Immediately remove the slave to force sending `TASK_LOST` status
+            // updates as well as `LostSlaveMessage` messages to the frameworks.
+            // See comment above.
+            master->removeSlave(slave, "Operator initiated 'Machine DOWN'");
+          }
+        }
+      }
+
       // Update the master's local state with the downed machines.
       foreach (const MachineID& id, ids.values()) {
         master->machines[id].info.set_mode(MachineInfo::DOWN);
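
Two patterns in the hunk above are worth spelling out: a best-effort
`ShutdownMessage` paired with authoritative local removal (so `TASK_LOST`
updates and `LostSlaveMessage` messages still go out if the shutdown message
is dropped), and iterating over a copy of a container that the loop body
mutates. Below is a minimal standalone sketch of both patterns; the names are
illustrative stand-ins, not Mesos code.

    #include <iostream>
    #include <map>
    #include <string>

    struct Slave { std::string id; };

    // Best-effort notification: delivery is not guaranteed, so nothing
    // below may depend on the slave acting on it.
    void sendShutdown(const Slave& slave) {
      std::cout << "ShutdownMessage -> " << slave.id << std::endl;
    }

    // Authoritative local cleanup: emits the loss notifications itself,
    // so correctness never hinges on the message above being delivered.
    void removeSlave(std::map<std::string, Slave>& registered,
                     const std::string& id) {
      std::cout << "TASK_LOST + LostSlaveMessage for " << id << std::endl;
      registered.erase(id);
    }

    int main() {
      std::map<std::string, Slave> registered = {
        {"s1", Slave{"s1"}}, {"s2", Slave{"s2"}}};

      // Iterate over a copy, because removeSlave() mutates 'registered';
      // this mirrors the utils::copy() call in the hunk above.
      const std::map<std::string, Slave> snapshot = registered;
      for (const auto& entry : snapshot) {
        sendShutdown(entry.second);            // Best effort.
        removeSlave(registered, entry.first);  // The actual guarantee.
      }

      return 0;
    }

The key property is that the local cleanup, not the message, is what
guarantees the loss notifications reach the frameworks.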

http://git-wip-us.apache.org/repos/asf/mesos/blob/147420e3/src/tests/master_maintenance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_maintenance_tests.cpp b/src/tests/master_maintenance_tests.cpp
index 4a59389..6ae502d 100644
--- a/src/tests/master_maintenance_tests.cpp
+++ b/src/tests/master_maintenance_tests.cpp
@@ -564,6 +564,120 @@ TEST_F(MasterMaintenanceTest, PreV1SchedulerSupport)
 }
 
 
+// This test ensures that slaves receive a shutdown message from the master
+// when maintenance is started, and that frameworks receive `TASK_LOST` status
+// updates for the tasks that were running on those slaves.
+TEST_F(MasterMaintenanceTest, EnterMaintenanceMode)
+{
+  Try<PID<Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+
+  Try<PID<Slave>> slave = StartSlave(&exec);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .Times(1);
+
+  // Launch a task.
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 64, "*"))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  EXPECT_CALL(exec, registered(_, _, _, _))
+    .Times(1);
+
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+  EXPECT_CALL(exec, shutdown(_))
+    .Times(AtMost(1));
+
+  EXPECT_CALL(sched, offerRescinded(&driver, _))
+    .WillRepeatedly(Return()); // Ignore rescinds.
+
+  // Collect the status updates to verify the task is running and then lost.
+  Future<TaskStatus> startStatus, lostStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&startStatus))
+    .WillOnce(FutureArg<1>(&lostStatus));
+
+  // Start the test.
+  driver.start();
+
+  // Wait until the task is running before scheduling the maintenance.
+  AWAIT_READY(startStatus);
+  EXPECT_EQ(TASK_RUNNING, startStatus.get().state());
+
+  // Schedule this slave for maintenance.
+  MachineID machine;
+  machine.set_hostname(maintenanceHostname);
+  machine.set_ip(stringify(slave.get().address.ip));
+
+  // TODO(jmlvanre): Replace Time(0.0) with `Clock::now()` once JSON double
+  // conversion is fixed. For now using a rounded time avoids the issue.
+  const Time start = Time::create(0.0).get() + Seconds(60);
+  const Duration duration = Seconds(120);
+  const Unavailability unavailability = createUnavailability(start, duration);
+
+  // Post a valid schedule with one machine.
+  maintenance::Schedule schedule = createSchedule(
+      {createWindow({machine}, unavailability)});
+
+  // We have a few seconds between the first set of offers and the next
+  // allocation of offers.  This should be enough time to perform a maintenance
+  // schedule update.  This update will also trigger the rescinding of offers
+  // from the scheduled slave.
+  Future<Response> response =
+    process::http::post(
+        master.get(),
+        "maintenance/schedule",
+        headers,
+        stringify(JSON::Protobuf(schedule)));
+
+  AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+
+  // Verify that the master forces the slave to be shut down after the
+  // maintenance is started.
+  Future<ShutdownMessage> shutdownMessage =
+    FUTURE_PROTOBUF(ShutdownMessage(), master.get(), slave.get());
+
+  // Verify that the framework will be informed that the slave is lost.
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  // Start the maintenance.
+  response =
+    process::http::post(
+        master.get(),
+        "machine/down",
+        headers,
+        stringify(JSON::Protobuf(createMachineList({machine}))));
+
+  AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
+
+  // Wait for the slave to be shut down.
+  AWAIT_READY(shutdownMessage);
+
+  // Verify that we received a TASK_LOST.
+  AWAIT_READY(lostStatus);
+  EXPECT_EQ(TASK_LOST, lostStatus.get().state());
+
+  // Verify that the framework received the slave lost message.
+  AWAIT_READY(slaveLost);
+
+  driver.stop();
+  driver.join();
+
+  Shutdown(); // Must shutdown before 'exec' gets deallocated.
+}
+
+
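
Stripped of the mock expectations, the operator-facing sequence this test
drives is just two HTTP POSTs against the master. The condensed sketch below
reuses `master`, `headers`, `machine`, `start`, `duration`, and the schedule
helpers exactly as they are set up in the test above, so it is not standalone:

    // Step 1: declare the maintenance window for the machine.
    maintenance::Schedule schedule = createSchedule(
        {createWindow({machine}, createUnavailability(start, duration))});

    Future<Response> response = process::http::post(
        master.get(),
        "maintenance/schedule",
        headers,
        stringify(JSON::Protobuf(schedule)));

    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);

    // Step 2: start the maintenance. With this commit, the master now
    // shuts down and removes every slave on the listed machines.
    response = process::http::post(
        master.get(),
        "machine/down",
        headers,
        stringify(JSON::Protobuf(createMachineList({machine}))));

    AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
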
 // Posts valid and invalid machines to the maintenance start endpoint.
 TEST_F(MasterMaintenanceTest, BringDownMachines)
 {