You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/02/09 01:52:47 UTC

mesos git commit: Returned "ServiceUnavailable" for slave's /state during recovery.

Repository: mesos
Updated Branches:
  refs/heads/master 748a60e00 -> 83b43d91a


Returned "ServiceUnavailable" for slave's /state during recovery.

This will help operators hitting the /state endpoint by not returning incomplete
data when slave is still recoverying.

Review: https://reviews.apache.org/r/43267


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/83b43d91
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/83b43d91
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/83b43d91

Branch: refs/heads/master
Commit: 83b43d91adb720f0b7d25816155f5c71240c4164
Parents: 748a60e
Author: Vinod Kone <vi...@gmail.com>
Authored: Fri Feb 5 12:07:20 2016 -0800
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Feb 8 16:52:27 2016 -0800

----------------------------------------------------------------------
 CHANGELOG                 |  9 +++++
 src/slave/http.cpp        |  4 ++
 src/tests/slave_tests.cpp | 89 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 82c1be6..fc387dc 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,12 @@
+
+Release Notes - Mesos - Version 0.28.0 (WIP)
+--------------------------------------------
+
+** API Changes:
+  * [MESOS-4066] - Agent should not return partial state when a request is made
+                   to /state endpoint during recovery.
+
+
 Release Notes - Mesos - Version 0.27.0
 --------------------------------------------
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/src/slave/http.cpp
----------------------------------------------------------------------
diff --git a/src/slave/http.cpp b/src/slave/http.cpp
index 9167030..523e8dc 100644
--- a/src/slave/http.cpp
+++ b/src/slave/http.cpp
@@ -399,6 +399,10 @@ string Slave::Http::STATE_HELP() {
 
 Future<Response> Slave::Http::state(const Request& request) const
 {
+  if (slave->state == Slave::RECOVERING) {
+    return ServiceUnavailable("Agent has not finished recovery");
+  }
+
   JSON::Object object;
   object.values["version"] = MESOS_VERSION;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/83b43d91/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 2a34d54..0884ee5 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -75,6 +75,7 @@ using process::UPID;
 
 using process::http::OK;
 using process::http::Response;
+using process::http::ServiceUnavailable;
 
 using std::map;
 using std::shared_ptr;
@@ -1184,9 +1185,15 @@ TEST_F(SlaveTest, StateEndpoint)
   // Capture the start time deterministically.
   Clock::pause();
 
+  Future<Nothing> __recover = FUTURE_DISPATCH(_, &Slave::__recover);
+
   Try<PID<Slave>> slave = StartSlave(&containerizer, flags);
   ASSERT_SOME(slave);
 
+  // Ensure slave has finished recovery.
+  AWAIT_READY(__recover);
+  Clock::settle();
+
   Future<Response> response = process::http::get(slave.get(), "state");
 
   AWAIT_EXPECT_RESPONSE_STATUS_EQ(OK().status, response);
@@ -1320,6 +1327,88 @@ TEST_F(SlaveTest, StateEndpoint)
 }
 
 
+// This test checks that when a slave is in RECOVERING state it responds
+// to HTTP requests for "/state" endpoint with ServiceUnavailable.
+TEST_F(SlaveTest, StateEndpointUnavailableDuringRecovery)
+{
+  Try<PID<Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+
+  TestContainerizer* containerizer1 = new TestContainerizer(&exec);
+
+  slave::Flags flags = CreateSlaveFlags();
+
+  Try<PID<Slave>> slave = StartSlave(containerizer1, flags);
+  ASSERT_SOME(slave);
+
+  // Launch a task so that slave has something to recover after restart.
+  MockScheduler sched;
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .Times(1);
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  EXPECT_CALL(exec, registered(_, _, _, _));
+
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status))
+    .WillRepeatedly(Return()); // Ignore subsequent updates.
+
+  driver.start();
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_RUNNING, status.get().state());
+
+  // Need this expectation here because `TestContainerizer` doesn't do recovery
+  // and hence sets `MESOS_RECOVERY_TIMEOUT` as '0s' causing the executor driver
+  // to exit immediately after slave exit.
+  EXPECT_CALL(exec, shutdown(_))
+    .Times(AtMost(1));
+
+  // Restart the slave.
+  Stop(slave.get());
+  delete containerizer1;
+
+  // Pause the clock to keep slave in RECOVERING state.
+  Clock::pause();
+
+  Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover);
+
+  TestContainerizer containerizer2;
+
+  slave = StartSlave(&containerizer2, flags);
+  ASSERT_SOME(slave);
+
+  // Ensure slave has setup the route for "/state".
+  AWAIT_READY(_recover);
+
+  Future<Response> response = process::http::get(slave.get(), "state");
+
+  AWAIT_EXPECT_RESPONSE_STATUS_EQ(ServiceUnavailable().status, response);
+
+  driver.stop();
+  driver.join();
+
+  Shutdown();
+}
+
+
 // This test ensures that when a slave is shutting down, it will not
 // try to re-register with the master.
 TEST_F(SlaveTest, DISABLED_TerminatingSlaveDoesNotReregister)