You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/10/25 19:21:47 UTC

[1/4] mesos git commit: Avoided passing `TimeInfo` by value.

Repository: mesos
Updated Branches:
  refs/heads/1.1.x c2cb47f78 -> e1b6b2145


Avoided passing `TimeInfo` by value.

Although this is likely to remain small in practice, passing by const
reference should be preferred until there is a reason not to.

Review: https://reviews.apache.org/r/53099/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8cf2ca87
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8cf2ca87
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8cf2ca87

Branch: refs/heads/1.1.x
Commit: 8cf2ca8703d3b776fdbdaac2979cbd3ea40873ad
Parents: c2cb47f
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Oct 21 14:18:46 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Tue Oct 25 12:19:11 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 4 ++--
 src/master/master.hpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/8cf2ca87/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3c6b18e..7407bc9 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5937,7 +5937,7 @@ void Master::markUnreachable(const SlaveID& slaveId)
 
 void Master::_markUnreachable(
     Slave* slave,
-    TimeInfo unreachableTime,
+    const TimeInfo& unreachableTime,
     const Future<bool>& registrarResult)
 {
   CHECK_NOTNULL(slave);
@@ -6281,7 +6281,7 @@ void Master::_reconcileTasks(
       // does not have the PARTITION_AWARE capability, send TASK_LOST
       // for backward compatibility. In either case, the status update
       // also includes the time when the slave was marked unreachable.
-      TimeInfo unreachableTime = slaves.unreachable[slaveId.get()];
+      const TimeInfo& unreachableTime = slaves.unreachable[slaveId.get()];
 
       TaskState taskState = TASK_UNREACHABLE;
       if (!protobuf::frameworkHasCapability(

http://git-wip-us.apache.org/repos/asf/mesos/blob/8cf2ca87/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 881f0d6..6e9865a 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -682,7 +682,7 @@ protected:
 
   void _markUnreachable(
       Slave* slave,
-      TimeInfo unreachableTime,
+      const TimeInfo& unreachableTime,
       const process::Future<bool>& registrarResult);
 
   // Mark a slave as unreachable in the registry. Called when the slave
@@ -1955,7 +1955,7 @@ private:
 class MarkSlaveUnreachable : public Operation
 {
 public:
-  MarkSlaveUnreachable(const SlaveInfo& _info, TimeInfo _unreachableTime)
+  MarkSlaveUnreachable(const SlaveInfo& _info, const TimeInfo& _unreachableTime)
     : info(_info), unreachableTime(_unreachableTime) {
     CHECK(info.has_id()) << "SlaveInfo is missing the 'id' field";
   }


[2/4] mesos git commit: Fixed bug when marking agents unreachable after master failover.

Posted by vi...@apache.org.
Fixed bug when marking agents unreachable after master failover.

If the master fails over and an agent does not re-register within the
`agent_reregister_timeout`, the master marks the agent as unreachable in
the registry and sends `slaveLost` for it. However, we neglected to
update the master's in-memory state for the newly unreachable agent;
this meant that task reconciliation would return incorrect results
(until/unless the next master failover).

Review: https://reviews.apache.org/r/53097/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0abd9510
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0abd9510
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0abd9510

Branch: refs/heads/1.1.x
Commit: 0abd9510a0dba87d1a791d8751e5ccdbb02784db
Parents: 8cf2ca8
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Oct 21 14:18:52 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Tue Oct 25 12:19:22 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp      |   4 ++
 src/master/master.hpp      |   1 +
 src/tests/master_tests.cpp | 148 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 152 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/0abd9510/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 7407bc9..0d50268 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1887,6 +1887,7 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
     .onAny(defer(self(),
                  &Self::_markUnreachableAfterFailover,
                  slave.info(),
+                 unreachableTime,
                  lambda::_1));
 
   return Nothing();
@@ -1895,6 +1896,7 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 
 void Master::_markUnreachableAfterFailover(
     const SlaveInfo& slaveInfo,
+    const TimeInfo& unreachableTime,
     const Future<bool>& registrarResult)
 {
   CHECK(slaves.markingUnreachable.contains(slaveInfo.id()));
@@ -1920,6 +1922,8 @@ void Master::_markUnreachableAfterFailover(
   ++metrics->slave_removals_reason_unhealthy;
   ++metrics->recovery_slave_removals;
 
+  slaves.unreachable[slaveInfo.id()] = unreachableTime;
+
   sendSlaveLost(slaveInfo);
 }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/0abd9510/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 6e9865a..951e9b2 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -691,6 +691,7 @@ protected:
 
   void _markUnreachableAfterFailover(
       const SlaveInfo& slaveInfo,
+      const TimeInfo& unreachableTime,
       const process::Future<bool>& registrarResult);
 
   void sendSlaveLost(const SlaveInfo& slaveInfo);

http://git-wip-us.apache.org/repos/asf/mesos/blob/0abd9510/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 88cf1e6..88cc6af 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1932,7 +1932,7 @@ TEST_F(MasterTest, RecoveredSlaveCanReregister)
   AWAIT_READY(registered);
 
   // Step 6: Advance the clock until the re-registration timeout
-  // elapses, and expect the slave / task to be lost!
+  // elapses, and expect the slave to be lost!
   Future<Nothing> slaveLost;
   EXPECT_CALL(sched, slaveLost(&driver, _))
     .WillOnce(FutureSatisfy(&slaveLost));
@@ -1973,6 +1973,152 @@ TEST_F(MasterTest, RecoveredSlaveCanReregister)
 }
 
 
+// This test ensures that when a master fails over and an agent does
+// not reregister within the `agent_reregister_timeout`, the agent is
+// marked unreachable; the framework should NOT receive a status
+// update for any tasks running on the agent, but reconciliation
+// should indicate the agent is unreachable.
+TEST_F(MasterTest, UnreachableTaskAfterFailover)
+{
+  // Step 1: Start a master.
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Step 2: Start a slave.
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  StandaloneMasterDetector slaveDetector(master.get()->pid);
+  Try<Owned<cluster::Slave>> slave = StartSlave(&slaveDetector);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(slaveRegisteredMessage);
+  const SlaveID slaveId = slaveRegisteredMessage.get().slave_id();
+
+  // Step 3: Start a scheduler.
+  StandaloneMasterDetector schedDetector(master.get()->pid);
+  MockScheduler sched;
+  TestingMesosSchedulerDriver driver(
+      &sched, &schedDetector, DEFAULT_FRAMEWORK_INFO);
+
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .Times(1);
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  TaskInfo task = createTask(offers.get()[0], "sleep 100");
+
+  Future<TaskStatus> runningStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&runningStatus));
+
+  driver.launchTasks(offers.get()[0].id(), {task});
+
+  AWAIT_READY(runningStatus);
+  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());
+  EXPECT_EQ(task.task_id(), runningStatus.get().task_id());
+
+  // Step 4: Simulate master failover. We leave the slave without a
+  // master so it does not attempt to re-register.
+  slaveDetector.appoint(None());
+
+  master->reset();
+  master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Cause the scheduler to re-register with the master.
+  Future<Nothing> disconnected;
+  EXPECT_CALL(sched, disconnected(&driver))
+    .WillOnce(FutureSatisfy(&disconnected));
+
+  Future<Nothing> registered;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureSatisfy(&registered));
+
+  schedDetector.appoint(master.get()->pid);
+
+  AWAIT_READY(disconnected);
+  AWAIT_READY(registered);
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  // Trigger the slave re-registration timeout.
+  Clock::pause();
+  Clock::advance(masterFlags.agent_reregister_timeout);
+  TimeInfo unreachableTime = protobuf::getCurrentTime();
+
+  // We expect to get a `slaveLost` signal; we do NOT expect to get a
+  // status update for the task that was running on the slave.
+  AWAIT_READY(slaveLost);
+
+  // Reconciliation should return TASK_LOST, with `unreachable_time`
+  // equal to the time when the re-registration timeout fired.
+  TaskStatus status;
+  status.mutable_task_id()->CopyFrom(task.task_id());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  Future<TaskStatus> reconcileUpdate1;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&reconcileUpdate1));
+
+  driver.reconcileTasks({status});
+
+  AWAIT_READY(reconcileUpdate1);
+  EXPECT_EQ(TASK_LOST, reconcileUpdate1.get().state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate1.get().reason());
+  EXPECT_EQ(unreachableTime, reconcileUpdate1.get().unreachable_time());
+
+  // Cause the slave to re-register with the master.
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
+
+  slaveDetector.appoint(master.get()->pid);
+
+  AWAIT_READY(slaveReregisteredMessage);
+
+  // The task should have returned to TASK_RUNNING. This is true even
+  // for non-partition-aware frameworks, since we emulate the old
+  // "non-strict registry" semantics.
+  Future<TaskStatus> reconcileUpdate2;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&reconcileUpdate2));
+
+  driver.reconcileTasks({status});
+
+  AWAIT_READY(reconcileUpdate2);
+  EXPECT_EQ(TASK_RUNNING, reconcileUpdate2.get().state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate2.get().reason());
+
+  Clock::resume();
+
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(0, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(0, stats.values["master/tasks_unreachable"]);
+  EXPECT_EQ(1, stats.values["master/tasks_running"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_scheduled"]);
+  EXPECT_EQ(1, stats.values["master/slave_unreachable_completed"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals"]);
+  EXPECT_EQ(1, stats.values["master/slave_removals/reason_unhealthy"]);
+  EXPECT_EQ(0, stats.values["master/slave_removals/reason_unregistered"]);
+  EXPECT_EQ(1, stats.values["master/recovery_slave_removals"]);
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test ensures that slave removals during master recovery
 // are rate limited.
 TEST_F(MasterTest, RateLimitRecoveredSlaveRemoval)


[3/4] mesos git commit: Tweaked test expectation.

Posted by vi...@apache.org.
Tweaked test expectation.

`WillOnce` is more accurate than `WillRepeatedly`.

Review: https://reviews.apache.org/r/53098/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c6516be5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c6516be5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c6516be5

Branch: refs/heads/1.1.x
Commit: c6516be5df87e3fc8bea67f4dc74bc6a4743147d
Parents: 0abd951
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Oct 21 14:18:59 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Tue Oct 25 12:19:33 2016 -0700

----------------------------------------------------------------------
 src/tests/master_tests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c6516be5/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 88cc6af..62d24b0 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -2169,7 +2169,7 @@ TEST_F(MasterTest, RateLimitRecoveredSlaveRemoval)
 
   Future<Nothing> slaveLost;
   EXPECT_CALL(sched, slaveLost(&driver, _))
-    .WillRepeatedly(FutureSatisfy(&slaveLost));
+    .WillOnce(FutureSatisfy(&slaveLost));
 
   driver.start();
 


[4/4] mesos git commit: Added MESOS-6445 to CHANGELOG for 1.1.0.

Posted by vi...@apache.org.
Added MESOS-6445 to CHANGELOG for 1.1.0.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e1b6b214
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e1b6b214
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e1b6b214

Branch: refs/heads/1.1.x
Commit: e1b6b2145a82a0f21aed82fc335ea09965b2f2dc
Parents: c6516be
Author: Vinod Kone <vi...@gmail.com>
Authored: Tue Oct 25 12:20:39 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Tue Oct 25 12:21:31 2016 -0700

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e1b6b214/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 0d92452..9d41a4e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -357,6 +357,7 @@ All Issues:
   * [MESOS-6312] - Update CHANGELOG to mention addition of agent '--runtime_dir' flag.
   * [MESOS-6344] - Allow `network/cni` isolator to take a search path for CNI plugins instead of single directory.
   * [MESOS-6408] - Changelog for `mesos-cni-port-mapper` to 1.1.0.
+  * [MESOS-6445] - Reconciliation for unreachable agent after master failover is incorrect 
 
 ** Wish
   * [MESOS-5929] - Total cluster resources on master Mesos UI should have better spacing.