You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/10/19 23:33:51 UTC

[01/10] mesos git commit: Avoided using SIGUSR1 in two test cases.

Repository: mesos
Updated Branches:
  refs/heads/master 5e850a362 -> bf00061b5


Avoided using SIGUSR1 in two test cases.

We want to cause the agent to shutdown gracefully (i.e., to send an
"unregister" message to the master). This can be accomplished by sending
the whole process a SIGUSR1 but that seems fragile; using the agent's
`shutdown()` method seems more robust.

Review: https://reviews.apache.org/r/52039/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2c15661e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2c15661e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2c15661e

Branch: refs/heads/master
Commit: 2c15661e58ef34d1496ac11745cd434fe7b8d8b0
Parents: 5e850a3
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:32 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:32 2016 -0700

----------------------------------------------------------------------
 src/tests/slave_tests.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2c15661e/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 93b81d3..ab46293 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -2854,16 +2854,16 @@ TEST_F(SlaveTest, HealthCheckUnregisterRace)
   EXPECT_CALL(sched, slaveLost(&driver, _))
     .WillOnce(FutureSatisfy(&slaveLost));
 
-  // Cause the slave to shutdown gracefully by sending it SIGUSR1.
-  // This should result in the slave sending `UnregisterSlaveMessage`
-  // to the master.
+  // Cause the slave to shutdown gracefully. This should result in
+  // the slave sending `UnregisterSlaveMessage` to the master.
   Future<UnregisterSlaveMessage> unregisterSlaveMessage =
     FUTURE_PROTOBUF(
         UnregisterSlaveMessage(),
         slave.get()->pid,
         master.get()->pid);
 
-  kill(getpid(), SIGUSR1);
+  slave.get()->shutdown();
+  slave->reset();
 
   AWAIT_READY(unregisterSlaveMessage);
   AWAIT_READY(slaveLost);
@@ -2966,10 +2966,10 @@ TEST_F(SlaveTest, UnreachableThenUnregisterRace)
       dynamic_cast<master::MarkSlaveUnreachable*>(
           markUnreachable.get().get()));
 
-  // Cause the slave to shutdown gracefully by sending it SIGUSR1.
-  // This should result in the slave sending `UnregisterSlaveMessage`
-  // to the master. Normally, the master would then remove the slave
-  // from the registry, but since the slave is already being marked
+  // Cause the slave to shutdown gracefully.  This should result in
+  // the slave sending `UnregisterSlaveMessage` to the master.
+  // Normally, the master would then remove the slave from the
+  // registry, but since the slave is already being marked
   // unreachable, the master should ignore the unregister message.
   Future<UnregisterSlaveMessage> unregisterSlaveMessage =
     FUTURE_PROTOBUF(
@@ -2980,7 +2980,8 @@ TEST_F(SlaveTest, UnreachableThenUnregisterRace)
   EXPECT_CALL(*master.get()->registrar.get(), apply(_))
     .Times(0);
 
-  kill(getpid(), SIGUSR1);
+  slave.get()->shutdown();
+  slave->reset();
 
   AWAIT_READY(unregisterSlaveMessage);
 


[05/10] mesos git commit: Changed scheduler driver to send TASK_DROPPED.

Posted by vi...@apache.org.
Changed scheduler driver to send TASK_DROPPED.

If a scheduler tries to launch a task when the scheduler driver is not
connected to the master, the scheduler driver creates a faux TASK_LOST
status update to indicate that the task launch has not succeeded. If the
framework is PARTITION_AWARE, the scheduler driver will now send
TASK_DROPPED instead.

Review: https://reviews.apache.org/r/52658/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f8a0c28b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f8a0c28b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f8a0c28b

Branch: refs/heads/master
Commit: f8a0c28b5f7a8cb86432882f65440be32a052764
Parents: 1a3e931
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:58 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:58 2016 -0700

----------------------------------------------------------------------
 src/sched/sched.cpp                 | 13 +++++--
 src/tests/fault_tolerance_tests.cpp | 63 ++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 9d1b5ce..6a44d57 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -1300,8 +1300,15 @@ protected:
     if (!connected) {
       VLOG(1) << "Ignoring accept offers message as master is disconnected";
 
-      // NOTE: Reply to the framework with TASK_LOST messages for each
-      // task launch. See details from notes in launchTasks.
+      // Reply to the framework with TASK_DROPPED messages for each
+      // task launch. If the framework is not partition-aware, we send
+      // TASK_LOST instead. See details from notes in `launchTasks`.
+      TaskState newTaskState = TASK_DROPPED;
+      if (!protobuf::frameworkHasCapability(
+              framework, FrameworkInfo::Capability::PARTITION_AWARE)) {
+        newTaskState = TASK_LOST;
+      }
+
       foreach (const Offer::Operation& operation, operations) {
         if (operation.type() != Offer::Operation::LAUNCH) {
           continue;
@@ -1312,7 +1319,7 @@ protected:
               framework.id(),
               None(),
               task.task_id(),
-              TASK_LOST,
+              newTaskState,
               TaskStatus::SOURCE_MASTER,
               None(),
               "Master disconnected",

http://git-wip-us.apache.org/repos/asf/mesos/blob/f8a0c28b/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index e15bf8d..95ac98c 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -914,6 +914,69 @@ TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchLost)
 }
 
 
+// This test checks that if a partition-aware scheduler that is
+// disconnected from the master attempts to launch a task, it receives
+// a TASK_DROPPED status update.
+TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchDropped)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  StandaloneMasterDetector detector(master.get()->pid);
+  Try<Owned<cluster::Slave>> slave = StartSlave(&detector);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  TestingMesosSchedulerDriver driver(&sched, &detector, frameworkInfo);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  Future<FrameworkRegisteredMessage> message =
+    FUTURE_PROTOBUF(FrameworkRegisteredMessage(), _, _);
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  AWAIT_READY(message);
+
+  Future<Nothing> disconnected;
+  EXPECT_CALL(sched, disconnected(&driver))
+    .WillOnce(FutureSatisfy(&disconnected));
+
+  // Simulate a spurious master loss event at the scheduler.
+  detector.appoint(None());
+
+  AWAIT_READY(disconnected);
+
+  TaskInfo task = createTask(offers.get()[0], "sleep 60");
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  driver.launchTasks(offers.get()[0].id(), {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_DROPPED, status.get().state());
+  EXPECT_EQ(TaskStatus::REASON_MASTER_DISCONNECTED, status.get().reason());
+  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test checks that a failover scheduler gets the
 // retried status update.
 TEST_F(FaultToleranceTest, SchedulerFailoverStatusUpdate)


[08/10] mesos git commit: Changed master to send TASK_UNKNOWN during reconciliation.

Posted by vi...@apache.org.
Changed master to send TASK_UNKNOWN during reconciliation.

Previously, the master would send TASK_LOST in response to explicit
reconciliation requests for (a) unknown tasks at registered slaves and
(b) tasks at unknown slaves (neither registered nor unreachable). The
master will now send TASK_UNKNOWN for these situations if the framework
has the PARTITION_AWARE capability.

Review: https://reviews.apache.org/r/52693/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/97bd957f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/97bd957f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/97bd957f

Branch: refs/heads/master
Commit: 97bd957ffc45b3484dd82321c5e7fe7f02f9d79c
Parents: 5082181
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:32:14 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:32:14 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp              | 36 ++++++++++------
 src/tests/partition_tests.cpp      | 14 +++----
 src/tests/reconciliation_tests.cpp | 73 ++++++++++++++++++++++++++++++---
 3 files changed, 99 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/97bd957f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 2fc41f5..bf6bb1a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6248,17 +6248,13 @@ void Master::_reconcileTasks(
   // Explicit reconciliation occurs for the following cases:
   //   (1) Task is known, but pending: TASK_STAGING.
   //   (2) Task is known: send the latest state.
-  //   (3) Task is unknown, slave is registered: TASK_LOST.
+  //   (3) Task is unknown, slave is registered: TASK_UNKNOWN.
   //   (4) Task is unknown, slave is transitioning: no-op.
   //   (5) Task is unknown, slave is unreachable: TASK_UNREACHABLE.
-  //   (6) Task is unknown, slave is unknown: TASK_LOST.
+  //   (6) Task is unknown, slave is unknown: TASK_UNKNOWN.
   //
-  // When using a non-strict registry, case (6) may result in
-  // a TASK_LOST for a task that may later be non-terminal. This
-  // is better than no reply at all because the framework can take
-  // action for TASK_LOST. Later, if the task is running, the
-  // framework can discover it with implicit reconciliation and will
-  // be able to kill it.
+  // For cases (3), (5), and (6), TASK_LOST is sent instead if the
+  // framework has not opted-in to the PARTITION_AWARE capability.
   foreach (const TaskStatus& status, statuses) {
     Option<SlaveID> slaveId = None();
     if (status.has_slave_id()) {
@@ -6304,12 +6300,20 @@ void Master::_reconcileTasks(
           None(),
           protobuf::getTaskContainerStatus(*task));
     } else if (slaveId.isSome() && slaves.registered.contains(slaveId.get())) {
-      // (3) Task is unknown, slave is registered: TASK_LOST.
+      // (3) Task is unknown, slave is registered: TASK_UNKNOWN. If
+      // the framework does not have the PARTITION_AWARE capability,
+      // send TASK_LOST for backward compatibility.
+      TaskState taskState = TASK_UNKNOWN;
+      if (!protobuf::frameworkHasCapability(
+              framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+        taskState = TASK_LOST;
+      }
+
       update = protobuf::createStatusUpdate(
           framework->id(),
           slaveId.get(),
           status.task_id(),
-          TASK_LOST,
+          taskState,
           TaskStatus::SOURCE_MASTER,
           None(),
           "Reconciliation: Task is unknown to the agent",
@@ -6347,12 +6351,20 @@ void Master::_reconcileTasks(
           None(),
           unreachableTime);
     } else {
-      // (6) Task is unknown, slave is unknown: TASK_LOST.
+      // (6) Task is unknown, slave is unknown: TASK_UNKNOWN. If the
+      // framework does not have the PARTITION_AWARE capability, send
+      // TASK_LOST for backward compatibility.
+      TaskState taskState = TASK_UNKNOWN;
+      if (!protobuf::frameworkHasCapability(
+              framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+        taskState = TASK_LOST;
+      }
+
       update = protobuf::createStatusUpdate(
           framework->id(),
           slaveId,
           status.task_id(),
-          TASK_LOST,
+          taskState,
           TaskStatus::SOURCE_MASTER,
           None(),
           "Reconciliation: Task is unknown",

http://git-wip-us.apache.org/repos/asf/mesos/blob/97bd957f/src/tests/partition_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/partition_tests.cpp b/src/tests/partition_tests.cpp
index 12fe859..5a0d4bd 100644
--- a/src/tests/partition_tests.cpp
+++ b/src/tests/partition_tests.cpp
@@ -1603,7 +1603,7 @@ TEST_F(PartitionTest, RegistryGcByCount)
   driver.reconcileTasks({status2});
 
   AWAIT_READY(reconcileUpdate2);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate2.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate2.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate2.get().reason());
   EXPECT_FALSE(reconcileUpdate2.get().has_unreachable_time());
 
@@ -1746,7 +1746,7 @@ TEST_F(PartitionTest, RegistryGcByCountManySlaves)
   driver.reconcileTasks({status2});
 
   AWAIT_READY(reconcileUpdate2);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate2.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate2.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate2.get().reason());
   EXPECT_FALSE(reconcileUpdate2.get().has_unreachable_time());
 
@@ -1965,7 +1965,7 @@ TEST_F(PartitionTest, RegistryGcByAge)
   driver.reconcileTasks({status3});
 
   AWAIT_READY(reconcileUpdate3);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate3.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate3.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate3.get().reason());
   EXPECT_FALSE(reconcileUpdate3.get().has_unreachable_time());
 
@@ -2004,7 +2004,7 @@ TEST_F(PartitionTest, RegistryGcByAge)
   driver.reconcileTasks({status5});
 
   AWAIT_READY(reconcileUpdate5);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate5.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate5.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate5.get().reason());
   EXPECT_FALSE(reconcileUpdate5.get().has_unreachable_time());
 
@@ -2022,7 +2022,7 @@ TEST_F(PartitionTest, RegistryGcByAge)
 // configure GC to only keep a single agent. Concurrently with GC
 // running, we arrange for one of those agents to reregister with the
 // master.
-TEST_F(PartitionTest, RegistryGcRace2)
+TEST_F(PartitionTest, RegistryGcRace)
 {
   master::Flags masterFlags = CreateMasterFlags();
   masterFlags.registry_max_agent_count = 1;
@@ -2251,7 +2251,7 @@ TEST_F(PartitionTest, RegistryGcRace2)
   driver.reconcileTasks({status1});
 
   AWAIT_READY(reconcileUpdate1);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate1.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate1.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate1.get().reason());
   EXPECT_FALSE(reconcileUpdate1.get().has_unreachable_time());
 
@@ -2267,7 +2267,7 @@ TEST_F(PartitionTest, RegistryGcRace2)
   driver.reconcileTasks({status2});
 
   AWAIT_READY(reconcileUpdate2);
-  EXPECT_EQ(TASK_LOST, reconcileUpdate2.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, reconcileUpdate2.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, reconcileUpdate2.get().reason());
   EXPECT_FALSE(reconcileUpdate2.get().has_unreachable_time());
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/97bd957f/src/tests/reconciliation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/reconciliation_tests.cpp b/src/tests/reconciliation_tests.cpp
index d2bba48..71073a0 100644
--- a/src/tests/reconciliation_tests.cpp
+++ b/src/tests/reconciliation_tests.cpp
@@ -230,8 +230,8 @@ TEST_F(ReconciliationTest, TaskStateMatch)
 
 
 // This test verifies that reconciliation of a task that belongs to an
-// unknown slave results in TASK_LOST, even if the framework has
-// enabled the PARTITION_AWARE capability.
+// unknown slave results in TASK_UNKNOWN if the framework has enabled
+// the PARTITION_AWARE capability.
 TEST_F(ReconciliationTest, UnknownSlave)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
@@ -266,9 +266,9 @@ TEST_F(ReconciliationTest, UnknownSlave)
 
   driver.reconcileTasks({status});
 
-  // Framework should receive TASK_LOST because the slave is unknown.
+  // Framework should receive TASK_UNKNOWN because the slave is unknown.
   AWAIT_READY(update);
-  EXPECT_EQ(TASK_LOST, update.get().state());
+  EXPECT_EQ(TASK_UNKNOWN, update.get().state());
   EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, update.get().reason());
   EXPECT_FALSE(update.get().has_unreachable_time());
 
@@ -278,7 +278,8 @@ TEST_F(ReconciliationTest, UnknownSlave)
 
 
 // This test verifies that reconciliation of an unknown task that
-// belongs to a known slave results in TASK_LOST.
+// belongs to a known slave results in TASK_LOST if the framework is
+// not partition-aware.
 TEST_F(ReconciliationTest, UnknownTask)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
@@ -334,6 +335,68 @@ TEST_F(ReconciliationTest, UnknownTask)
 }
 
 
+// This test verifies that reconciliation of an unknown task that
+// belongs to a known slave results in TASK_UNKNOWN if the framework
+// is partition-aware.
+TEST_F(ReconciliationTest, UnknownTaskPartitionAware)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  // Wait for the slave to register and get the slave id.
+  AWAIT_READY(slaveRegisteredMessage);
+  const SlaveID slaveId = slaveRegisteredMessage.get().slave_id();
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+    &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillRepeatedly(Return()); // Ignore offers.
+
+  driver.start();
+
+  // Wait until the framework is registered.
+  AWAIT_READY(frameworkId);
+
+  Future<TaskStatus> update;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&update));
+
+  // Create a task status with a random task id.
+  TaskStatus status;
+  status.mutable_task_id()->set_value(UUID::random().toString());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  driver.reconcileTasks({status});
+
+  // Framework should receive TASK_UNKNOWN for an unknown task.
+  AWAIT_READY(update);
+  EXPECT_EQ(TASK_UNKNOWN, update.get().state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, update.get().reason());
+  EXPECT_FALSE(update.get().has_unreachable_time());
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test verifies that the killTask request of an unknown task
 // results in reconciliation. In this case, the task is unknown
 // and there are no transitional slaves.


[04/10] mesos git commit: Clarified a comment that occurs in several tests.

Posted by vi...@apache.org.
Clarified a comment that occurs in several tests.

Review: https://reviews.apache.org/r/52657/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/1a3e931f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/1a3e931f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/1a3e931f

Branch: refs/heads/master
Commit: 1a3e931fc671ba063c7eac8f43cbf28b4c9f727b
Parents: 0bef37a
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:51 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:51 2016 -0700

----------------------------------------------------------------------
 src/tests/master_authorization_tests.cpp | 12 ++++++------
 src/tests/master_validation_tests.cpp    |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/1a3e931f/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index a4623d1..a53e270 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -233,7 +233,7 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTask)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -343,7 +343,7 @@ TEST_F(MasterAuthorizationTest, UnauthorizedTaskGroup)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -426,7 +426,7 @@ TEST_F(MasterAuthorizationTest, KillTask)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -556,7 +556,7 @@ TEST_F(MasterAuthorizationTest, KillPendingTaskInTaskGroup)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -659,7 +659,7 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -765,7 +765,7 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();

http://git-wip-us.apache.org/repos/asf/mesos/blob/1a3e931f/src/tests/master_validation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp
index da43f99..a5d8610 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ -718,7 +718,7 @@ TEST_F(TaskValidationTest, ExecutorUsesInvalidFrameworkID)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -1826,7 +1826,7 @@ TEST_F(TaskGroupValidationTest, ExecutorUsesDockerContainerInfo)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();
@@ -1916,7 +1916,7 @@ TEST_F(TaskGroupValidationTest, ExecutorWithoutFrameworkId)
 
   driver.reconcileTasks({});
 
-  // We pause the clock here to ensure any updates sent by the master
+  // We settle the clock here to ensure any updates sent by the master
   // are received. There shouldn't be any updates in this case.
   Clock::pause();
   Clock::settle();


[03/10] mesos git commit: Cleaned up a test case.

Posted by vi...@apache.org.
Cleaned up a test case.

Review: https://reviews.apache.org/r/52656/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0bef37ab
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0bef37ab
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0bef37ab

Branch: refs/heads/master
Commit: 0bef37ab88415bc876ba9511528af88265fd0be5
Parents: e510813
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:45 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:45 2016 -0700

----------------------------------------------------------------------
 src/tests/fault_tolerance_tests.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/0bef37ab/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 5a9944c..e15bf8d 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -855,7 +855,10 @@ TEST_F(FaultToleranceTest, FrameworkReregister)
 }
 
 
-TEST_F(FaultToleranceTest, TaskLost)
+// This test checks that if a non-partition-aware scheduler that is
+// disconnected from the master attempts to launch a task, it receives
+// a TASK_LOST status update.
+TEST_F(FaultToleranceTest, DisconnectedSchedulerLaunchLost)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
   ASSERT_SOME(master);
@@ -874,8 +877,8 @@ TEST_F(FaultToleranceTest, TaskLost)
     .WillOnce(FutureArg<1>(&offers))
     .WillRepeatedly(Return()); // Ignore subsequent offers.
 
-  Future<process::Message> message =
-    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+  Future<FrameworkRegisteredMessage> message =
+    FUTURE_PROTOBUF(FrameworkRegisteredMessage(), _, _);
 
   driver.start();
 
@@ -893,12 +896,7 @@ TEST_F(FaultToleranceTest, TaskLost)
 
   AWAIT_READY(disconnected);
 
-  TaskInfo task;
-  task.set_name("test task");
-  task.mutable_task_id()->set_value("1");
-  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
-  task.mutable_resources()->MergeFrom(offers.get()[0].resources());
-  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
+  TaskInfo task = createTask(offers.get()[0], "sleep 60");
 
   Future<TaskStatus> status;
   EXPECT_CALL(sched, statusUpdate(&driver, _))
@@ -909,6 +907,7 @@ TEST_F(FaultToleranceTest, TaskLost)
   AWAIT_READY(status);
   EXPECT_EQ(TASK_LOST, status.get().state());
   EXPECT_EQ(TaskStatus::REASON_MASTER_DISCONNECTED, status.get().reason());
+  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
 
   driver.stop();
   driver.join();


[02/10] mesos git commit: Changed reconciliation for unregistering, reregistering agents.

Posted by vi...@apache.org.
Changed reconciliation for unregistering, reregistering agents.

Previously, explicit reconciliation for an agent that was in the process
of reregistering or unregistering returned no results. This degree of
cleverness seems unwarranted: if the agent hasn't completed the
reregistration or unregistration process, it seems simpler for the
master to return the previous state of the agent. This is what the
framework would observe if their reconcile request lost the race with
the reregister/unregister operation, anyway.

Note that since reregistering agents are no longer considered to be "in
transition", we need to slightly adjust the rules for how we update the
`slaves.recovered` collection in the master: an agent remains in the
"recovered" collection until it has been marked reachable in the
registry (rather than removing it from "recovered" as soon as the
reregistration process beings). This is more consistent with how we
manage the other collections in the master anyway: an agent appears in
the `recovered` list until the registry operation that reregisters it
has been successfully applied.

Review: https://reviews.apache.org/r/52083/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e510813f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e510813f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e510813f

Branch: refs/heads/master
Commit: e510813f93e253480005ce95cc4bd7ef094193db
Parents: 2c15661
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:31:37 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:31:37 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp              |  37 ++++--
 src/master/master.hpp              |  15 +--
 src/tests/master_tests.cpp         |  66 ++++++++++
 src/tests/reconciliation_tests.cpp | 208 +++++++++++++++++++++++++++++---
 4 files changed, 290 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e510813f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3c6b18e..324391a 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1829,8 +1829,12 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
   // Remove the slaves in a rate limited manner, similar to how the
   // SlaveObserver removes slaves.
   foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
-    // The slave is removed from 'recovered' when it re-registers.
-    if (!slaves.recovered.contains(slave.info().id())) {
+    // The slave is removed from `recovered` when it completes the
+    // re-registration process. If the slave is in `reregistering`, it
+    // has started but not yet finished re-registering. In either
+    // case, we don't want to try to remove it.
+    if (!slaves.recovered.contains(slave.info().id()) ||
+        slaves.reregistering.contains(slave.info().id())) {
       continue;
     }
 
@@ -1859,7 +1863,8 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
 
 Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 {
-  // The slave is removed from 'recovered' when it re-registers.
+  // The slave might have reregistered while we were waiting to
+  // acquire the rate limit.
   if (!slaves.recovered.contains(slave.info().id())) {
     LOG(INFO) << "Canceling transition of agent "
               << slave.info().id() << " (" << slave.info().hostname() << ")"
@@ -1869,6 +1874,16 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
     return Nothing();
   }
 
+  // The slave might be in the process of reregistering.
+  if (slaves.reregistering.contains(slave.info().id())) {
+    LOG(INFO) << "Canceling transition of agent "
+              << slave.info().id() << " (" << slave.info().hostname() << ")"
+              << " to unreachable because it is re-registering";
+
+    ++metrics->slave_unreachable_canceled;
+    return Nothing();
+  }
+
   LOG(WARNING) << "Agent " << slave.info().id()
                << " (" << slave.info().hostname() << ") did not re-register"
                << " within " << flags.agent_reregister_timeout
@@ -1876,8 +1891,6 @@ Nothing Master::markUnreachableAfterFailover(const Registry::Slave& slave)
 
   ++metrics->slave_unreachable_completed;
 
-  slaves.recovered.erase(slave.info().id());
-
   TimeInfo unreachableTime = protobuf::getCurrentTime();
 
   slaves.markingUnreachable.insert(slave.info().id());
@@ -1900,6 +1913,9 @@ void Master::_markUnreachableAfterFailover(
   CHECK(slaves.markingUnreachable.contains(slaveInfo.id()));
   slaves.markingUnreachable.erase(slaveInfo.id());
 
+  CHECK(slaves.recovered.contains(slaveInfo.id()));
+  slaves.recovered.erase(slaveInfo.id());
+
   if (registrarResult.isFailed()) {
     LOG(FATAL) << "Failed to mark agent " << slaveInfo.id()
                << " (" << slaveInfo.hostname() << ")"
@@ -5233,6 +5249,8 @@ void Master::reregisterSlave(
   Slave* slave = slaves.registered.get(slaveInfo.id());
 
   if (slave != nullptr) {
+    CHECK(!slaves.recovered.contains(slaveInfo.id()));
+
     slave->reregisteredTime = Clock::now();
 
     // NOTE: This handles the case where a slave tries to
@@ -5302,10 +5320,6 @@ void Master::reregisterSlave(
     return;
   }
 
-  // Ensure we don't remove the slave for not re-registering after
-  // we've recovered it from the registry.
-  slaves.recovered.erase(slaveInfo.id());
-
   // If we're already re-registering this slave, then no need to ask
   // the registrar again.
   if (slaves.reregistering.contains(slaveInfo.id())) {
@@ -5367,6 +5381,11 @@ void Master::_reregisterSlave(
   CHECK(readmit.get());
 
   // Re-admission succeeded.
+
+  // Ensure we don't remove the slave for not re-registering after
+  // we've recovered it from the registry.
+  slaves.recovered.erase(slaveInfo.id());
+
   MachineID machineId;
   machineId.set_hostname(slaveInfo.hostname());
   machineId.set_ip(stringify(pid.address.ip));

http://git-wip-us.apache.org/repos/asf/mesos/blob/e510813f/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 881f0d6..263ceb4 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1616,9 +1616,10 @@ private:
     // registry to re-register with the master.
     Option<process::Timer> recoveredTimer;
 
-    // Slaves that have been recovered from the registrar but have yet
-    // to re-register. We use `recoveredTimer` above to ensure we
-    // remove these slaves if they do not re-register.
+    // Slaves that have been recovered from the registrar after master
+    // failover. Slaves are removed from this collection when they
+    // either re-register with the master or are marked unreachable
+    // because they do not re-register before `recoveredTimer` fires.
     hashset<SlaveID> recovered;
 
     // Slaves that are in the process of registering.
@@ -1726,13 +1727,9 @@ private:
     bool transitioning(const Option<SlaveID>& slaveId)
     {
       if (slaveId.isSome()) {
-        return recovered.contains(slaveId.get()) ||
-               reregistering.contains(slaveId.get()) ||
-               removing.contains(slaveId.get());
+        return recovered.contains(slaveId.get());
       } else {
-        return !recovered.empty() ||
-               !reregistering.empty() ||
-               !removing.empty();
+        return !recovered.empty();
       }
     }
   } slaves;

http://git-wip-us.apache.org/repos/asf/mesos/blob/e510813f/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 88cf1e6..df492d3 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -2222,6 +2222,72 @@ TEST_F(MasterTest, RecoveredSlaveReregisters)
 }
 
 
+// This test checks that the master behaves correctly when a slave is
+// in the process of reregistering after master failover when the
+// agent failover timeout expires.
+TEST_F(MasterTest, RecoveredSlaveReregisterThenUnreachableRace)
+{
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), master.get()->pid, _);
+
+  // Reuse slaveFlags so both StartSlave() use the same work_dir.
+  slave::Flags slaveFlags = this->CreateSlaveFlags();
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(slaveRegisteredMessage);
+
+  // Stop the slave while the master is down.
+  master->reset();
+  slave.get()->terminate();
+  slave->reset();
+
+  // Restart the master.
+  master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  // Start the slave, which will cause it to reregister. Intercept the
+  // next registry operation, which we expect to be slave reregistration.
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, master.get()->pid);
+
+  Future<Owned<master::Operation>> reregister;
+  Promise<bool> reregisterContinue;
+  EXPECT_CALL(*master.get()->registrar.get(), apply(_))
+    .WillOnce(DoAll(FutureArg<0>(&reregister),
+                    Return(reregisterContinue.future())));
+
+  detector = master.get()->createDetector();
+  slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(reregisterSlaveMessage);
+
+  AWAIT_READY(reregister);
+  EXPECT_NE(
+      nullptr,
+      dynamic_cast<master::MarkSlaveReachable*>(
+          reregister.get().get()));
+
+  // Advance the clock to cause the agent reregister timeout to
+  // expire. Because slave reregistration has already started, we do
+  // NOT expect the master to mark the slave unreachable. Hence we
+  // don't expect to see any registry operations.
+  EXPECT_CALL(*master.get()->registrar.get(), apply(_))
+    .Times(0);
+
+  Clock::pause();
+  Clock::advance(masterFlags.agent_reregister_timeout);
+  Clock::settle();
+}
+
+
 #ifdef MESOS_HAS_JAVA
 
 class MasterZooKeeperTest : public MesosZooKeeperTest {};

http://git-wip-us.apache.org/repos/asf/mesos/blob/e510813f/src/tests/reconciliation_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/reconciliation_tests.cpp b/src/tests/reconciliation_tests.cpp
index 1412090..d2bba48 100644
--- a/src/tests/reconciliation_tests.cpp
+++ b/src/tests/reconciliation_tests.cpp
@@ -376,9 +376,10 @@ TEST_F(ReconciliationTest, UnknownKillTask)
 }
 
 
-// This test verifies that reconciliation of a task that belongs to a
-// slave that is in a transitional state doesn't result in an update.
-TEST_F(ReconciliationTest, SlaveInTransition)
+// This test verifies that explicit reconciliation does not return any
+// results for tasks running on an agent that has been recovered from
+// the registry after master failover but has not yet reregistered.
+TEST_F(ReconciliationTest, RecoveredAgent)
 {
   master::Flags masterFlags = CreateMasterFlags();
   Try<Owned<cluster::Master>> master = StartMaster();
@@ -398,15 +399,105 @@ TEST_F(ReconciliationTest, SlaveInTransition)
   AWAIT_READY(slaveRegisteredMessage);
   const SlaveID slaveId = slaveRegisteredMessage.get().slave_id();
 
+  // Stop the master.
+  master->reset();
+
+  // Stop the slave.
+  slave.get()->terminate();
+  slave->reset();
+
+  // Restart the master.
+  master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
   MockScheduler sched;
   MesosSchedulerDriver driver(
       &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
 
-  // Stop the slave and master (a bit later).
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillRepeatedly(Return()); // Ignore offers.
+
+  // Framework should not receive any task status updates.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.start();
+
+  // Wait for the framework to register.
+  AWAIT_READY(frameworkId);
+
+  // Do reconciliation before the agent has attempted to reregister.
+  // This should not yield any results.
+  Future<mesos::scheduler::Call> reconcileCall = FUTURE_CALL(
+      mesos::scheduler::Call(), mesos::scheduler::Call::RECONCILE, _ , _);
+
+  // Reconcile for a random task ID on the slave.
+  TaskStatus status;
+  status.mutable_task_id()->set_value(UUID::random().toString());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  driver.reconcileTasks({status});
+
+  // Make sure the master received the reconcile call.
+  AWAIT_READY(reconcileCall);
+
+  // The Clock::settle() will ensure that framework would receive
+  // a status update if it is sent by the master. In this test it
+  // shouldn't receive any.
+  Clock::pause();
+  Clock::settle();
+
+  Clock::resume();
+
+  driver.stop();
+  driver.join();
+}
+
+
+// This test verifies that explicit reconciliation does not return any
+// results for tasks running on an agent that has been recovered from
+// the registry after master failover, where the agent has started the
+// reregistration process but has not completed it yet.
+TEST_F(ReconciliationTest, RecoveredAgentReregistrationInProgress)
+{
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Reuse slaveFlags so both StartSlave() use the same work_dir.
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  // Wait for the slave to register and get the slave id.
+  AWAIT_READY(slaveRegisteredMessage);
+  const SlaveID slaveId = slaveRegisteredMessage.get().slave_id();
+
+  // Stop the master.
   master->reset();
+
+  // Stop the slave.
   slave.get()->terminate();
   slave->reset();
 
+  // Restart the master.
+  master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
   Future<FrameworkID> frameworkId;
   EXPECT_CALL(sched, registered(&driver, _, _))
     .WillOnce(FutureArg<1>(&frameworkId));
@@ -414,13 +505,14 @@ TEST_F(ReconciliationTest, SlaveInTransition)
   EXPECT_CALL(sched, resourceOffers(&driver, _))
     .WillRepeatedly(Return()); // Ignore offers.
 
-  // Framework should not receive any update.
+  // Framework should not receive any task status updates.
   EXPECT_CALL(sched, statusUpdate(&driver, _))
     .Times(0);
 
-  // Restart the master.
-  master = StartMaster(masterFlags);
-  ASSERT_SOME(master);
+  driver.start();
+
+  // Wait for the framework to register.
+  AWAIT_READY(frameworkId);
 
   // Intercept the first registrar operation that is attempted; this
   // should be the registry operation that reregisters the slave.
@@ -430,11 +522,6 @@ TEST_F(ReconciliationTest, SlaveInTransition)
     .WillOnce(DoAll(FutureArg<0>(&reregister),
                     Return(promise.future())));
 
-  driver.start();
-
-  // Wait for the framework to register.
-  AWAIT_READY(frameworkId);
-
   // Restart the slave.
   detector = master.get()->createDetector();
   slave = StartSlave(detector.get(), slaveFlags);
@@ -449,9 +536,7 @@ TEST_F(ReconciliationTest, SlaveInTransition)
   Future<mesos::scheduler::Call> reconcileCall = FUTURE_CALL(
       mesos::scheduler::Call(), mesos::scheduler::Call::RECONCILE, _ , _);
 
-  Clock::pause();
-
-  // Create a task status with a random task id.
+  // Reconcile for a random task ID on the slave.
   TaskStatus status;
   status.mutable_task_id()->set_value(UUID::random().toString());
   status.mutable_slave_id()->CopyFrom(slaveId);
@@ -462,11 +547,98 @@ TEST_F(ReconciliationTest, SlaveInTransition)
   // Make sure the master received the reconcile call.
   AWAIT_READY(reconcileCall);
 
-  // The Clock::settle() will ensure that framework would receive
-  // a status update if it is sent by the master. In this test it
+  // The Clock::settle() will ensure that the framework receives a
+  // status update if it is sent by the master. In this test it
   // shouldn't receive any.
+  Clock::pause();
   Clock::settle();
 
+  Clock::resume();
+
+  driver.stop();
+  driver.join();
+}
+
+
+// This test ensures that when an agent has started but not finished
+// the unregistration process, explicit reconciliation indicates that
+// the agent is still registered.
+TEST_F(ReconciliationTest, RemovalInProgress)
+{
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Future<SlaveRegisteredMessage> slaveRegisteredMessage =
+    FUTURE_PROTOBUF(SlaveRegisteredMessage(), _, _);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get());
+  ASSERT_SOME(slave);
+
+  // Wait for the slave to register and get the slave id.
+  AWAIT_READY(slaveRegisteredMessage);
+  const SlaveID slaveId = slaveRegisteredMessage.get().slave_id();
+
+  Future<UnregisterSlaveMessage> unregisterSlaveMessage =
+    FUTURE_PROTOBUF(
+        UnregisterSlaveMessage(),
+        slave.get()->pid,
+        master.get()->pid);
+
+  // Intercept the next registrar operation; this should be the
+  // registry operation that unregisters the slave.
+  Future<Owned<master::Operation>> unregister;
+  Future<Nothing> unregisterStarted;
+  Promise<bool> promise; // Never satisfied.
+  EXPECT_CALL(*master.get()->registrar.get(), apply(_))
+    .WillOnce(DoAll(FutureArg<0>(&unregister),
+                    Return(promise.future())));
+
+  // Cause the slave to shutdown gracefully.
+  slave.get()->shutdown();
+  slave->reset();
+
+  AWAIT_READY(unregisterSlaveMessage);
+
+  AWAIT_READY(unregister);
+  EXPECT_NE(
+      nullptr,
+      dynamic_cast<master::RemoveSlave*>(unregister.get().get()));
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillRepeatedly(Return()); // Ignore offers.
+
+  driver.start();
+
+  // Wait for the framework to register.
+  AWAIT_READY(frameworkId);
+
+  Future<TaskStatus> update;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&update));
+
+  // Reconcile for a random task ID on the slave.
+  TaskStatus status;
+  status.mutable_task_id()->set_value(UUID::random().toString());
+  status.mutable_slave_id()->CopyFrom(slaveId);
+  status.set_state(TASK_STAGING); // Dummy value.
+
+  driver.reconcileTasks({status});
+
+  AWAIT_READY(update);
+  EXPECT_EQ(TASK_LOST, update.get().state());
+  EXPECT_EQ(TaskStatus::REASON_RECONCILIATION, update.get().reason());
+  EXPECT_FALSE(update.get().has_unreachable_time());
+
   driver.stop();
   driver.join();
 }


[10/10] mesos git commit: Clarified a comment.

Posted by vi...@apache.org.
Clarified a comment.

Review: https://reviews.apache.org/r/52720/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bf00061b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bf00061b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bf00061b

Branch: refs/heads/master
Commit: bf00061b5dd2d5552ad14a206cf8167b70d669c3
Parents: da2ca0f
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:32:27 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:32:27 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/bf00061b/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 0cdb45f..3f3ce93 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -6813,9 +6813,9 @@ void Master::reconcileKnownSlave(
                      << " unknown to the agent " << *slave
                      << " during re-registration : reconciling with the agent";
 
-        // NOTE: Currently the slave doesn't look at the task state
-        // when it reconciles the task state; we include the correct
-        // state for correctness and consistency.
+        // NOTE: The slave doesn't look at the task state when it
+        // reconciles the task. We send the master's view of the
+        // current task state since it might be useful in the future.
         const TaskState& state = task->has_status_update_state()
             ? task->status_update_state()
             : task->state();


[06/10] mesos git commit: Changed master to send TASK_DROPPED for task launch errors.

Posted by vi...@apache.org.
Changed master to send TASK_DROPPED for task launch errors.

When a task launch fails due to a transient error (e.g., insufficient
available resources at an agent), the master sends a TASK_LOST update to
the framework. For PARTITION_AWARE frameworks, we now send TASK_DROPPED
instead.

Review: https://reviews.apache.org/r/52659/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4eac0b06
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4eac0b06
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4eac0b06

Branch: refs/heads/master
Commit: 4eac0b0663de87c0fdde6f9e8c42566f99a3dfaf
Parents: f8a0c28
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:32:02 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:32:02 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp                    |  38 ++++-
 src/tests/master_authorization_tests.cpp | 126 +++++++++++++--
 src/tests/master_tests.cpp               | 224 ++++++++++++++++++++++++--
 3 files changed, 358 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4eac0b06/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 324391a..2fc41f5 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -3498,7 +3498,9 @@ void Master::accept(
     }
   }
 
-  // If invalid, send TASK_LOST for the launch attempts.
+  // If invalid, send TASK_DROPPED for the launch attempts. If the
+  // framework is not partition-aware, send TASK_LOST instead.
+  //
   // TODO(jieyu): Consider adding a 'drop' overload for ACCEPT call to
   // consistently handle message dropping. It would be ideal if the
   // 'drop' overload can handle both resource recovery and lost task
@@ -3507,6 +3509,12 @@ void Master::accept(
     LOG(WARNING) << "ACCEPT call used invalid offers '" << accept.offer_ids()
                  << "': " << error.get().message;
 
+    TaskState newTaskState = TASK_DROPPED;
+    if (!protobuf::frameworkHasCapability(
+            framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+      newTaskState = TASK_LOST;
+    }
+
     foreach (const Offer::Operation& operation, accept.operations()) {
       if (operation.type() != Offer::Operation::LAUNCH &&
           operation.type() != Offer::Operation::LAUNCH_GROUP) {
@@ -3527,16 +3535,21 @@ void Master::accept(
             framework->id(),
             task.slave_id(),
             task.task_id(),
-            TASK_LOST,
+            newTaskState,
             TaskStatus::SOURCE_MASTER,
             None(),
             "Task launched with invalid offers: " + error.get().message,
             TaskStatus::REASON_INVALID_OFFERS);
 
-        metrics->tasks_lost++;
+        if (protobuf::frameworkHasCapability(
+                framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+          metrics->tasks_dropped++;
+        } else {
+          metrics->tasks_lost++;
+        }
 
         metrics->incrementTasksStates(
-            TASK_LOST,
+            newTaskState,
             TaskStatus::SOURCE_MASTER,
             TaskStatus::REASON_INVALID_OFFERS);
 
@@ -3702,6 +3715,12 @@ void Master::_accept(
   Slave* slave = slaves.registered.get(slaveId);
 
   if (slave == nullptr || !slave->connected) {
+    TaskState newTaskState = TASK_DROPPED;
+    if (!protobuf::frameworkHasCapability(
+            framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+      newTaskState = TASK_LOST;
+    }
+
     foreach (const Offer::Operation& operation, accept.operations()) {
       if (operation.type() != Offer::Operation::LAUNCH &&
           operation.type() != Offer::Operation::LAUNCH_GROUP) {
@@ -3734,16 +3753,21 @@ void Master::_accept(
             framework->id(),
             task.slave_id(),
             task.task_id(),
-            TASK_LOST,
+            newTaskState,
             TaskStatus::SOURCE_MASTER,
             None(),
             slave == nullptr ? "Agent removed" : "Agent disconnected",
             reason);
 
-        metrics->tasks_lost++;
+        if (protobuf::frameworkHasCapability(
+                framework->info, FrameworkInfo::Capability::PARTITION_AWARE)) {
+          metrics->tasks_dropped++;
+        } else {
+          metrics->tasks_lost++;
+        }
 
         metrics->incrementTasksStates(
-            TASK_LOST,
+            newTaskState,
             TaskStatus::SOURCE_MASTER,
             reason);
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/4eac0b06/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index a53e270..001d4b3 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -567,8 +567,9 @@ TEST_F(MasterAuthorizationTest, KillPendingTaskInTaskGroup)
 
 
 // This test verifies that a slave removal that comes before
-// '_accept()' is called results in TASK_LOST.
-TEST_F(MasterAuthorizationTest, SlaveRemoved)
+// '_accept()' is called results in TASK_LOST for a framework that is
+// not partition-aware.
+TEST_F(MasterAuthorizationTest, SlaveRemovedLost)
 {
   MockAuthorizer authorizer;
   Try<Owned<cluster::Master>> master = StartMaster(&authorizer);
@@ -646,9 +647,7 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
 
   // Check metrics.
   JSON::Object stats = Metrics();
-  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
-  EXPECT_EQ(1u, stats.values.count(
-                    "master/task_lost/source_master/reason_slave_removed"));
+  EXPECT_EQ(0u, stats.values["master/tasks_dropped"]);
   EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
   EXPECT_EQ(
       1u, stats.values["master/task_lost/source_master/reason_slave_removed"]);
@@ -669,9 +668,117 @@ TEST_F(MasterAuthorizationTest, SlaveRemoved)
 }
 
 
+// This test verifies that a slave removal that comes before
+// '_accept()' is called results in TASK_DROPPED for a framework that
+// is partition-aware.
+TEST_F(MasterAuthorizationTest, SlaveRemovedDropped)
+{
+  MockAuthorizer authorizer;
+  Try<Owned<cluster::Master>> master = StartMaster(&authorizer);
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .Times(1);
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+
+  TaskInfo task = createTask(offers.get()[0], "", DEFAULT_EXECUTOR_ID);
+
+  // Return a pending future from authorizer.
+  Future<Nothing> authorize;
+  Promise<bool> promise;
+  EXPECT_CALL(authorizer, authorized(_))
+    .WillOnce(DoAll(FutureSatisfy(&authorize),
+                    Return(promise.future())));
+
+  driver.launchTasks(offers.get()[0].id(), {task});
+
+  // Wait until authorization is in progress.
+  AWAIT_READY(authorize);
+
+  Future<Nothing> slaveLost;
+  EXPECT_CALL(sched, slaveLost(&driver, _))
+    .WillOnce(FutureSatisfy(&slaveLost));
+
+  // Stop the slave with explicit shutdown as otherwise with
+  // checkpointing the master will wait for the slave to reconnect.
+  slave.get()->shutdown();
+  slave->reset();
+
+  AWAIT_READY(slaveLost);
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  Future<Nothing> recoverResources =
+    FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
+
+  // Now complete authorization.
+  promise.set(true);
+
+  // Framework should get a TASK_DROPPED.
+  AWAIT_READY(status);
+
+  EXPECT_EQ(TASK_DROPPED, status.get().state());
+  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
+  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, status.get().reason());
+
+  // No task launch should happen resulting in all resources being
+  // returned to the allocator.
+  AWAIT_READY(recoverResources);
+
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(0u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(1u, stats.values["master/tasks_dropped"]);
+  EXPECT_EQ(
+      1u,
+      stats.values["master/task_dropped/source_master/reason_slave_removed"]);
+
+  // Make sure the task is not known to master anymore.
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .Times(0);
+
+  driver.reconcileTasks({});
+
+  // We settle the clock here to ensure any updates sent by the master
+  // are received. There shouldn't be any updates in this case.
+  Clock::pause();
+  Clock::settle();
+
+  driver.stop();
+  driver.join();
+}
+
+
 // This test verifies that a slave disconnection that comes before
-// '_launchTasks()' is called results in TASK_LOST.
-TEST_F(MasterAuthorizationTest, SlaveDisconnected)
+// '_launchTasks()' is called results in TASK_LOST for a framework
+// that is not partition-aware.
+TEST_F(MasterAuthorizationTest, SlaveDisconnectedLost)
 {
   MockAuthorizer authorizer;
   Try<Owned<cluster::Master>> master = StartMaster(&authorizer);
@@ -750,11 +857,8 @@ TEST_F(MasterAuthorizationTest, SlaveDisconnected)
 
   // Check metrics.
   JSON::Object stats = Metrics();
-  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(0u, stats.values["master/tasks_dropped"]);
   EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
-  EXPECT_EQ(1u,
-            stats.values.count(
-                "master/task_lost/source_master/reason_slave_removed"));
   EXPECT_EQ(
       1u,
       stats.values["master/task_lost/source_master/reason_slave_removed"]);

http://git-wip-us.apache.org/repos/asf/mesos/blob/4eac0b06/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index df492d3..b31502f 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -1488,8 +1488,10 @@ TEST_F(MasterTest, LaunchCombinedOfferTest)
 }
 
 
-// Test ensures offers for launchTasks cannot span multiple slaves.
-TEST_F(MasterTest, LaunchAcrossSlavesTest)
+// This test ensures that the offers provided to a single launchTasks
+// call cannot span multiple slaves. A non-partition-aware framework
+// should receive TASK_LOST.
+TEST_F(MasterTest, LaunchAcrossSlavesLost)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
   ASSERT_SOME(master);
@@ -1580,24 +1582,134 @@ TEST_F(MasterTest, LaunchAcrossSlavesTest)
 
   // Check metrics.
   JSON::Object stats = Metrics();
-  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(0u, stats.values["master/tasks_dropped"]);
   EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
   EXPECT_EQ(
       1u,
+      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
+  driver.stop();
+  driver.join();
+}
+
+
+// This test ensures that the offers provided to a single launchTasks
+// call cannot span multiple slaves. A partition-aware framework
+// should receive TASK_DROPPED.
+TEST_F(MasterTest, LaunchAcrossSlavesDropped)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+
+  // See LaunchCombinedOfferTest() for resource size motivation.
+  Resources fullSlave = Resources::parse("cpus:2;mem:1024").get();
+  Resources twoSlaves = fullSlave + fullSlave;
+
+  slave::Flags flags = CreateSlaveFlags();
+  flags.resources = Option<string>(stringify(fullSlave));
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> slave1 =
+    StartSlave(detector.get(), &containerizer, flags);
+  ASSERT_SOME(slave1);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  Future<vector<Offer>> offers1;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers1));
+
+  driver.start();
+
+  AWAIT_READY(offers1);
+  EXPECT_NE(0u, offers1.get().size());
+  Resources resources1(offers1.get()[0].resources());
+  EXPECT_EQ(2, resources1.cpus().get());
+  EXPECT_EQ(Megabytes(1024), resources1.mem().get());
+
+  // Test that offers cannot span multiple slaves.
+  Future<vector<Offer>> offers2;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers2))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  // Create new Flags as we require another work_dir for checkpoints.
+  slave::Flags flags2 = CreateSlaveFlags();
+  flags2.resources = Option<string>(stringify(fullSlave));
+
+  Try<Owned<cluster::Slave>> slave2 =
+    StartSlave(detector.get(), &containerizer, flags2);
+  ASSERT_SOME(slave2);
+
+  AWAIT_READY(offers2);
+  EXPECT_NE(0u, offers2.get().size());
+  Resources resources2(offers1.get()[0].resources());
+  EXPECT_EQ(2, resources2.cpus().get());
+  EXPECT_EQ(Megabytes(1024), resources2.mem().get());
+
+  TaskInfo task;
+  task.set_name("");
+  task.mutable_task_id()->set_value("1");
+  task.mutable_slave_id()->MergeFrom(offers1.get()[0].slave_id());
+  task.mutable_resources()->MergeFrom(twoSlaves);
+  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
+
+  Future<TaskStatus> status;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  vector<OfferID> combinedOffers;
+  combinedOffers.push_back(offers1.get()[0].id());
+  combinedOffers.push_back(offers2.get()[0].id());
+
+  Future<Nothing> recoverResources =
+    FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
+
+  driver.launchTasks(combinedOffers, {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_DROPPED, status.get().state());
+  EXPECT_EQ(TaskStatus::REASON_INVALID_OFFERS, status.get().reason());
+
+  // The resources of the invalid offers should be recovered.
+  AWAIT_READY(recoverResources);
+
+  EXPECT_CALL(exec, shutdown(_))
+    .Times(AtMost(1));
+
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(1u, stats.values.count("master/tasks_dropped"));
+  EXPECT_EQ(1u, stats.values["master/tasks_dropped"]);
+  EXPECT_EQ(
+      1u,
       stats.values.count(
-          "master/task_lost/source_master/reason_invalid_offers"));
+          "master/task_dropped/source_master/reason_invalid_offers"));
   EXPECT_EQ(
       1u,
-      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+      stats.values["master/task_dropped/source_master/reason_invalid_offers"]);
 
   driver.stop();
   driver.join();
 }
 
 
-// Test ensures that an offer cannot appear more than once in offers
-// for launchTasks.
-TEST_F(MasterTest, LaunchDuplicateOfferTest)
+// This test ensures that an offer cannot appear more than once in the
+// offers provided to a single launchTasks call. A non-partition-aware
+// framework should receive TASK_LOST.
+TEST_F(MasterTest, LaunchDuplicateOfferLost)
 {
   Try<Owned<cluster::Master>> master = StartMaster();
   ASSERT_SOME(master);
@@ -1671,15 +1783,103 @@ TEST_F(MasterTest, LaunchDuplicateOfferTest)
 
   // Check metrics.
   JSON::Object stats = Metrics();
-  EXPECT_EQ(1u, stats.values.count("master/tasks_lost"));
+  EXPECT_EQ(0u, stats.values["master/tasks_dropped"]);
   EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
   EXPECT_EQ(
       1u,
-      stats.values.count(
-          "master/task_lost/source_master/reason_invalid_offers"));
+      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+
+  driver.stop();
+  driver.join();
+}
+
+
+// This test ensures that an offer cannot appear more than once in the
+// offers provided to a single launchTasks call. A partition-aware
+// framework should receive TASK_DROPPED.
+TEST_F(MasterTest, LaunchDuplicateOfferDropped)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+  TestContainerizer containerizer(&exec);
+
+  // See LaunchCombinedOfferTest() for resource size motivation.
+  Resources fullSlave = Resources::parse("cpus:2;mem:1024").get();
+
+  slave::Flags flags = CreateSlaveFlags();
+  flags.resources = Option<string>(stringify(fullSlave));
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), &containerizer, flags);
+  ASSERT_SOME(slave);
+
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::PARTITION_AWARE);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(&driver, _, _));
+
+  // Test that same offers cannot be used more than once.
+  // Kill 2nd task and get offer for full slave.
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+  Resources resources(offers.get()[0].resources());
+  EXPECT_EQ(2, resources.cpus().get());
+  EXPECT_EQ(Megabytes(1024), resources.mem().get());
+
+  vector<OfferID> combinedOffers;
+  combinedOffers.push_back(offers.get()[0].id());
+  combinedOffers.push_back(offers.get()[0].id());
+
+  TaskInfo task;
+  task.set_name("");
+  task.mutable_task_id()->set_value("1");
+  task.mutable_slave_id()->MergeFrom(offers.get()[0].slave_id());
+  task.mutable_resources()->MergeFrom(fullSlave);
+  task.mutable_executor()->MergeFrom(DEFAULT_EXECUTOR_INFO);
+
+  Future<TaskStatus> status;
+
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&status));
+
+  Future<Nothing> recoverResources =
+    FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
+
+  driver.launchTasks(combinedOffers, {task});
+
+  AWAIT_READY(status);
+  EXPECT_EQ(TASK_DROPPED, status.get().state());
+  EXPECT_EQ(TaskStatus::REASON_INVALID_OFFERS, status.get().reason());
+
+  // The resources of the invalid offers should be recovered.
+  AWAIT_READY(recoverResources);
+
+  EXPECT_CALL(exec, shutdown(_))
+    .Times(AtMost(1));
+
+  // Check metrics.
+  JSON::Object stats = Metrics();
+  EXPECT_EQ(0u, stats.values["master/tasks_lost"]);
+  EXPECT_EQ(1u, stats.values["master/tasks_dropped"]);
   EXPECT_EQ(
       1u,
-      stats.values["master/task_lost/source_master/reason_invalid_offers"]);
+      stats.values["master/task_dropped/source_master/reason_invalid_offers"]);
 
   driver.stop();
   driver.join();


[07/10] mesos git commit: Removed redundant test.

Posted by vi...@apache.org.
Removed redundant test.

`MasterAuthorizationTest.SlaveDisconnectedLost` was identical to
`MasterAuthorizationTest.SlaveRemovedLost`.

Review: https://reviews.apache.org/r/52969/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5082181a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5082181a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5082181a

Branch: refs/heads/master
Commit: 5082181aebf3798484e52792b1fc9a6ff6a4416c
Parents: 4eac0b0
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:32:10 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:32:10 2016 -0700

----------------------------------------------------------------------
 src/tests/master_authorization_tests.cpp | 104 --------------------------
 1 file changed, 104 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/5082181a/src/tests/master_authorization_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_authorization_tests.cpp b/src/tests/master_authorization_tests.cpp
index 001d4b3..4712361 100644
--- a/src/tests/master_authorization_tests.cpp
+++ b/src/tests/master_authorization_tests.cpp
@@ -775,110 +775,6 @@ TEST_F(MasterAuthorizationTest, SlaveRemovedDropped)
 }
 
 
-// This test verifies that a slave disconnection that comes before
-// '_launchTasks()' is called results in TASK_LOST for a framework
-// that is not partition-aware.
-TEST_F(MasterAuthorizationTest, SlaveDisconnectedLost)
-{
-  MockAuthorizer authorizer;
-  Try<Owned<cluster::Master>> master = StartMaster(&authorizer);
-  ASSERT_SOME(master);
-
-  MockExecutor exec(DEFAULT_EXECUTOR_ID);
-  TestContainerizer containerizer(&exec);
-
-  Owned<MasterDetector> detector = master.get()->createDetector();
-  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
-  ASSERT_SOME(slave);
-
-  MockScheduler sched;
-  MesosSchedulerDriver driver(
-      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
-
-  EXPECT_CALL(sched, registered(&driver, _, _))
-    .Times(1);
-
-  Future<vector<Offer>> offers;
-  EXPECT_CALL(sched, resourceOffers(&driver, _))
-    .WillOnce(FutureArg<1>(&offers))
-    .WillRepeatedly(Return()); // Ignore subsequent offers.
-
-  driver.start();
-
-  AWAIT_READY(offers);
-  EXPECT_NE(0u, offers.get().size());
-
-  TaskInfo task = createTask(offers.get()[0], "", DEFAULT_EXECUTOR_ID);
-
-  // Return a pending future from authorizer.
-  Future<Nothing> authorize;
-  Promise<bool> promise;
-  EXPECT_CALL(authorizer, authorized(_))
-    .WillOnce(DoAll(FutureSatisfy(&authorize),
-                    Return(promise.future())));
-
-  driver.launchTasks(offers.get()[0].id(), {task});
-
-  // Wait until authorization is in progress.
-  AWAIT_READY(authorize);
-
-  Future<Nothing> slaveLost;
-  EXPECT_CALL(sched, slaveLost(&driver, _))
-    .WillOnce(FutureSatisfy(&slaveLost));
-
-  // Stop the slave with explicit shutdown message so that the master
-  // does not wait for it to reconnect.
-  slave.get()->shutdown();
-  slave->reset();
-
-  // Wait for the slave to be removed by the master.
-  AWAIT_READY(slaveLost);
-
-  Future<TaskStatus> status;
-  EXPECT_CALL(sched, statusUpdate(&driver, _))
-    .WillOnce(FutureArg<1>(&status));
-
-  Future<Nothing> recoverResources =
-    FUTURE_DISPATCH(_, &MesosAllocatorProcess::recoverResources);
-
-  // Now complete authorization.
-  promise.set(true);
-
-  // Framework should get a TASK_LOST.
-  AWAIT_READY(status);
-
-  EXPECT_EQ(TASK_LOST, status.get().state());
-  EXPECT_EQ(TaskStatus::SOURCE_MASTER, status.get().source());
-  EXPECT_EQ(TaskStatus::REASON_SLAVE_REMOVED, status.get().reason());
-
-  // No task launch should happen resulting in all resources being
-  // returned to the allocator.
-  AWAIT_READY(recoverResources);
-
-  // Check metrics.
-  JSON::Object stats = Metrics();
-  EXPECT_EQ(0u, stats.values["master/tasks_dropped"]);
-  EXPECT_EQ(1u, stats.values["master/tasks_lost"]);
-  EXPECT_EQ(
-      1u,
-      stats.values["master/task_lost/source_master/reason_slave_removed"]);
-
-  // Make sure the task is not known to master anymore.
-  EXPECT_CALL(sched, statusUpdate(&driver, _))
-    .Times(0);
-
-  driver.reconcileTasks({});
-
-  // We settle the clock here to ensure any updates sent by the master
-  // are received. There shouldn't be any updates in this case.
-  Clock::pause();
-  Clock::settle();
-
-  driver.stop();
-  driver.join();
-}
-
-
 // This test verifies that a framework removal that comes before
 // '_accept()' is called results in recovery of resources.
 TEST_F(MasterAuthorizationTest, FrameworkRemoved)


[09/10] mesos git commit: Renamed a function for clarity.

Posted by vi...@apache.org.
Renamed a function for clarity.

`Master::reconcile(Framework*, const scheduler::Call::Reconcile&)` and
`Master::reconcile(Slave*, const vector<ExecutorInfo>&, const
vector<Task>& tasks)` are only loosely related. Per discussion on the
development list, using overloading to distinguish these two functions
is confusing. Hence, rename the latter to `reconcileKnownSlave`.

Review: https://reviews.apache.org/r/52719/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/da2ca0f1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/da2ca0f1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/da2ca0f1

Branch: refs/heads/master
Commit: da2ca0f17d9ac9a0248edd18ed2cc7c774e985d4
Parents: 97bd957
Author: Neil Conway <ne...@gmail.com>
Authored: Wed Oct 19 16:32:21 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Wed Oct 19 16:32:21 2016 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 12 ++++--------
 src/master/master.hpp |  7 ++++---
 2 files changed, 8 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/da2ca0f1/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index bf6bb1a..0cdb45f 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -5318,10 +5318,9 @@ void Master::reregisterSlave(
     // Update slave's version after re-registering successfully.
     slave->version = version;
 
-    // Reconcile tasks between master and the slave.
-    // NOTE: This sends the re-registered message, including tasks
-    // that need to be reconciled by the slave.
-    reconcile(slave, executorInfos, tasks);
+    // Reconcile tasks between master and slave, and send the
+    // `SlaveReregisteredMessage`.
+    reconcileKnownSlave(slave, executorInfos, tasks);
 
     // If this is a disconnected slave, add it back to the allocator.
     // This is done after reconciliation to ensure the allocator's
@@ -6767,10 +6766,7 @@ void Master::authenticationTimeout(Future<Option<string>> future)
 }
 
 
-// NOTE: This function is only called when the slave re-registers
-// with a master that already knows about it (i.e., not a failed
-// over master).
-void Master::reconcile(
+void Master::reconcileKnownSlave(
     Slave* slave,
     const vector<ExecutorInfo>& executors,
     const vector<Task>& tasks)

http://git-wip-us.apache.org/repos/asf/mesos/blob/da2ca0f1/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 263ceb4..6d2db9d 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -640,9 +640,10 @@ protected:
       Framework* framework,
       const std::vector<TaskStatus>& statuses);
 
-  // Handles a known re-registering slave by reconciling the master's
-  // view of the slave's tasks and executors.
-  void reconcile(
+  // When a slave that is known to the master re-registers, we need to
+  // reconcile the master's view of the slave's tasks and executors.
+  // This function also sends the `ReregisterSlaveMessage`.
+  void reconcileKnownSlave(
       Slave* slave,
       const std::vector<ExecutorInfo>& executors,
       const std::vector<Task>& tasks);