You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/05/29 02:20:26 UTC

[1/3] git commit: Fixed master to properly rescind offers.

Repository: mesos
Updated Branches:
  refs/heads/master 31e382e95 -> 60865b2f4


Fixed master to properly rescind offers.

Review: https://reviews.apache.org/r/21948


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/44e5456e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/44e5456e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/44e5456e

Branch: refs/heads/master
Commit: 44e5456e71389dcfa584af6c6c2cddc48bbac3a4
Parents: 31e382e
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri May 23 17:26:13 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 16:45:42 2014 -0700

----------------------------------------------------------------------
 src/master/master.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/44e5456e/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index efb4de1..472eedd 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1183,7 +1183,7 @@ void Master::reregisterFramework(
       foreach (Offer* offer, utils::copy(framework->offers)) {
         allocator->resourcesRecovered(
             offer->framework_id(), offer->slave_id(), offer->resources());
-        removeOffer(offer);
+        removeOffer(offer, true); // Rescind.
       }
 
       FrameworkReregisteredMessage message;
@@ -1318,7 +1318,7 @@ void Master::deactivate(Framework* framework)
   foreach (Offer* offer, utils::copy(framework->offers)) {
     allocator->resourcesRecovered(
         offer->framework_id(), offer->slave_id(), offer->resources());
-    removeOffer(offer);
+    removeOffer(offer, true); // Rescind.
   }
 }
 


[2/3] git commit: Fixed Master::launchTasks() to inform allocator of unused resources when any of the offers are invalid.

Posted by vi...@apache.org.
Fixed Master::launchTasks() to inform allocator of unused resources
when any of the offers are invalid.

Review: https://reviews.apache.org/r/21750


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0b8f30e8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0b8f30e8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0b8f30e8

Branch: refs/heads/master
Commit: 0b8f30e8ac3fb0893ffd32f63d132730ddad3d65
Parents: 44e5456
Author: Vinod Kone <vi...@twitter.com>
Authored: Tue May 20 18:44:19 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 17:17:19 2014 -0700

----------------------------------------------------------------------
 src/master/master.cpp      | 53 ++++++++++++++++++++++++++++-------------
 src/tests/master_tests.cpp | 15 ++++++++++++
 2 files changed, 51 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/0b8f30e8/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 472eedd..fcbbc26 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1637,12 +1637,16 @@ struct OfferVisitor
 
   virtual ~OfferVisitor() {}
 
-  Slave* getSlave(Master* master, const SlaveID& id) {
-    return master->getSlave(id);
+  Slave* getSlave(Master* master, const SlaveID& slaveId)
+  {
+    CHECK_NOTNULL(master);
+    return master->getSlave(slaveId);
   }
 
-  Offer* getOffer(Master* master, const OfferID& id) {
-    return master->getOffer(id);
+  Offer* getOffer(Master* master, const OfferID& offerId)
+  {
+    CHECK_NOTNULL(master);
+    return master->getOffer(offerId);
   }
 };
 
@@ -1672,6 +1676,10 @@ struct FrameworkChecker : OfferVisitor {
       Master* master)
   {
     Offer* offer = getOffer(master, offerId);
+    if (offer == NULL) {
+      return "Offer " + stringify(offerId) + " is no longer valid";
+    }
+
     if (!(framework.id == offer->framework_id())) {
       return "Offer " + stringify(offer->id()) +
           " has invalid framework " + stringify(offer->framework_id()) +
@@ -1693,15 +1701,21 @@ struct SlaveChecker : OfferVisitor
       Master* master)
   {
     Offer* offer = getOffer(master, offerId);
-    Slave* slave = getSlave(master, offer->slave_id());
-    if (slave == NULL) {
-      return "Offer " + stringify(offerId) +
-          " outlived slave " + stringify(offer->slave_id());
+    if (offer == NULL) {
+      return "Offer " + stringify(offerId) + " is no longer valid";
     }
 
+    Slave* slave = getSlave(master, offer->slave_id());
+
+    // This is not possible because the offer should've been removed.
+    CHECK(slave != NULL)
+      << "Offer " << offerId
+      << " outlived slave " << offer->slave_id();
+
+    // This is not possible because the offer should've been removed.
     CHECK(!slave->disconnected)
-      << "Offer " + stringify(offerId)
-      << " outlived disconnected slave " << stringify(slave->id);
+      << "Offer " << offerId
+      << " outlived disconnected slave " << offer->slave_id();
 
     if (slaveId.isNone()) {
       // Set slave id and use as base case for validation.
@@ -1828,7 +1842,7 @@ void Master::launchTasks(
     }
 
     // If offer validation succeeds, we need to pass along the common
-    // slave. So optimisticaly, we store the first slave id we see.
+    // slave. So optimistically, we store the first slave id we see.
     // In case of invalid offers (different slaves for example), we
     // report error and return from launchTask before slaveId is used.
     if (slaveId.isNone()) {
@@ -1845,19 +1859,22 @@ void Master::launchTasks(
     delete visitor;
   };
 
-  // Remove offers.
+  // Remove offers and recover resources if any of the offers are
+  // invalid.
   foreach (const OfferID& offerId, offerIds) {
     Offer* offer = getOffer(offerId);
-    // Explicit check needed if an offerId appears more
-    // than once in offerIds.
-     if (offer != NULL) {
+    if (offer != NULL) {
+      if (offerError.isSome()) {
+        allocator->resourcesRecovered(
+            offer->framework_id(), offer->slave_id(), offer->resources());
+      }
       removeOffer(offer);
     }
   }
 
   if (offerError.isSome()) {
     LOG(WARNING) << "Failed to validate offer " << offerId
-                   << " : " << offerError.get();
+                   << ": " << offerError.get();
 
     foreach (const TaskInfo& task, tasks) {
       const StatusUpdate& update = protobuf::createStatusUpdate(
@@ -1874,7 +1891,6 @@ void Master::launchTasks(
       message.mutable_update()->CopyFrom(update);
       send(framework->pid, message);
     }
-
     return;
   }
 
@@ -3761,6 +3777,8 @@ void Master::removeTask(Task* task)
 }
 
 
+// TODO(vinod): Instead of 'removeOffer()', consider implementing
+// 'useOffer()', 'discardOffer()' and 'rescindOffer()' for clarity.
 void Master::removeOffer(Offer* offer, bool rescind)
 {
   // Remove from framework.
@@ -3790,6 +3808,7 @@ void Master::removeOffer(Offer* offer, bool rescind)
   delete offer;
 }
 
+
 // TODO(bmahler): Consider killing this.
 Framework* Master::getFramework(const FrameworkID& frameworkId)
 {

http://git-wip-us.apache.org/repos/asf/mesos/blob/0b8f30e8/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index b0ff627..7183cb7 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -41,6 +41,7 @@
 #include <stout/os.hpp>
 #include <stout/try.hpp>
 
+#include "master/allocator.hpp"
 #include "master/flags.hpp"
 #include "master/master.hpp"
 
@@ -59,6 +60,8 @@ using namespace mesos::internal::tests;
 
 using mesos::internal::master::Master;
 
+using mesos::internal::master::allocator::AllocatorProcess;
+
 using mesos::internal::slave::GarbageCollectorProcess;
 using mesos::internal::slave::Slave;
 using mesos::internal::slave::Containerizer;
@@ -1300,11 +1303,17 @@ TEST_F(MasterTest, LaunchAcrossSlavesTest)
   combinedOffers.push_back(offers1.get()[0].id());
   combinedOffers.push_back(offers2.get()[0].id());
 
+  Future<Nothing> resourcesRecovered =
+    FUTURE_DISPATCH(_, &AllocatorProcess::resourcesRecovered);
+
   driver.launchTasks(combinedOffers, tasks);
 
   AWAIT_READY(status);
   EXPECT_EQ(TASK_LOST, status.get().state());
 
+  // The resources of the invalid offers should be recovered.
+  AWAIT_READY(resourcesRecovered);
+
   EXPECT_CALL(exec, shutdown(_))
     .Times(AtMost(1));
 
@@ -1372,11 +1381,17 @@ TEST_F(MasterTest, LaunchDuplicateOfferTest)
   EXPECT_CALL(sched, statusUpdate(&driver, _))
     .WillOnce(FutureArg<1>(&status));
 
+  Future<Nothing> resourcesRecovered =
+    FUTURE_DISPATCH(_, &AllocatorProcess::resourcesRecovered);
+
   driver.launchTasks(combinedOffers, tasks);
 
   AWAIT_READY(status);
   EXPECT_EQ(TASK_LOST, status.get().state());
 
+  // The resources of the invalid offers should be recovered.
+  AWAIT_READY(resourcesRecovered);
+
   EXPECT_CALL(exec, shutdown(_))
     .Times(AtMost(1));
 


[3/3] git commit: Fixed master to remove and rescind offers when a slave is disconnected.

Posted by vi...@apache.org.
Fixed master to remove and rescind offers when a slave is
disconnected.

Review: https://reviews.apache.org/r/21961


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/60865b2f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/60865b2f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/60865b2f

Branch: refs/heads/master
Commit: 60865b2f473d9898f1a18eee10f9a59a4862893a
Parents: 0b8f30e
Author: Vinod Kone <vi...@twitter.com>
Authored: Mon May 26 22:25:19 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 17:17:20 2014 -0700

----------------------------------------------------------------------
 src/master/master.cpp               | 43 ++++++++++++--------------------
 src/master/master.hpp               |  5 ++--
 src/tests/fault_tolerance_tests.cpp |  8 ++++++
 3 files changed, 26 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index fcbbc26..766a0e3 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -720,7 +720,21 @@ void Master::exited(const UPID& pid)
       } else if (!slave->disconnected) {
         // Checkpointing slaves can just be disconnected.
         disconnect(slave);
-        removeFrameworksAndOffers(slave);
+
+        // Remove all non-checkpointing frameworks.
+        hashset<FrameworkID> frameworkIds =
+          slave->tasks.keys() | slave->executors.keys();
+
+        foreach (const FrameworkID& frameworkId, frameworkIds) {
+          Framework* framework = getFramework(frameworkId);
+          if (framework != NULL && !framework->info.checkpoint()) {
+            LOG(INFO) << "Removing framework " << frameworkId
+                      << " from disconnected slave " << *slave
+                      << " because the framework is not checkpointing";
+
+            removeFramework(slave, framework);
+          }
+        }
       } else {
         LOG(WARNING) << "Ignoring duplicate exited() notification for "
                      << "checkpointing slave " << *slave;
@@ -1336,37 +1350,12 @@ void Master::disconnect(Slave* slave)
   // Remove the slave from authenticated. This is safe because
   // a slave will always reauthenticate before (re-)registering.
   authenticated.erase(slave->pid);
-}
-
-
-void Master::removeFrameworksAndOffers(Slave* slave)
-{
-  CHECK_NOTNULL(slave);
-
-  // If a slave is checkpointing, remove all non-checkpointing
-  // frameworks from the slave. If the slave is not checkpointing,
-  // remove all of its frameworks.
-  hashset<FrameworkID> frameworkIds =
-    slave->tasks.keys() | slave->executors.keys();
-
-  foreach (const FrameworkID& frameworkId, frameworkIds) {
-    Framework* framework = getFramework(frameworkId);
-    if (framework != NULL &&
-        (!framework->info.checkpoint() || !slave->info.checkpoint())) {
-      LOG(INFO) << "Removing framework " << frameworkId
-                << " from disconnected slave " << *slave << " because "
-                << (!slave->info.checkpoint() ? "slave" : "framework")
-                << " is not checkpointing";
-
-      removeFramework(slave, framework);
-    }
-  }
 
+  // Remove and rescind offers.
   foreach (Offer* offer, utils::copy(slave->offers)) {
     allocator->resourcesRecovered(
         offer->framework_id(), slave->id, offer->resources());
 
-    // Remove and rescind offers.
     removeOffer(offer, true); // Rescind!
   }
 }

http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 4c21d9e..d4ef4be 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -286,14 +286,13 @@ protected:
   // reschedule offers that were assigned to this framework.
   void removeFramework(Framework* framework);
 
-  // Remove a framework from the slave, i.e., kill all of its tasks,
-  // remove its offers and reallocate its resources.
+  // Remove a framework from the slave, i.e., remove its tasks and
+  // executors and recover the resources.
   void removeFramework(Slave* slave, Framework* framework);
 
   // TODO(adam-mesos): Rename deactivate to disconnect, or v.v.
   void deactivate(Framework* framework);
   void disconnect(Slave* slave);
-  void removeFrameworksAndOffers(Slave* slave);
 
   // Add a slave.
   void addSlave(Slave* slave, bool reregister = false);

http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index e484a8a..4c6a5c4 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -1834,6 +1834,10 @@ TEST_F(FaultToleranceTest, SlaveReregisterOnZKExpiration)
 
   AWAIT_READY(resourceOffers);
 
+  Future<Nothing> offerRescinded;
+  EXPECT_CALL(sched, offerRescinded(_, _))
+    .WillOnce(FutureSatisfy(&offerRescinded));
+
   Future<SlaveReregisteredMessage> slaveReregisteredMessage =
     FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
 
@@ -1841,6 +1845,10 @@ TEST_F(FaultToleranceTest, SlaveReregisterOnZKExpiration)
   // expiration) at the slave.
   detector.appoint(master.get());
 
+  // Since an authenticating slave re-registration results in
+  // disconnecting the slave, its resources should be rescinded.
+  AWAIT_READY(offerRescinded);
+
   AWAIT_READY(slaveReregisteredMessage);
 
   driver.stop();