You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/05/29 02:20:26 UTC
[1/3] git commit: Fixed master to properly rescind offers.
Repository: mesos
Updated Branches:
refs/heads/master 31e382e95 -> 60865b2f4
Fixed master to properly rescind offers.
Review: https://reviews.apache.org/r/21948
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/44e5456e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/44e5456e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/44e5456e
Branch: refs/heads/master
Commit: 44e5456e71389dcfa584af6c6c2cddc48bbac3a4
Parents: 31e382e
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri May 23 17:26:13 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 16:45:42 2014 -0700
----------------------------------------------------------------------
src/master/master.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/44e5456e/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index efb4de1..472eedd 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1183,7 +1183,7 @@ void Master::reregisterFramework(
foreach (Offer* offer, utils::copy(framework->offers)) {
allocator->resourcesRecovered(
offer->framework_id(), offer->slave_id(), offer->resources());
- removeOffer(offer);
+ removeOffer(offer, true); // Rescind.
}
FrameworkReregisteredMessage message;
@@ -1318,7 +1318,7 @@ void Master::deactivate(Framework* framework)
foreach (Offer* offer, utils::copy(framework->offers)) {
allocator->resourcesRecovered(
offer->framework_id(), offer->slave_id(), offer->resources());
- removeOffer(offer);
+ removeOffer(offer, true); // Rescind.
}
}
[2/3] git commit: Fixed Master::launchTasks() to inform allocator of
unused resources when any of the offers are invalid.
Posted by vi...@apache.org.
Fixed Master::launchTasks() to inform allocator of unused resources
when any of the offers are invalid.
Review: https://reviews.apache.org/r/21750
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/0b8f30e8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/0b8f30e8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/0b8f30e8
Branch: refs/heads/master
Commit: 0b8f30e8ac3fb0893ffd32f63d132730ddad3d65
Parents: 44e5456
Author: Vinod Kone <vi...@twitter.com>
Authored: Tue May 20 18:44:19 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 17:17:19 2014 -0700
----------------------------------------------------------------------
src/master/master.cpp | 53 ++++++++++++++++++++++++++++-------------
src/tests/master_tests.cpp | 15 ++++++++++++
2 files changed, 51 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/0b8f30e8/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 472eedd..fcbbc26 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1637,12 +1637,16 @@ struct OfferVisitor
virtual ~OfferVisitor() {}
- Slave* getSlave(Master* master, const SlaveID& id) {
- return master->getSlave(id);
+ Slave* getSlave(Master* master, const SlaveID& slaveId)
+ {
+ CHECK_NOTNULL(master);
+ return master->getSlave(slaveId);
}
- Offer* getOffer(Master* master, const OfferID& id) {
- return master->getOffer(id);
+ Offer* getOffer(Master* master, const OfferID& offerId)
+ {
+ CHECK_NOTNULL(master);
+ return master->getOffer(offerId);
}
};
@@ -1672,6 +1676,10 @@ struct FrameworkChecker : OfferVisitor {
Master* master)
{
Offer* offer = getOffer(master, offerId);
+ if (offer == NULL) {
+ return "Offer " + stringify(offerId) + " is no longer valid";
+ }
+
if (!(framework.id == offer->framework_id())) {
return "Offer " + stringify(offer->id()) +
" has invalid framework " + stringify(offer->framework_id()) +
@@ -1693,15 +1701,21 @@ struct SlaveChecker : OfferVisitor
Master* master)
{
Offer* offer = getOffer(master, offerId);
- Slave* slave = getSlave(master, offer->slave_id());
- if (slave == NULL) {
- return "Offer " + stringify(offerId) +
- " outlived slave " + stringify(offer->slave_id());
+ if (offer == NULL) {
+ return "Offer " + stringify(offerId) + " is no longer valid";
}
+ Slave* slave = getSlave(master, offer->slave_id());
+
+ // This is not possible because the offer should've been removed.
+ CHECK(slave != NULL)
+ << "Offer " << offerId
+ << " outlived slave " << offer->slave_id();
+
+ // This is not possible because the offer should've been removed.
CHECK(!slave->disconnected)
- << "Offer " + stringify(offerId)
- << " outlived disconnected slave " << stringify(slave->id);
+ << "Offer " << offerId
+ << " outlived disconnected slave " << offer->slave_id();
if (slaveId.isNone()) {
// Set slave id and use as base case for validation.
@@ -1828,7 +1842,7 @@ void Master::launchTasks(
}
// If offer validation succeeds, we need to pass along the common
- // slave. So optimisticaly, we store the first slave id we see.
+ // slave. So optimistically, we store the first slave id we see.
// In case of invalid offers (different slaves for example), we
// report error and return from launchTask before slaveId is used.
if (slaveId.isNone()) {
@@ -1845,19 +1859,22 @@ void Master::launchTasks(
delete visitor;
};
- // Remove offers.
+ // Remove offers and recover resources if any of the offers are
+ // invalid.
foreach (const OfferID& offerId, offerIds) {
Offer* offer = getOffer(offerId);
- // Explicit check needed if an offerId appears more
- // than once in offerIds.
- if (offer != NULL) {
+ if (offer != NULL) {
+ if (offerError.isSome()) {
+ allocator->resourcesRecovered(
+ offer->framework_id(), offer->slave_id(), offer->resources());
+ }
removeOffer(offer);
}
}
if (offerError.isSome()) {
LOG(WARNING) << "Failed to validate offer " << offerId
- << " : " << offerError.get();
+ << ": " << offerError.get();
foreach (const TaskInfo& task, tasks) {
const StatusUpdate& update = protobuf::createStatusUpdate(
@@ -1874,7 +1891,6 @@ void Master::launchTasks(
message.mutable_update()->CopyFrom(update);
send(framework->pid, message);
}
-
return;
}
@@ -3761,6 +3777,8 @@ void Master::removeTask(Task* task)
}
+// TODO(vinod): Instead of 'removeOffer()', consider implementing
+// 'useOffer()', 'discardOffer()' and 'rescindOffer()' for clarity.
void Master::removeOffer(Offer* offer, bool rescind)
{
// Remove from framework.
@@ -3790,6 +3808,7 @@ void Master::removeOffer(Offer* offer, bool rescind)
delete offer;
}
+
// TODO(bmahler): Consider killing this.
Framework* Master::getFramework(const FrameworkID& frameworkId)
{
http://git-wip-us.apache.org/repos/asf/mesos/blob/0b8f30e8/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index b0ff627..7183cb7 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -41,6 +41,7 @@
#include <stout/os.hpp>
#include <stout/try.hpp>
+#include "master/allocator.hpp"
#include "master/flags.hpp"
#include "master/master.hpp"
@@ -59,6 +60,8 @@ using namespace mesos::internal::tests;
using mesos::internal::master::Master;
+using mesos::internal::master::allocator::AllocatorProcess;
+
using mesos::internal::slave::GarbageCollectorProcess;
using mesos::internal::slave::Slave;
using mesos::internal::slave::Containerizer;
@@ -1300,11 +1303,17 @@ TEST_F(MasterTest, LaunchAcrossSlavesTest)
combinedOffers.push_back(offers1.get()[0].id());
combinedOffers.push_back(offers2.get()[0].id());
+ Future<Nothing> resourcesRecovered =
+ FUTURE_DISPATCH(_, &AllocatorProcess::resourcesRecovered);
+
driver.launchTasks(combinedOffers, tasks);
AWAIT_READY(status);
EXPECT_EQ(TASK_LOST, status.get().state());
+ // The resources of the invalid offers should be recovered.
+ AWAIT_READY(resourcesRecovered);
+
EXPECT_CALL(exec, shutdown(_))
.Times(AtMost(1));
@@ -1372,11 +1381,17 @@ TEST_F(MasterTest, LaunchDuplicateOfferTest)
EXPECT_CALL(sched, statusUpdate(&driver, _))
.WillOnce(FutureArg<1>(&status));
+ Future<Nothing> resourcesRecovered =
+ FUTURE_DISPATCH(_, &AllocatorProcess::resourcesRecovered);
+
driver.launchTasks(combinedOffers, tasks);
AWAIT_READY(status);
EXPECT_EQ(TASK_LOST, status.get().state());
+ // The resources of the invalid offers should be recovered.
+ AWAIT_READY(resourcesRecovered);
+
EXPECT_CALL(exec, shutdown(_))
.Times(AtMost(1));
[3/3] git commit: Fixed master to remove and rescind offers when a
slave is disconnected.
Posted by vi...@apache.org.
Fixed master to remove and rescind offers when a slave is
disconnected.
Review: https://reviews.apache.org/r/21961
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/60865b2f
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/60865b2f
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/60865b2f
Branch: refs/heads/master
Commit: 60865b2f473d9898f1a18eee10f9a59a4862893a
Parents: 0b8f30e
Author: Vinod Kone <vi...@twitter.com>
Authored: Mon May 26 22:25:19 2014 -0700
Committer: Vinod Kone <vi...@twitter.com>
Committed: Wed May 28 17:17:20 2014 -0700
----------------------------------------------------------------------
src/master/master.cpp | 43 ++++++++++++--------------------
src/master/master.hpp | 5 ++--
src/tests/fault_tolerance_tests.cpp | 8 ++++++
3 files changed, 26 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index fcbbc26..766a0e3 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -720,7 +720,21 @@ void Master::exited(const UPID& pid)
} else if (!slave->disconnected) {
// Checkpointing slaves can just be disconnected.
disconnect(slave);
- removeFrameworksAndOffers(slave);
+
+ // Remove all non-checkpointing frameworks.
+ hashset<FrameworkID> frameworkIds =
+ slave->tasks.keys() | slave->executors.keys();
+
+ foreach (const FrameworkID& frameworkId, frameworkIds) {
+ Framework* framework = getFramework(frameworkId);
+ if (framework != NULL && !framework->info.checkpoint()) {
+ LOG(INFO) << "Removing framework " << frameworkId
+ << " from disconnected slave " << *slave
+ << " because the framework is not checkpointing";
+
+ removeFramework(slave, framework);
+ }
+ }
} else {
LOG(WARNING) << "Ignoring duplicate exited() notification for "
<< "checkpointing slave " << *slave;
@@ -1336,37 +1350,12 @@ void Master::disconnect(Slave* slave)
// Remove the slave from authenticated. This is safe because
// a slave will always reauthenticate before (re-)registering.
authenticated.erase(slave->pid);
-}
-
-
-void Master::removeFrameworksAndOffers(Slave* slave)
-{
- CHECK_NOTNULL(slave);
-
- // If a slave is checkpointing, remove all non-checkpointing
- // frameworks from the slave. If the slave is not checkpointing,
- // remove all of its frameworks.
- hashset<FrameworkID> frameworkIds =
- slave->tasks.keys() | slave->executors.keys();
-
- foreach (const FrameworkID& frameworkId, frameworkIds) {
- Framework* framework = getFramework(frameworkId);
- if (framework != NULL &&
- (!framework->info.checkpoint() || !slave->info.checkpoint())) {
- LOG(INFO) << "Removing framework " << frameworkId
- << " from disconnected slave " << *slave << " because "
- << (!slave->info.checkpoint() ? "slave" : "framework")
- << " is not checkpointing";
-
- removeFramework(slave, framework);
- }
- }
+ // Remove and rescind offers.
foreach (Offer* offer, utils::copy(slave->offers)) {
allocator->resourcesRecovered(
offer->framework_id(), slave->id, offer->resources());
- // Remove and rescind offers.
removeOffer(offer, true); // Rescind!
}
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 4c21d9e..d4ef4be 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -286,14 +286,13 @@ protected:
// reschedule offers that were assigned to this framework.
void removeFramework(Framework* framework);
- // Remove a framework from the slave, i.e., kill all of its tasks,
- // remove its offers and reallocate its resources.
+ // Remove a framework from the slave, i.e., remove its tasks and
+ // executors and recover the resources.
void removeFramework(Slave* slave, Framework* framework);
// TODO(adam-mesos): Rename deactivate to disconnect, or v.v.
void deactivate(Framework* framework);
void disconnect(Slave* slave);
- void removeFrameworksAndOffers(Slave* slave);
// Add a slave.
void addSlave(Slave* slave, bool reregister = false);
http://git-wip-us.apache.org/repos/asf/mesos/blob/60865b2f/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index e484a8a..4c6a5c4 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -1834,6 +1834,10 @@ TEST_F(FaultToleranceTest, SlaveReregisterOnZKExpiration)
AWAIT_READY(resourceOffers);
+ Future<Nothing> offerRescinded;
+ EXPECT_CALL(sched, offerRescinded(_, _))
+ .WillOnce(FutureSatisfy(&offerRescinded));
+
Future<SlaveReregisteredMessage> slaveReregisteredMessage =
FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
@@ -1841,6 +1845,10 @@ TEST_F(FaultToleranceTest, SlaveReregisterOnZKExpiration)
// expiration) at the slave.
detector.appoint(master.get());
+ // Since an authenticating slave re-registration results in
+ // disconnecting the slave, its resources should be rescinded.
+ AWAIT_READY(offerRescinded);
+
AWAIT_READY(slaveReregisteredMessage);
driver.stop();