You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2015/02/28 03:07:11 UTC
[4/5] mesos git commit: Added rate limiting to slave removals during
master failover.
Added rate limiting to slave removals during master failover.
Review: https://reviews.apache.org/r/31515
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/26546a4b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/26546a4b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/26546a4b
Branch: refs/heads/master
Commit: 26546a4bb8dd64479cd1dfd1d450b37c59e280cc
Parents: 52c64bd
Author: Benjamin Mahler <be...@gmail.com>
Authored: Tue Feb 24 17:33:55 2015 -0800
Committer: Benjamin Mahler <be...@gmail.com>
Committed: Fri Feb 27 17:52:39 2015 -0800
----------------------------------------------------------------------
src/master/master.cpp | 95 ++++++++++++++++++++++++++++++++--------------
src/master/master.hpp | 4 ++
2 files changed, 71 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/26546a4b/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 4a1b428..53c8696 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -220,9 +220,8 @@ protected:
acquire = limiter.get()->acquire();
}
- ++metrics->slave_shutdowns_scheduled;
-
shuttingDown = acquire.onAny(defer(self(), &Self::_shutdown));
+ ++metrics->slave_shutdowns_scheduled;
}
void _shutdown()
@@ -1227,40 +1226,80 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
<< " investigate or increase this limit to proceed further";
}
+ // Remove the slaves in a rate limited manner, similar to how the
+ // SlaveObserver removes slaves.
foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
- if (!slaves.recovered.contains(slave.info().id())) {
- continue; // Slave re-registered.
+ Future<Nothing> acquire = Nothing();
+
+ if (slaves.limiter.isSome()) {
+ LOG(INFO) << "Scheduling removal of slave "
+ << slave.info().id() << " (" << slave.info().hostname() << ")"
+ << "; did not re-register within "
+ << flags.slave_reregister_timeout << " after master failover";
+
+ acquire = slaves.limiter.get()->acquire();
}
- LOG(WARNING) << "Slave " << slave.info().id()
- << " (" << slave.info().hostname() << ") did not re-register "
- << "within the timeout; removing it from the registrar";
+ // Need to disambiguate for the compiler.
+ // TODO(bmahler): With C++11, just call removeSlave from within
+ // a lambda function to avoid the need to disambiguate.
+ Nothing (Master::*removeSlave)(const Registry::Slave&) = &Self::removeSlave;
+ const string failure = "Slave removal rate limit acquisition failed";
- ++metrics->recovery_slave_removals;
+ acquire
+ .then(defer(self(), removeSlave, slave))
+ .onFailed(lambda::bind(fail, failure, lambda::_1))
+ .onDiscarded(lambda::bind(fail, failure, "discarded"));
- slaves.recovered.erase(slave.info().id());
+ ++metrics->slave_shutdowns_scheduled;
+ }
+}
- if (flags.registry_strict) {
- slaves.removing.insert(slave.info().id());
- registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
- .onAny(defer(self(),
- &Self::_removeSlave,
- slave.info(),
- vector<StatusUpdate>(), // No TASK_LOST updates to send.
- lambda::_1));
- } else {
- // When a non-strict registry is in use, we want to ensure the
- // registry is used in a write-only manner. Therefore we remove
- // the slave from the registry but we do not inform the
- // framework.
- const string& message =
- "Failed to remove slave " + stringify(slave.info().id());
-
- registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
- .onFailed(lambda::bind(fail, message, lambda::_1));
- }
+Nothing Master::removeSlave(const Registry::Slave& slave)
+{
+ // The slave is removed from 'recovered' when it re-registers.
+ if (!slaves.recovered.contains(slave.info().id())) {
+ LOG(INFO) << "Canceling removal of slave "
+ << slave.info().id() << " (" << slave.info().hostname() << ")"
+ << " since it re-registered!";
+
+ ++metrics->slave_shutdowns_canceled;
+
+ return Nothing();
}
+
+ LOG(WARNING) << "Slave " << slave.info().id()
+ << " (" << slave.info().hostname() << ") did not re-register"
+ << " within " << flags.slave_reregister_timeout
+ << " after master failover; removing it from the registrar";
+
+ ++metrics->recovery_slave_removals;
+
+ slaves.recovered.erase(slave.info().id());
+
+ if (flags.registry_strict) {
+ slaves.removing.insert(slave.info().id());
+
+ registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
+ .onAny(defer(self(),
+ &Self::_removeSlave,
+ slave.info(),
+ vector<StatusUpdate>(), // No TASK_LOST updates to send.
+ lambda::_1));
+ } else {
+ // When a non-strict registry is in use, we want to ensure the
+ // registry is used in a write-only manner. Therefore we remove
+ // the slave from the registry but we do not inform the
+ // framework.
+ const string& message =
+ "Failed to remove slave " + stringify(slave.info().id());
+
+ registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
+ .onFailed(lambda::bind(fail, message, lambda::_1));
+ }
+
+ return Nothing();
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/26546a4b/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 3e8a8dc..ce0e0b3 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -362,6 +362,10 @@ protected:
const std::vector<Archive::Framework>& completedFrameworks =
std::vector<Archive::Framework>());
+ // Remove the slave from the registrar. Called when the slave
+ // does not re-register in time after a master failover.
+ Nothing removeSlave(const Registry::Slave& slave);
+
// Remove the slave from the registrar and from the master's state.
void removeSlave(Slave* slave);
void _removeSlave(