You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2015/02/28 03:07:11 UTC

[4/5] mesos git commit: Added rate limiting to slave removals during master failover.

Added rate limiting to slave removals during master failover.

Review: https://reviews.apache.org/r/31515


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/26546a4b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/26546a4b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/26546a4b

Branch: refs/heads/master
Commit: 26546a4bb8dd64479cd1dfd1d450b37c59e280cc
Parents: 52c64bd
Author: Benjamin Mahler <be...@gmail.com>
Authored: Tue Feb 24 17:33:55 2015 -0800
Committer: Benjamin Mahler <be...@gmail.com>
Committed: Fri Feb 27 17:52:39 2015 -0800

----------------------------------------------------------------------
 src/master/master.cpp | 95 ++++++++++++++++++++++++++++++++--------------
 src/master/master.hpp |  4 ++
 2 files changed, 71 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/26546a4b/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 4a1b428..53c8696 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -220,9 +220,8 @@ protected:
       acquire = limiter.get()->acquire();
     }
 
-    ++metrics->slave_shutdowns_scheduled;
-
     shuttingDown = acquire.onAny(defer(self(), &Self::_shutdown));
+    ++metrics->slave_shutdowns_scheduled;
   }
 
   void _shutdown()
@@ -1227,40 +1226,80 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
             << " investigate or increase this limit to proceed further";
   }
 
+  // Remove the slaves in a rate limited manner, similar to how the
+  // SlaveObserver removes slaves.
   foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
-    if (!slaves.recovered.contains(slave.info().id())) {
-      continue; // Slave re-registered.
+    Future<Nothing> acquire = Nothing();
+
+    if (slaves.limiter.isSome()) {
+      LOG(INFO) << "Scheduling removal of slave "
+                << slave.info().id() << " (" << slave.info().hostname() << ")"
+                << "; did not re-register within "
+                << flags.slave_reregister_timeout << " after master failover";
+
+      acquire = slaves.limiter.get()->acquire();
     }
 
-    LOG(WARNING) << "Slave " << slave.info().id()
-                 << " (" << slave.info().hostname() << ") did not re-register "
-                 << "within the timeout; removing it from the registrar";
+    // Need to disambiguate for the compiler.
+    // TODO(bmahler): With C++11, just call removeSlave from within
+    // a lambda function to avoid the need to disambiguate.
+    Nothing (Master::*removeSlave)(const Registry::Slave&) = &Self::removeSlave;
+    const string failure = "Slave removal rate limit acquisition failed";
 
-    ++metrics->recovery_slave_removals;
+    acquire
+      .then(defer(self(), removeSlave, slave))
+      .onFailed(lambda::bind(fail, failure, lambda::_1))
+      .onDiscarded(lambda::bind(fail, failure, "discarded"));
 
-    slaves.recovered.erase(slave.info().id());
+    ++metrics->slave_shutdowns_scheduled;
+  }
+}
 
-    if (flags.registry_strict) {
-      slaves.removing.insert(slave.info().id());
 
-      registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
-        .onAny(defer(self(),
-                     &Self::_removeSlave,
-                     slave.info(),
-                     vector<StatusUpdate>(), // No TASK_LOST updates to send.
-                     lambda::_1));
-    } else {
-      // When a non-strict registry is in use, we want to ensure the
-      // registry is used in a write-only manner. Therefore we remove
-      // the slave from the registry but we do not inform the
-      // framework.
-      const string& message =
-        "Failed to remove slave " + stringify(slave.info().id());
-
-      registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
-        .onFailed(lambda::bind(fail, message, lambda::_1));
-    }
+Nothing Master::removeSlave(const Registry::Slave& slave)
+{
+  // The slave is removed from 'recovered' when it re-registers.
+  if (!slaves.recovered.contains(slave.info().id())) {
+    LOG(INFO) << "Canceling removal of slave "
+              << slave.info().id() << " (" << slave.info().hostname() << ")"
+              << " since it re-registered!";
+
+    ++metrics->slave_shutdowns_canceled;
+
+    return Nothing();
   }
+
+  LOG(WARNING) << "Slave " << slave.info().id()
+               << " (" << slave.info().hostname() << ") did not re-register"
+               << " within " << flags.slave_reregister_timeout
+               << " after master failover; removing it from the registrar";
+
+  ++metrics->recovery_slave_removals;
+
+  slaves.recovered.erase(slave.info().id());
+
+  if (flags.registry_strict) {
+    slaves.removing.insert(slave.info().id());
+
+    registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
+      .onAny(defer(self(),
+                   &Self::_removeSlave,
+                   slave.info(),
+                   vector<StatusUpdate>(), // No TASK_LOST updates to send.
+                   lambda::_1));
+  } else {
+    // When a non-strict registry is in use, we want to ensure the
+    // registry is used in a write-only manner. Therefore we remove
+    // the slave from the registry but we do not inform the
+    // framework.
+    const string& message =
+      "Failed to remove slave " + stringify(slave.info().id());
+
+    registrar->apply(Owned<Operation>(new RemoveSlave(slave.info())))
+      .onFailed(lambda::bind(fail, message, lambda::_1));
+  }
+
+  return Nothing();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/26546a4b/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 3e8a8dc..ce0e0b3 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -362,6 +362,10 @@ protected:
       const std::vector<Archive::Framework>& completedFrameworks =
         std::vector<Archive::Framework>());
 
+  // Remove the slave from the registrar. Called when the slave
+  // does not re-register in time after a master failover.
+  Nothing removeSlave(const Registry::Slave& slave);
+
   // Remove the slave from the registrar and from the master's state.
   void removeSlave(Slave* slave);
   void _removeSlave(