You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2014/04/17 04:28:17 UTC
[4/6] git commit: Added a configurable limit on the percentage of
slaves that can be removed after the re-registration timeout.
Added a configurable limit on the percentage of slaves that can be removed after the re-registration timeout.
Review: https://reviews.apache.org/r/20097
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8e02ce20
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8e02ce20
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8e02ce20
Branch: refs/heads/master
Commit: 8e02ce2070faf676ee977bc31e2c412dc4652cb7
Parents: 1a88f09
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Fri Mar 28 17:15:56 2014 -0700
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Wed Apr 16 19:17:04 2014 -0700
----------------------------------------------------------------------
src/local/local.cpp | 4 ---
src/master/constants.cpp | 1 +
src/master/constants.hpp | 11 +++++++
src/master/flags.hpp | 21 +++++++++++++
src/master/main.cpp | 4 ---
src/master/master.cpp | 71 +++++++++++++++++++++++++++++++++++++------
src/tests/cluster.hpp | 4 ---
7 files changed, 94 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/local/local.cpp
----------------------------------------------------------------------
diff --git a/src/local/local.cpp b/src/local/local.cpp
index 7daa5ec..297f35b 100644
--- a/src/local/local.cpp
+++ b/src/local/local.cpp
@@ -116,10 +116,6 @@ PID<Master> launch(const Flags& flags, Allocator* _allocator)
<< "master flags from the environment: " << load.error();
}
- if (flags.registry_strict) {
- EXIT(1) << "Cannot run with --registry_strict; currently not supported";
- }
-
if (flags.registry == "in_memory") {
storage = new state::InMemoryStorage();
} else {
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/master/constants.cpp
----------------------------------------------------------------------
diff --git a/src/master/constants.cpp b/src/master/constants.cpp
index 1cb8f22..ed966bc 100644
--- a/src/master/constants.cpp
+++ b/src/master/constants.cpp
@@ -33,6 +33,7 @@ const double MIN_CPUS = 0.1;
const Bytes MIN_MEM = Megabytes(32);
const Duration SLAVE_PING_TIMEOUT = Seconds(15);
const uint32_t MAX_SLAVE_PING_TIMEOUTS = 5;
+const double RECOVERY_SLAVE_REMOVAL_PERCENT_LIMIT = 1.0; // 100%.
const size_t MAX_DEACTIVATED_SLAVES = 100000;
const uint32_t MAX_COMPLETED_FRAMEWORKS = 50;
const uint32_t MAX_COMPLETED_TASKS_PER_FRAMEWORK = 1000;
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/master/constants.hpp
----------------------------------------------------------------------
diff --git a/src/master/constants.hpp b/src/master/constants.hpp
index 52d8d77..27ae4f8 100644
--- a/src/master/constants.hpp
+++ b/src/master/constants.hpp
@@ -53,6 +53,17 @@ extern const Duration SLAVE_PING_TIMEOUT;
// Maximum number of ping timeouts until slave is considered failed.
extern const uint32_t MAX_SLAVE_PING_TIMEOUTS;
+// Default limit on the percentage of slaves that will be removed
+// after recovering if no re-registration attempts were made.
+// TODO(bmahler): There's no value here that works for all setups.
+// Currently the default is 100% which is favorable to those running
+// small clusters or experimenting with Mesos. However, it's important
+// that we also prevent the catastrophic 100% removal case for
+// production clusters. This TODO is to provide a --production flag
+// which would allow flag defaults that are more appropriate for
+// production use-cases.
+extern const double RECOVERY_SLAVE_REMOVAL_PERCENT_LIMIT;
+
// Maximum number of deactivated slaves to store in the cache.
extern const size_t MAX_DEACTIVATED_SLAVES;
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/master/flags.hpp
----------------------------------------------------------------------
diff --git a/src/master/flags.hpp b/src/master/flags.hpp
index 024f86d..acf3963 100644
--- a/src/master/flags.hpp
+++ b/src/master/flags.hpp
@@ -26,6 +26,8 @@
#include "logging/flags.hpp"
+#include "master/constants.hpp"
+
namespace mesos {
namespace internal {
namespace master {
@@ -72,6 +74,24 @@ public:
"bootstrap the persistent state on a running cluster.",
false);
+ // TODO(bmahler): Add a 'Percentage' abstraction for flags.
+ // TODO(bmahler): Add a --production flag for production defaults.
+ add(&Flags::recovery_slave_removal_limit,
+ "recovery_slave_removal_limit",
+ "For failovers, limit on the percentage of slaves that can be removed\n"
+ "from the registry *and* shutdown after the re-registration timeout\n"
+ "elapses. If the limit is exceeded, the master will fail over rather\n"
+ "than remove the slaves.\n"
+ "This can be used to provide safety guarantees for production\n"
+ "environments. Production environments may expect that across Master\n"
+ "failovers, at most a certain percentage of slaves will fail\n"
+ "permanently (e.g. due to rack-level failures).\n"
+ "Setting this limit would ensure that a human needs to get\n"
+ "involved should an unexpected widespread failure of slaves occur\n"
+ "in the cluster.\n"
+ "Values: [0%-100%]",
+ stringify(RECOVERY_SLAVE_REMOVAL_PERCENT_LIMIT * 100.0) + "%");
+
add(&Flags::webui_dir,
"webui_dir",
"Location of the webui files/assets",
@@ -141,6 +161,7 @@ public:
std::string work_dir;
std::string registry;
bool registry_strict;
+ std::string recovery_slave_removal_limit;
std::string webui_dir;
std::string whitelist;
std::string user_sorter;
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/master/main.cpp
----------------------------------------------------------------------
diff --git a/src/master/main.cpp b/src/master/main.cpp
index f12f20a..ec23781 100644
--- a/src/master/main.cpp
+++ b/src/master/main.cpp
@@ -156,10 +156,6 @@ int main(int argc, char** argv)
allocator::Allocator* allocator =
new allocator::Allocator(allocatorProcess);
- if (flags.registry_strict) {
- EXIT(1) << "Cannot run with --registry_strict; currently not supported";
- }
-
state::Storage* storage = NULL;
if (flags.registry == "in_memory") {
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index 3803c60..b37c6f2 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -35,6 +35,7 @@
#include <stout/memory.hpp>
#include <stout/multihashmap.hpp>
#include <stout/nothing.hpp>
+#include <stout/numify.hpp>
#include <stout/os.hpp>
#include <stout/path.hpp>
#include <stout/stringify.hpp>
@@ -252,9 +253,6 @@ Master::Master(
}
info_.set_hostname(hostname);
-
- LOG(INFO) << "Master ID: " << info_.id()
- << " Hostname: " << info_.hostname();
}
@@ -263,8 +261,39 @@ Master::~Master() {}
void Master::initialize()
{
- LOG(INFO) << "Master started on " << string(self()).substr(7);
+ LOG(INFO) << "Master " << info_.id() << " (" << info_.hostname() << ")"
+ << " started on " << string(self()).substr(7);
+
+ if (flags.registry_strict) {
+ EXIT(1) << "Cannot run with --registry_strict; currently not supported";
+ }
+
+ // Parse the percentage for the slave removal limit.
+ // TODO(bmahler): Add a 'Percentage' abstraction.
+ if (!strings::endsWith(flags.recovery_slave_removal_limit, "%")) {
+ EXIT(1) << "Invalid value '" << flags.recovery_slave_removal_limit << "' "
+ << "for --recovery_slave_removal_percent_limit: " << "missing '%'";
+ }
+
+ Try<double> limit = numify<double>(
+ strings::remove(
+ flags.recovery_slave_removal_limit,
+ "%",
+ strings::SUFFIX));
+
+ if (limit.isError()) {
+ EXIT(1) << "Invalid value '" << flags.recovery_slave_removal_limit << "' "
+ << "for --recovery_slave_removal_percent_limit: " << limit.error();
+ }
+
+ if (limit.get() < 0.0 || limit.get() > 100.0) {
+ EXIT(1) << "Invalid value '" << flags.recovery_slave_removal_limit << "' "
+ << "for --recovery_slave_removal_percent_limit: "
+ << "Must be within [0%-100%]";
+ }
+
+ // Validate authentication flags.
if (flags.authenticate) {
LOG(INFO) << "Master only allowing authenticated frameworks to register!";
@@ -276,7 +305,7 @@ void Master::initialize()
LOG(INFO) << "Master allowing unauthenticated frameworks to register!!";
}
-
+ // Load credentials.
if (flags.credentials.isSome()) {
vector<Credential> credentials;
@@ -775,11 +804,33 @@ void Master::recoveredSlavesTimeout(const Registry& registry)
{
CHECK(elected());
- // TODO(bmahler): Provide a (configurable) limit on the number of
- // slaves that can be removed here, e.g. maximum 10% of slaves can
- // be removed after failover if they do not re-register.
- // This can serve as a configurable safety net for operators of
- // production environments.
+ // TODO(bmahler): Add a 'Percentage' abstraction.
+ Try<double> limit_ = numify<double>(
+ strings::remove(
+ flags.recovery_slave_removal_limit,
+ "%",
+ strings::SUFFIX));
+
+ CHECK_SOME(limit_);
+
+ double limit = limit_.get() / 100.0;
+
+ // Compute the percentage of slaves to be removed, if it exceeds the
+ // safety-net limit, bail!
+ double removalPercentage =
+ (1.0 * slaves.recovered.size()) /
+ (1.0 * registry.slaves().slaves().size());
+
+ if (removalPercentage > limit) {
+ EXIT(1) << "Post-recovery slave removal limit exceeded! After "
+ << SLAVE_PING_TIMEOUT * MAX_SLAVE_PING_TIMEOUTS
+ << " there were " << slaves.recovered.size()
+ << " (" << removalPercentage * 100 << "%) slaves recovered from the"
+ << " registry that did not re-register: \n"
+ << stringify(slaves.recovered) << "\n "
+ << " The configured removal limit is " << limit * 100 << "%. Please"
+ << " investigate or increase this limit to proceed further";
+ }
foreach (const Registry::Slave& slave, registry.slaves().slaves()) {
if (!slaves.recovered.contains(slave.info().id())) {
http://git-wip-us.apache.org/repos/asf/mesos/blob/8e02ce20/src/tests/cluster.hpp
----------------------------------------------------------------------
diff --git a/src/tests/cluster.hpp b/src/tests/cluster.hpp
index 8479fe3..49d1f40 100644
--- a/src/tests/cluster.hpp
+++ b/src/tests/cluster.hpp
@@ -275,10 +275,6 @@ inline Try<process::PID<master::Master> > Cluster::Masters::start(
new master::allocator::Allocator(allocatorProcess.get());
}
- if (flags.registry_strict) {
- EXIT(1) << "Cannot run with --registry_strict; currently not supported";
- }
-
if (flags.registry == "in_memory") {
master.storage = new state::InMemoryStorage();
} else {