You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2016/08/26 21:52:50 UTC
[3/6] mesos git commit: Added registrar operations for marking agents
(un-)reachable.
Added registrar operations for marking agents (un-)reachable.
Review: https://reviews.apache.org/r/50701/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/af496f3a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/af496f3a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/af496f3a
Branch: refs/heads/master
Commit: af496f3a80da9a8e7961fb62f839aacf1658222e
Parents: 5405914
Author: Neil Conway <ne...@gmail.com>
Authored: Fri Aug 26 14:48:07 2016 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Fri Aug 26 14:48:07 2016 -0700
----------------------------------------------------------------------
src/master/master.hpp | 118 ++++++++++++++++++++++++++++++++++++-
src/tests/registrar_tests.cpp | 95 ++++++++++++++++++++---------
2 files changed, 183 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/af496f3a/src/master/master.hpp
----------------------------------------------------------------------
diff --git a/src/master/master.hpp b/src/master/master.hpp
index 6decff6..b118293 100644
--- a/src/master/master.hpp
+++ b/src/master/master.hpp
@@ -1891,7 +1891,7 @@ private:
};
-// Implementation of slave admission Registrar operation.
+// Add a new slave to the list of admitted slaves.
class AdmitSlave : public Operation
{
public:
@@ -1906,7 +1906,7 @@ protected:
hashset<SlaveID>* slaveIDs,
bool strict)
{
- // Check and see if this slave already exists.
+ // Check and see if this slave is currently admitted.
if (slaveIDs->contains(info.id())) {
if (strict) {
return Error("Agent already admitted");
@@ -1915,6 +1915,9 @@ protected:
}
}
+ // TODO(neilc): Check if the slave appears in the list of
+ // `unreachable` slaves in the registry?
+
Registry::Slave* slave = registry->mutable_slaves()->add_slaves();
slave->mutable_info()->CopyFrom(info);
slaveIDs->insert(info.id());
@@ -1926,6 +1929,117 @@ private:
};
+// Move a slave from the list of admitted slaves to the list of
+// unreachable slaves.
+class MarkSlaveUnreachable : public Operation
+{
+public:
+ explicit MarkSlaveUnreachable(const SlaveInfo& _info) : info(_info) {
+ CHECK(info.has_id()) << "SlaveInfo is missing the 'id' field";
+ }
+
+protected:
+ virtual Try<bool> perform(
+ Registry* registry,
+ hashset<SlaveID>* slaveIDs,
+ bool strict)
+ {
+ // As currently implemented, this should not be possible: the
+ // master will only mark slaves unreachable that are currently
+ // admitted.
+ if (!slaveIDs->contains(info.id())) {
+ return Error("Agent not yet admitted");
+ }
+
+ for (int i = 0; i < registry->slaves().slaves().size(); i++) {
+ const Registry::Slave& slave = registry->slaves().slaves(i);
+
+ if (slave.info().id() == info.id()) {
+ registry->mutable_slaves()->mutable_slaves()->DeleteSubrange(i, 1);
+ slaveIDs->erase(info.id());
+
+ Registry::UnreachableSlave* unreachable =
+ registry->mutable_unreachable()->add_slaves();
+
+ unreachable->mutable_id()->CopyFrom(info.id());
+ unreachable->mutable_timestamp()->CopyFrom(protobuf::getCurrentTime());
+
+ return true; // Mutation.
+ }
+ }
+
+ // Should not happen.
+ return Error("Failed to find agent " + stringify(info.id()));
+ }
+
+private:
+ const SlaveInfo info;
+};
+
+
+// Add a slave back to the list of admitted slaves. The slave will
+// typically be in the "unreachable" list; if so, it is removed from
+// that list. The slave might also be in the "admitted" list already.
+// Finally, the slave might be in neither the "unreachable" or
+// "admitted" lists, if its metadata has been garbage collected from
+// the registry.
+class MarkSlaveReachable : public Operation
+{
+public:
+ explicit MarkSlaveReachable(const SlaveInfo& _info) : info(_info) {
+ CHECK(info.has_id()) << "SlaveInfo is missing the 'id' field";
+ }
+
+protected:
+ virtual Try<bool> perform(
+ Registry* registry,
+ hashset<SlaveID>* slaveIDs,
+ bool strict)
+ {
+ // A slave might try to reregister that appears in the list of
+ // admitted slaves. This can occur when the master fails over:
+ // agents will usually attempt to reregister with the new master
+ // before they are marked unreachable. In this situation, the
+ // registry is already in the correct state, so no changes are
+ // needed.
+ if (slaveIDs->contains(info.id())) {
+ return false; // No mutation.
+ }
+
+ // Check whether the slave is in the unreachable list.
+ // TODO(neilc): Optimize this to avoid linear scan.
+ bool found = false;
+ for (int i = 0; i < registry->unreachable().slaves().size(); i++) {
+ const Registry::UnreachableSlave& slave =
+ registry->unreachable().slaves(i);
+
+ if (slave.id() == info.id()) {
+ registry->mutable_unreachable()->mutable_slaves()->DeleteSubrange(i, 1);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ LOG(WARNING) << "Allowing UNKNOWN agent to reregister: " << info;
+ }
+
+ // Add the slave to the admitted list, even if we didn't find it
+ // in the unreachable list. This accounts for when the slave was
+ // unreachable for a long time, was GC'd from the unreachable
+ // list, but then eventually reregistered.
+ Registry::Slave* slave = registry->mutable_slaves()->add_slaves();
+ slave->mutable_info()->CopyFrom(info);
+ slaveIDs->insert(info.id());
+
+ return true; // Mutation.
+ }
+
+private:
+ const SlaveInfo info;
+};
+
+
// Implementation of slave readmission Registrar operation.
class ReadmitSlave : public Operation
{
http://git-wip-us.apache.org/repos/asf/mesos/blob/af496f3a/src/tests/registrar_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/registrar_tests.cpp b/src/tests/registrar_tests.cpp
index 9a71d8f..b04fc92 100644
--- a/src/tests/registrar_tests.cpp
+++ b/src/tests/registrar_tests.cpp
@@ -236,7 +236,9 @@ TEST_P(RegistrarTest, Recover)
AWAIT_EXPECT_FAILED(
registrar.apply(Owned<Operation>(new AdmitSlave(slave))));
AWAIT_EXPECT_FAILED(
- registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
+ registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(slave))));
+ AWAIT_EXPECT_FAILED(
+ registrar.apply(Owned<Operation>(new MarkSlaveReachable(slave))));
AWAIT_EXPECT_FAILED(
registrar.apply(Owned<Operation>(new RemoveSlave(slave))));
@@ -246,8 +248,10 @@ TEST_P(RegistrarTest, Recover)
// operations to ensure they do not fail.
Future<bool> admit = registrar.apply(
Owned<Operation>(new AdmitSlave(slave)));
- Future<bool> readmit = registrar.apply(
- Owned<Operation>(new ReadmitSlave(slave)));
+ Future<bool> unreachable = registrar.apply(
+ Owned<Operation>(new MarkSlaveUnreachable(slave)));
+ Future<bool> reachable = registrar.apply(
+ Owned<Operation>(new MarkSlaveReachable(slave)));
Future<bool> remove = registrar.apply(
Owned<Operation>(new RemoveSlave(slave)));
@@ -255,7 +259,8 @@ TEST_P(RegistrarTest, Recover)
EXPECT_EQ(master, registry.get().master().info());
AWAIT_TRUE(admit);
- AWAIT_TRUE(readmit);
+ AWAIT_TRUE(unreachable);
+ AWAIT_TRUE(reachable);
AWAIT_TRUE(remove);
}
@@ -275,7 +280,7 @@ TEST_P(RegistrarTest, Admit)
}
-TEST_P(RegistrarTest, Readmit)
+TEST_P(RegistrarTest, MarkReachable)
{
Registrar registrar(flags, state);
AWAIT_READY(registrar.recover(master));
@@ -296,13 +301,51 @@ TEST_P(RegistrarTest, Readmit)
AWAIT_TRUE(registrar.apply(Owned<Operation>(new AdmitSlave(info1))));
- AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(info1))));
+ // Attempting to mark a slave as reachable that is already reachable
+ // should not result in an error.
+ AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
+ AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
- if (flags.registry_strict) {
- AWAIT_FALSE(registrar.apply(Owned<Operation>(new ReadmitSlave(info2))));
- } else {
- AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(info2))));
- }
+ // Attempting to mark a slave as reachable that is not in the
+ // unreachable list should not result in error.
+ AWAIT_TRUE(registrar.apply(Owned<Operation>(new MarkSlaveReachable(info2))));
+}
+
+
+TEST_P(RegistrarTest, MarkUnreachable)
+{
+ Registrar registrar(flags, state);
+ AWAIT_READY(registrar.recover(master));
+
+ SlaveInfo info1;
+ info1.set_hostname("localhost");
+
+ SlaveID id1;
+ id1.set_value("1");
+ info1.mutable_id()->CopyFrom(id1);
+
+ SlaveID id2;
+ id2.set_value("2");
+
+ SlaveInfo info2;
+ info2.set_hostname("localhost");
+ info2.mutable_id()->CopyFrom(id2);
+
+ AWAIT_TRUE(registrar.apply(Owned<Operation>(new AdmitSlave(info1))));
+
+ AWAIT_TRUE(
+ registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
+
+ AWAIT_TRUE(
+ registrar.apply(Owned<Operation>(new MarkSlaveReachable(info1))));
+
+ AWAIT_TRUE(
+ registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
+
+ // If a slave is already unreachable, trying to mark it unreachable
+ // again should fail.
+ AWAIT_FALSE(
+ registrar.apply(Owned<Operation>(new MarkSlaveUnreachable(info1))));
}
@@ -977,20 +1020,17 @@ TEST_P(RegistrarTest, UpdateWeights)
TEST_P(RegistrarTest, Bootstrap)
{
- // Run 1 readmits a slave that is not present.
+ // Run 1 simulates the reregistration of a slave that is not present
+ // in the registry.
{
Registrar registrar(flags, state);
AWAIT_READY(registrar.recover(master));
- // If not strict, we should be allowed to readmit the slave.
- if (flags.registry_strict) {
- AWAIT_FALSE(registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
- } else {
- AWAIT_TRUE(registrar.apply(Owned<Operation>(new ReadmitSlave(slave))));
- }
+ AWAIT_TRUE(
+ registrar.apply(Owned<Operation>(new MarkSlaveReachable(slave))));
}
- // Run 2 should see the slave if not strict.
+ // Run 2 should see the slave.
{
Registrar registrar(flags, state);
@@ -998,12 +1038,8 @@ TEST_P(RegistrarTest, Bootstrap)
AWAIT_READY(registry);
- if (flags.registry_strict) {
- EXPECT_EQ(0, registry.get().slaves().slaves().size());
- } else {
- ASSERT_EQ(1, registry.get().slaves().slaves().size());
- EXPECT_EQ(slave, registry.get().slaves().slaves(0).info());
- }
+ ASSERT_EQ(1, registry.get().slaves().slaves().size());
+ EXPECT_EQ(slave, registry.get().slaves().slaves(0).info());
}
}
@@ -1206,13 +1242,16 @@ TEST_P(Registrar_BENCHMARK_Test, Performance)
// same as in production).
std::random_shuffle(infos.begin(), infos.end());
- // Readmit slaves.
+ // Mark slaves reachable again. This simulates the master failing
+ // over, and then the previously registered agents reregistering
+ // with the new master.
watch.start();
foreach (const SlaveInfo& info, infos) {
- result = registrar.apply(Owned<Operation>(new ReadmitSlave(info)));
+ result = registrar.apply(Owned<Operation>(new MarkSlaveReachable(info)));
}
AWAIT_READY_FOR(result, Minutes(5));
- LOG(INFO) << "Readmitted " << slaveCount << " agents in " << watch.elapsed();
+ LOG(INFO) << "Marked " << slaveCount
+ << " agents reachable in " << watch.elapsed();
// Recover slaves.
Registrar registrar2(flags, state);