You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/08/19 03:53:21 UTC
git commit: Fixed master to reject completed frameworks from
re-registering.
Repository: mesos
Updated Branches:
refs/heads/master f7b5f5bee -> e9643f477
Fixed master to reject completed frameworks from re-registering.
Review: https://reviews.apache.org/r/20507
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e9643f47
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e9643f47
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e9643f47
Branch: refs/heads/master
Commit: e9643f47720d8c5c9e6278ec068e771b94e94fbc
Parents: f7b5f5b
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Apr 18 15:31:59 2014 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Aug 18 18:52:17 2014 -0700
----------------------------------------------------------------------
src/master/master.cpp | 18 +++-
src/tests/fault_tolerance_tests.cpp | 155 +++++++++++++++++++++++++++++++
2 files changed, 172 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/e9643f47/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e948803..18464ba 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1415,11 +1415,27 @@ void Master::reregisterFramework(
if (!frameworkInfo.has_id() || frameworkInfo.id() == "") {
LOG(ERROR) << "Framework re-registering without an id!";
FrameworkErrorMessage message;
- message.set_message("Framework reregistered without a framework id");
+ message.set_message("Framework reregistering without a framework id");
send(from, message);
return;
}
+ foreach (const shared_ptr<Framework>& framework, frameworks.completed) {
+ if (framework->id == frameworkInfo.id()) {
+ // This could happen if a framework tries to re-register after
+ // its failover timeout has elapsed or it unregistered itself
+ // by calling 'stop()' on the scheduler driver.
+ // TODO(vinod): Master should persist admitted frameworks to the
+ // registry and remove them from it after failover timeout.
+ LOG(WARNING) << "Completed framework " << framework->id
+ << " attempted to re-register";
+ FrameworkErrorMessage message;
+ message.set_message("Completed framework attempted to re-register");
+ send(from, message);
+ return;
+ }
+ }
+
LOG(INFO) << "Received re-registration request from framework "
<< frameworkInfo.id() << " at " << from;
http://git-wip-us.apache.org/repos/asf/mesos/blob/e9643f47/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 5646cd7..b0e74b9 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -42,6 +42,7 @@
#include "common/protobuf_utils.hpp"
+#include "master/allocator.hpp"
#include "master/master.hpp"
#include "slave/slave.hpp"
@@ -941,6 +942,160 @@ TEST_F(FaultToleranceTest, SchedulerFailover)
}
+// This test verifies that a framework attempting to re-register
+// after its failover timeout has elapsed is disallowed.
+TEST_F(FaultToleranceTest, SchedulerReregisterAfterFailoverTimeout)
+{
+ Try<PID<Master> > master = StartMaster();
+ ASSERT_SOME(master);
+
+ // Launch the first (i.e., failing) scheduler and wait until
+ // registered gets called to launch the second (i.e., failover)
+ // scheduler.
+
+ MockScheduler sched1;
+ MesosSchedulerDriver driver1(
+ &sched1, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+ Future<process::Message> frameworkRegisteredMessage =
+ FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+ Future<FrameworkID> frameworkId;
+ EXPECT_CALL(sched1, registered(&driver1, _, _))
+ .WillOnce(FutureArg<1>(&frameworkId));
+
+ driver1.start();
+
+ AWAIT_READY(frameworkRegisteredMessage);
+ AWAIT_READY(frameworkId);
+
+ Future<Nothing> frameworkDeactivated = FUTURE_DISPATCH(
+ _, &master::allocator::AllocatorProcess::frameworkDeactivated);
+
+ Future<Nothing> frameworkFailoverTimeout =
+ FUTURE_DISPATCH(_, &Master::frameworkFailoverTimeout);
+
+ // Simulate framework disconnection.
+ ASSERT_TRUE(process::inject::exited(
+ frameworkRegisteredMessage.get().to, master.get()));
+
+ // Wait until master schedules the framework for removal.
+ AWAIT_READY(frameworkDeactivated);
+
+ // Simulate framework failover timeout.
+ Clock::pause();
+ Clock::settle();
+
+ Try<Duration> failoverTimeout =
+ Duration::create(FrameworkInfo().failover_timeout());
+
+ ASSERT_SOME(failoverTimeout);
+ Clock::advance(failoverTimeout.get());
+
+ // Wait until master actually marks the framework as completed.
+ AWAIT_READY(frameworkFailoverTimeout);
+
+ // Now launch the second (i.e., failover) scheduler using the
+ // framework id recorded from the first scheduler.
+ MockScheduler sched2;
+
+ FrameworkInfo framework2; // Bug in gcc 4.1.*, must assign on next line.
+ framework2 = DEFAULT_FRAMEWORK_INFO;
+ framework2.mutable_id()->MergeFrom(frameworkId.get());
+
+ MesosSchedulerDriver driver2(
+ &sched2, framework2, master.get(), DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched2, registered(&driver2, frameworkId.get(), _))
+ .Times(0);
+
+ Future<Nothing> sched2Error;
+ EXPECT_CALL(sched2, error(&driver2, _))
+ .WillOnce(FutureSatisfy(&sched2Error));
+
+ driver2.start();
+
+ // Framework should get 'Error' message because the framework
+ // with this id is marked as completed.
+ AWAIT_READY(sched2Error);
+
+ driver2.stop();
+ driver2.join();
+ driver1.stop();
+ driver1.join();
+
+ Shutdown();
+}
+
+
+// This test verifies that a framework attempting to re-register
+// after it is unregistered is disallowed.
+TEST_F(FaultToleranceTest, SchedulerReregisterAfterUnregistration)
+{
+ Try<PID<Master> > master = StartMaster();
+ ASSERT_SOME(master);
+
+ // Launch the first (i.e., failing) scheduler and wait until
+ // registered gets called to launch the second (i.e., failover)
+ // scheduler.
+
+ MockScheduler sched1;
+ MesosSchedulerDriver driver1(
+ &sched1, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+ Future<process::Message> frameworkRegisteredMessage =
+ FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+ Future<FrameworkID> frameworkId;
+ EXPECT_CALL(sched1, registered(&driver1, _, _))
+ .WillOnce(FutureArg<1>(&frameworkId));
+
+ driver1.start();
+
+ AWAIT_READY(frameworkRegisteredMessage);
+ AWAIT_READY(frameworkId);
+
+ Future<Nothing> frameworkRemoved = FUTURE_DISPATCH(
+ _, &master::allocator::AllocatorProcess::frameworkRemoved);
+
+ // Unregister the framework.
+ driver1.stop();
+ driver1.join();
+
+ // Wait until master actually marks the framework as completed.
+ AWAIT_READY(frameworkRemoved);
+
+ // Now launch the second (i.e., failover) scheduler using the
+ // framework id recorded from the first scheduler.
+ MockScheduler sched2;
+
+ FrameworkInfo framework2; // Bug in gcc 4.1.*, must assign on next line.
+ framework2 = DEFAULT_FRAMEWORK_INFO;
+ framework2.mutable_id()->MergeFrom(frameworkId.get());
+
+ MesosSchedulerDriver driver2(
+ &sched2, framework2, master.get(), DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched2, registered(&driver2, frameworkId.get(), _))
+ .Times(0);
+
+ Future<Nothing> sched2Error;
+ EXPECT_CALL(sched2, error(&driver2, _))
+ .WillOnce(FutureSatisfy(&sched2Error));
+
+ driver2.start();
+
+ // Framework should get 'Error' message because the framework
+ // with this id is marked as completed.
+ AWAIT_READY(sched2Error);
+
+ driver2.stop();
+ driver2.join();
+
+ Shutdown();
+}
+
+
// This test was added to cover a fix for MESOS-659.
// Here, we drop the initial FrameworkReregisteredMessage from the
// master, so that the scheduler driver retries the initial failover