You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by vi...@apache.org on 2014/08/19 03:53:21 UTC

git commit: Fixed master to reject completed frameworks from re-registering.

Repository: mesos
Updated Branches:
  refs/heads/master f7b5f5bee -> e9643f477


Fixed master to reject completed frameworks from re-registering.

Review: https://reviews.apache.org/r/20507


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e9643f47
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e9643f47
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e9643f47

Branch: refs/heads/master
Commit: e9643f47720d8c5c9e6278ec068e771b94e94fbc
Parents: f7b5f5b
Author: Vinod Kone <vi...@twitter.com>
Authored: Fri Apr 18 15:31:59 2014 -0700
Committer: Vinod Kone <vi...@gmail.com>
Committed: Mon Aug 18 18:52:17 2014 -0700

----------------------------------------------------------------------
 src/master/master.cpp               |  18 +++-
 src/tests/fault_tolerance_tests.cpp | 155 +++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e9643f47/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e948803..18464ba 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -1415,11 +1415,27 @@ void Master::reregisterFramework(
   if (!frameworkInfo.has_id() || frameworkInfo.id() == "") {
     LOG(ERROR) << "Framework re-registering without an id!";
     FrameworkErrorMessage message;
-    message.set_message("Framework reregistered without a framework id");
+    message.set_message("Framework reregistering without a framework id");
     send(from, message);
     return;
   }
 
+  foreach (const shared_ptr<Framework>& framework, frameworks.completed) {
+    if (framework->id == frameworkInfo.id()) {
+      // This could happen if a framework tries to re-register after
+      // its failover timeout has elapsed or it unregistered itself
+      // by calling 'stop()' on the scheduler driver.
+      // TODO(vinod): Master should persist admitted frameworks to the
+      // registry and remove them from it after failover timeout.
+      LOG(WARNING) << "Completed framework " << framework->id
+                   << " attempted to re-register";
+      FrameworkErrorMessage message;
+      message.set_message("Completed framework attempted to re-register");
+      send(from, message);
+      return;
+    }
+  }
+
   LOG(INFO) << "Received re-registration request from framework "
             << frameworkInfo.id() << " at " << from;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/e9643f47/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 5646cd7..b0e74b9 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -42,6 +42,7 @@
 
 #include "common/protobuf_utils.hpp"
 
+#include "master/allocator.hpp"
 #include "master/master.hpp"
 
 #include "slave/slave.hpp"
@@ -941,6 +942,160 @@ TEST_F(FaultToleranceTest, SchedulerFailover)
 }
 
 
+// This test verifies that a framework attempting to re-register
+// after its failover timeout has elapsed is disallowed.
+TEST_F(FaultToleranceTest, SchedulerReregisterAfterFailoverTimeout)
+{
+  Try<PID<Master> > master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Launch the first (i.e., failing) scheduler and wait until
+  // registered gets called to launch the second (i.e., failover)
+  // scheduler.
+
+  MockScheduler sched1;
+  MesosSchedulerDriver driver1(
+      &sched1, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  Future<process::Message> frameworkRegisteredMessage =
+    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched1, registered(&driver1, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  driver1.start();
+
+  AWAIT_READY(frameworkRegisteredMessage);
+  AWAIT_READY(frameworkId);
+
+  Future<Nothing> frameworkDeactivated = FUTURE_DISPATCH(
+      _, &master::allocator::AllocatorProcess::frameworkDeactivated);
+
+  Future<Nothing> frameworkFailoverTimeout =
+    FUTURE_DISPATCH(_, &Master::frameworkFailoverTimeout);
+
+  // Simulate framework disconnection.
+  ASSERT_TRUE(process::inject::exited(
+      frameworkRegisteredMessage.get().to, master.get()));
+
+  // Wait until master schedules the framework for removal.
+  AWAIT_READY(frameworkDeactivated);
+
+  // Simulate framework failover timeout.
+  Clock::pause();
+  Clock::settle();
+
+  Try<Duration> failoverTimeout =
+    Duration::create(FrameworkInfo().failover_timeout());
+
+  ASSERT_SOME(failoverTimeout);
+  Clock::advance(failoverTimeout.get());
+
+  // Wait until master actually marks the framework as completed.
+  AWAIT_READY(frameworkFailoverTimeout);
+
+  // Now launch the second (i.e., failover) scheduler using the
+  // framework id recorded from the first scheduler.
+  MockScheduler sched2;
+
+  FrameworkInfo framework2; // Bug in gcc 4.1.*, must assign on next line.
+  framework2 = DEFAULT_FRAMEWORK_INFO;
+  framework2.mutable_id()->MergeFrom(frameworkId.get());
+
+  MesosSchedulerDriver driver2(
+      &sched2, framework2, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched2, registered(&driver2, frameworkId.get(), _))
+    .Times(0);
+
+  Future<Nothing> sched2Error;
+  EXPECT_CALL(sched2, error(&driver2, _))
+    .WillOnce(FutureSatisfy(&sched2Error));
+
+  driver2.start();
+
+  // Framework should get 'Error' message because the framework
+  // with this id is marked as completed.
+  AWAIT_READY(sched2Error);
+
+  driver2.stop();
+  driver2.join();
+  driver1.stop();
+  driver1.join();
+
+  Shutdown();
+}
+
+
+// This test verifies that a framework attempting to re-register
+// after it is unregistered is disallowed.
+TEST_F(FaultToleranceTest, SchedulerReregisterAfterUnregistration)
+{
+  Try<PID<Master> > master = StartMaster();
+  ASSERT_SOME(master);
+
+  // Launch the first (i.e., failing) scheduler and wait until
+  // registered gets called to launch the second (i.e., failover)
+  // scheduler.
+
+  MockScheduler sched1;
+  MesosSchedulerDriver driver1(
+      &sched1, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  Future<process::Message> frameworkRegisteredMessage =
+    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched1, registered(&driver1, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  driver1.start();
+
+  AWAIT_READY(frameworkRegisteredMessage);
+  AWAIT_READY(frameworkId);
+
+  Future<Nothing> frameworkRemoved = FUTURE_DISPATCH(
+      _, &master::allocator::AllocatorProcess::frameworkRemoved);
+
+  // Unregister the framework.
+  driver1.stop();
+  driver1.join();
+
+  // Wait until master actually marks the framework as completed.
+  AWAIT_READY(frameworkRemoved);
+
+  // Now launch the second (i.e., failover) scheduler using the
+  // framework id recorded from the first scheduler.
+  MockScheduler sched2;
+
+  FrameworkInfo framework2; // Bug in gcc 4.1.*, must assign on next line.
+  framework2 = DEFAULT_FRAMEWORK_INFO;
+  framework2.mutable_id()->MergeFrom(frameworkId.get());
+
+  MesosSchedulerDriver driver2(
+      &sched2, framework2, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched2, registered(&driver2, frameworkId.get(), _))
+    .Times(0);
+
+  Future<Nothing> sched2Error;
+  EXPECT_CALL(sched2, error(&driver2, _))
+    .WillOnce(FutureSatisfy(&sched2Error));
+
+  driver2.start();
+
+  // Framework should get 'Error' message because the framework
+  // with this id is marked as completed.
+  AWAIT_READY(sched2Error);
+
+  driver2.stop();
+  driver2.join();
+
+  Shutdown();
+}
+
+
 // This test was added to cover a fix for MESOS-659.
 // Here, we drop the initial FrameworkReregisteredMessage from the
 // master, so that the scheduler driver retries the initial failover