You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/04/30 03:31:02 UTC

svn commit: r1477440 - in /incubator/mesos/trunk/src: detector/detector.cpp detector/detector.hpp tests/zookeeper_tests.cpp

Author: bmahler
Date: Tue Apr 30 01:31:02 2013
New Revision: 1477440

URL: http://svn.apache.org/r1477440
Log:
When sending NoMasterDetectedMessage, we need to clear the master
sequence number in order to ensure we still send
NewMasterDetectedMessage if the master is the same. Otherwise, we can
get stuck in a master-less state.

Review: https://reviews.apache.org/r/10161

Modified:
    incubator/mesos/trunk/src/detector/detector.cpp
    incubator/mesos/trunk/src/detector/detector.hpp
    incubator/mesos/trunk/src/tests/zookeeper_tests.cpp

Modified: incubator/mesos/trunk/src/detector/detector.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/detector/detector.cpp?rev=1477440&r1=1477439&r2=1477440&view=diff
==============================================================================
--- incubator/mesos/trunk/src/detector/detector.cpp (original)
+++ incubator/mesos/trunk/src/detector/detector.cpp Tue Apr 30 01:31:02 2013
@@ -60,60 +60,8 @@ using std::vector;
 namespace mesos {
 namespace internal {
 
-// TODO(benh): Make this value configurable via flags and verify that
-// it is always LESS THAN the slave heartbeat timeout.
-const Seconds ZOOKEEPER_SESSION_TIMEOUT(10.0);
 
-
-class ZooKeeperMasterDetectorProcess
-  : public Process<ZooKeeperMasterDetectorProcess>
-{
-public:
-  ZooKeeperMasterDetectorProcess(
-    const zookeeper::URL& url,
-    const UPID& pid,
-    bool contend,
-    bool quiet);
-
-  virtual ~ZooKeeperMasterDetectorProcess();
-
-  virtual void initialize();
-
-  // ZooKeeperMasterDetector implementation.
-  int64_t session();
-
-  // ZooKeeper events.
-  void connected(bool reconnect);
-  void reconnecting();
-  void expired();
-  void updated(const string& path);
-  void created(const string& path);
-  void deleted(const string& path);
-
-private:
-  // Handles reconnecting "timeouts" by prematurely expiring a session
-  // (only used for contending instances). TODO(benh): Remove 'const
-  // &' after fixing libprocess.
-  void timedout(const int64_t& sessionId);
-
-  // Attempts to detect a master.
-  void detectMaster();
-
-  const zookeeper::URL url;
-  const ACL_vector acl;
-
-  const UPID pid;
-  bool contend;
-
-  Watcher* watcher;
-  ZooKeeper* zk;
-
-  bool expire;
-  Option<Timer> timer;
-
-  string currentMasterSeq;
-  UPID currentMasterPID;
-};
+const Duration ZOOKEEPER_SESSION_TIMEOUT = Seconds(10);
 
 
 MasterDetector::~MasterDetector() {}
@@ -246,7 +194,10 @@ ZooKeeperMasterDetectorProcess::ZooKeepe
     contend(_contend),
     watcher(NULL),
     zk(NULL),
-    expire(false)
+    expire(false),
+    timer(),
+    currentMasterSeq(),
+    currentMasterPID()
 {
   // Set verbosity level for underlying ZooKeeper library logging.
   // TODO(benh): Put this in the C++ API.
@@ -438,24 +389,19 @@ void ZooKeeperMasterDetectorProcess::tim
     expire = true;
 
     // We only send a NoMasterDetectedMessage if we are a
-    // contending detector AND ALSO the current leader.
-    // This is because:
-    // 1) If we are a non-contending detector (e.g. slave), a zk session
+    // contending detector. This is because:
+    //    If we are a non-contending detector (e.g. slave), a zk session
     //    expiration doesn't necessarily mean a new leader (master) is elected
     //    (e.g. the slave is partitioned from the zk server). If the leading
     //    master stays the same (i.e., no leader election), then the
     //    slave should still accept a ShutDownMessage from the master.
     //    If a new master does get elected, the slave would know about it
     //    because it would do a leader detection after it connects/re-connects.
-    // 2) If we are a contender but not the leader (e.g. non-leading master),
-    //    sending a NoMasterDetectedMessage() is bad because, a partitioned
-    //    non-leading master would never know about the leading master that
-    //    stays the same (i.e., no leader election) even after it is
-    //    connected/reconnected with the ZooKeeper. This is because, the
-    //    the master detection code (detectMaster()) will not send a
-    //    NewMasterDetectedMessage to the non-leading master as there is no
-    //    change in the currentMasterSeq.
-    if (contend && currentMasterPID == pid) {
+    if (contend) {
+      // TODO(bmahler): We always want to clear the sequence number
+      // prior to sending NoMasterDetectedMessage. It might be prudent
+      // to use a helper function to enforce this.
+      currentMasterSeq = "";  // Clear the master sequence number.
       process::post(pid, NoMasterDetectedMessage());
     }
   }
@@ -504,6 +450,7 @@ void ZooKeeperMasterDetectorProcess::det
   // No master present (lost or possibly hasn't come up yet).
   if (masterSeq.empty()) {
     LOG(INFO) << "Master detector (" << pid << ") couldn't find any masters";
+    currentMasterSeq = "";  // Clear the master sequence number.
     process::post(pid, NoMasterDetectedMessage());
   } else if (masterSeq != currentMasterSeq) {
     // Okay, let's fetch the master pid from ZooKeeper.
@@ -538,6 +485,7 @@ void ZooKeeperMasterDetectorProcess::det
         // might have failed because of DNS, and whoever is using the
         // detector might sit "unconnected" indefinitely!
         LOG(ERROR) << "Failed to parse new master pid!";
+        currentMasterSeq = "";  // Clear the master sequence number.
         process::post(pid, NoMasterDetectedMessage());
       } else {
         currentMasterSeq = masterSeq;

Modified: incubator/mesos/trunk/src/detector/detector.hpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/detector/detector.hpp?rev=1477440&r1=1477439&r2=1477440&view=diff
==============================================================================
--- incubator/mesos/trunk/src/detector/detector.hpp (original)
+++ incubator/mesos/trunk/src/detector/detector.hpp Tue Apr 30 01:31:02 2013
@@ -28,16 +28,22 @@
 #include <process/future.hpp>
 #include <process/pid.hpp>
 #include <process/process.hpp>
+#include <process/timer.hpp>
 
 #include <stout/try.hpp>
 
+#include "zookeeper/authentication.hpp"
 #include "zookeeper/url.hpp"
+#include "zookeeper/zookeeper.hpp"
+
+class Watcher;
+class ZooKeeper;
 
 namespace mesos {
 namespace internal {
 
-class ZooKeeperMasterDetectorProcess; // Forward declaration.
-
+// Forward declarations.
+class ZooKeeperMasterDetectorProcess;
 
 /**
  * Implements functionality for:
@@ -138,10 +144,66 @@ public:
    */
   process::Future<int64_t> session();
 
-private:
+  // Visible for testing.
   ZooKeeperMasterDetectorProcess* process;
 };
 
+
+// TODO(benh): Make this value configurable via flags and verify that
+// it is always LESS THAN the slave heartbeat timeout.
+extern const Duration ZOOKEEPER_SESSION_TIMEOUT;
+
+
+class ZooKeeperMasterDetectorProcess
+  : public process::Process<ZooKeeperMasterDetectorProcess>
+{
+public:
+  ZooKeeperMasterDetectorProcess(
+    const zookeeper::URL& url,
+    const process::UPID& pid,
+    bool contend,
+    bool quiet);
+
+  virtual ~ZooKeeperMasterDetectorProcess();
+
+  virtual void initialize();
+
+  // ZooKeeperMasterDetector implementation.
+  int64_t session();
+
+  // ZooKeeper events.
+  void connected(bool reconnect);
+  void reconnecting();
+  void expired();
+  void updated(const std::string& path);
+  void created(const std::string& path);
+  void deleted(const std::string& path);
+
+private:
+  // Handles reconnecting "timeouts" by prematurely expiring a session
+  // (only used for contending instances). TODO(benh): Remove 'const
+  // &' after fixing libprocess.
+  void timedout(const int64_t& sessionId);
+
+  // Attempts to detect a master.
+  void detectMaster();
+
+  const zookeeper::URL url;
+  const ACL_vector acl;
+
+  const process::UPID pid;
+  bool contend;
+
+  Watcher* watcher;
+  ZooKeeper* zk;
+
+  bool expire;
+  Option<process::Timer> timer;
+
+  std::string currentMasterSeq;
+  process::UPID currentMasterPID;
+};
+
 } // namespace internal {
 } // namespace mesos {
 

Modified: incubator/mesos/trunk/src/tests/zookeeper_tests.cpp
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/tests/zookeeper_tests.cpp?rev=1477440&r1=1477439&r2=1477440&view=diff
==============================================================================
--- incubator/mesos/trunk/src/tests/zookeeper_tests.cpp (original)
+++ incubator/mesos/trunk/src/tests/zookeeper_tests.cpp Tue Apr 30 01:31:02 2013
@@ -304,6 +304,130 @@ TEST_F(ZooKeeperTest, MasterDetectorShut
 }
 
 
+// Tests that a detector sends a NoMasterDetectedMessage when we
+// reach our ZooKeeper session timeout. This is to enforce that we
+// manually expire the session when we don't get reconnected within
+// the ZOOKEEPER_SESSION_TIMEOUT.
+TEST_F(ZooKeeperTest, MasterDetectorTimedoutSession)
+{
+  Try<zookeeper::URL> url =
+    zookeeper::URL::parse("zk://" + server->connectString() + "/mesos");
+  ASSERT_SOME(url);
+
+  // First we bring up three master detector listeners:
+  //   1. A leading contender.
+  //   2. A non-leading contender.
+  //   3. A non-contender.
+
+  // 1. Simulate a leading contender.
+  MockMasterDetectorListenerProcess leader;
+
+  Future<Nothing> newMasterDetected;
+  EXPECT_CALL(leader, newMasterDetected(_))
+    .WillOnce(FutureSatisfy(&newMasterDetected));
+
+  process::spawn(leader);
+
+  ZooKeeperMasterDetector leaderDetector(
+      url.get(), leader.self(), true, true);
+
+  AWAIT_READY(newMasterDetected);
+
+  // 2. Simulate a non-leading contender.
+  MockMasterDetectorListenerProcess follower;
+
+  EXPECT_CALL(follower, newMasterDetected(_))
+    .WillOnce(FutureSatisfy(&newMasterDetected));
+
+  process::spawn(follower);
+
+  ZooKeeperMasterDetector followerDetector(
+      url.get(), follower.self(), true, true);
+
+  AWAIT_READY(newMasterDetected);
+
+  // 3. Simulate a non-contender.
+  MockMasterDetectorListenerProcess nonContender;
+
+  EXPECT_CALL(nonContender, newMasterDetected(_))
+    .WillOnce(FutureSatisfy(&newMasterDetected));
+
+  process::spawn(nonContender);
+
+  ZooKeeperMasterDetector nonContenderDetector(
+      url.get(), nonContender.self(), false, true);
+
+  AWAIT_READY(newMasterDetected);
+
+  // Now we want to induce lost connections on each of the
+  // master detectors.
+  // Induce a reconnection on the leader.
+  Future<Nothing> leaderReconnecting = FUTURE_DISPATCH(
+      leaderDetector.process->self(),
+      &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  dispatch(leaderDetector.process,
+           &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  AWAIT_READY(leaderReconnecting);
+
+  // Induce a reconnection on the follower.
+  Future<Nothing> followerReconnecting = FUTURE_DISPATCH(
+      followerDetector.process->self(),
+      &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  dispatch(followerDetector.process,
+           &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  AWAIT_READY(followerReconnecting);
+
+  // Induce a reconnection on the non-contender.
+  Future<Nothing> nonContenderReconnecting = FUTURE_DISPATCH(
+      nonContenderDetector.process->self(),
+      &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  dispatch(nonContenderDetector.process,
+           &ZooKeeperMasterDetectorProcess::reconnecting);
+
+  AWAIT_READY(nonContenderReconnecting);
+
+  // Now induce the reconnection timeout.
+  Future<Nothing> leaderNoMasterDetected;
+  EXPECT_CALL(leader, noMasterDetected())
+    .WillOnce(FutureSatisfy(&leaderNoMasterDetected));
+
+  Future<Nothing> followerNoMasterDetected;
+  EXPECT_CALL(follower, noMasterDetected())
+    .WillOnce(FutureSatisfy(&followerNoMasterDetected));
+
+  // TODO(bmahler): This will be uncommented by the fix for:
+  // https://issues.apache.org/jira/browse/MESOS-305
+  //  Future<Nothing> nonContenderNoMasterDetected;
+  //  EXPECT_CALL(nonContender, noMasterDetected())
+  //    .WillOnce(FutureSatisfy(&nonContenderNoMasterDetected));
+
+  Clock::pause();
+  Clock::advance(ZOOKEEPER_SESSION_TIMEOUT);
+  Clock::settle();
+
+  AWAIT_READY(leaderNoMasterDetected);
+  AWAIT_READY(followerNoMasterDetected);
+
+  // TODO(bmahler): This will be uncommented by the fix for:
+  // https://issues.apache.org/jira/browse/MESOS-305
+  // AWAIT_READY(nonContenderNoMasterDetected);
+
+  process::terminate(leader);
+  process::wait(leader);
+
+  process::terminate(follower);
+  process::wait(follower);
+
+  process::terminate(nonContender);
+  process::wait(nonContender);
+}
+
+
 // Tests whether a leading master correctly detects a new master when its
 // ZooKeeper session is expired.
 TEST_F(ZooKeeperTest, MasterDetectorExpireMasterZKSession)
@@ -327,8 +451,7 @@ TEST_F(ZooKeeperTest, MasterDetectorExpi
   ASSERT_SOME(url);
 
   // Leader's detector.
-  ZooKeeperMasterDetector leaderDetector(
-      url.get(), leader.self(), true, true);
+  ZooKeeperMasterDetector leaderDetector(url.get(), leader.self(), true, true);
 
   AWAIT_READY(newMasterDetected1);
 
@@ -395,8 +518,7 @@ TEST_F(ZooKeeperTest, MasterDetectorExpi
   ASSERT_SOME(url);
 
   // Leading master's detector.
-  ZooKeeperMasterDetector masterDetector(
-      url.get(), master.self(), true, true);
+  ZooKeeperMasterDetector masterDetector(url.get(), master.self(), true, true);
 
   AWAIT_READY(newMasterDetected1);
 
@@ -414,8 +536,7 @@ TEST_F(ZooKeeperTest, MasterDetectorExpi
   process::spawn(slave);
 
   // Slave's master detector.
-  ZooKeeperMasterDetector slaveDetector(
-      url.get(), slave.self(), false, true);
+  ZooKeeperMasterDetector slaveDetector(url.get(), slave.self(), false, true);
 
   AWAIT_READY(newMasterDetected2);
 
@@ -498,8 +619,7 @@ TEST_F(ZooKeeperTest, MasterDetectorExpi
   process::spawn(slave);
 
   // Slave's master detector.
-  ZooKeeperMasterDetector slaveDetector(
-      url.get(), slave.self(), false, true);
+  ZooKeeperMasterDetector slaveDetector(url.get(), slave.self(), false, true);
 
   AWAIT_READY(newMasterDetected3);