You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by fp...@apache.org on 2013/10/21 23:35:13 UTC

svn commit: r1534388 - in /zookeeper/branches/branch-3.4: ./ src/java/main/org/apache/zookeeper/server/quorum/ src/java/test/org/apache/zookeeper/test/

Author: fpj
Date: Mon Oct 21 21:35:12 2013
New Revision: 1534388

URL: http://svn.apache.org/r1534388
Log:
ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)


Modified:
    zookeeper/branches/branch-3.4/CHANGES.txt
    zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java
    zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java
    zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java
    zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java
    zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java

Modified: zookeeper/branches/branch-3.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/CHANGES.txt?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/CHANGES.txt (original)
+++ zookeeper/branches/branch-3.4/CHANGES.txt Mon Oct 21 21:35:12 2013
@@ -137,6 +137,8 @@ BUGFIXES:
 
   ZOOKEEPER-1558. Leader should not snapshot uncommitted state (fpj)
 
+  ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)
+
 IMPROVEMENTS:
 
   ZOOKEEPER-1564. Allow JUnit test build with IBM Java

Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java (original)
+++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java Mon Oct 21 21:35:12 2013
@@ -66,6 +66,14 @@ public class FastLeaderElection implemen
      */
 
     final static int maxNotificationInterval = 60000;
+    
+    /**
+     * This value is passed to the methods that check the quorum
+     * majority of an established ensemble for those values that
+     * should not be taken into account in the comparison 
+     * (electionEpoch and zxid). 
+     */
+    final static int IGNOREVALUE = -1;
 
     /**
      * Connection manager. Fast leader election uses TCP for
@@ -324,7 +332,7 @@ public class FastLeaderElection implemen
                                             ToSend.mType.notification,
                                             current.getId(),
                                             current.getZxid(),
-                                            logicalclock,
+                                            current.getElectionEpoch(),
                                             self.getPeerState(),
                                             response.sid,
                                             current.getPeerEpoch());
@@ -867,15 +875,25 @@ public class FastLeaderElection implemen
                             }
                         }
 
-                        /**
+                        /*
                          * Before joining an established ensemble, verify that
                          * a majority are following the same leader.
+                         * Only peer epoch is used to check that the votes come
+                         * from the same ensemble. This is because there is at
+                         * least one corner case in which the ensemble can be
+                         * created with inconsistent zxid and election epoch
+                         * info. However, given that only one ensemble can be
+                         * running at a single point in time and that each 
+                         * epoch is used only once, using only the epoch to 
+                         * compare the votes is sufficient.
+                         * 
+                         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
                          */
-                        outofelection.put(n.sid, new Vote(n.leader, n.zxid,
-                                n.electionEpoch, n.peerEpoch, n.state));
+                        outofelection.put(n.sid, new Vote(n.leader, 
+                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
                         if (termPredicate(outofelection, new Vote(n.leader,
-                                n.zxid, n.electionEpoch, n.peerEpoch, n.state))
-                                && checkLeader(outofelection, n.leader, n.electionEpoch)) {
+                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
+                                && checkLeader(outofelection, n.leader, IGNOREVALUE)) {
                             synchronized(this){
                                 logicalclock = n.electionEpoch;
                                 self.setPeerState((n.leader == self.getId()) ?

Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java (original)
+++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java Mon Oct 21 21:35:12 2013
@@ -945,6 +945,15 @@ public class Leader {
                 + " ]; starting up and setting last processed zxid: 0x{}",
                 Long.toHexString(zk.getZxid()));
         zk.startup();
+        /*
+         * Update the election vote here to ensure that all members of the
+         * ensemble report the same vote to new servers that start up and
+         * send leader election notifications to the ensemble.
+         * 
+         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+         */
+        self.updateElectionVote(getEpoch());
+
         zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());
     }
 

Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java (original)
+++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java Mon Oct 21 21:35:12 2013
@@ -435,6 +435,15 @@ public class Learner {       
         writePacket(ack, true);
         sock.setSoTimeout(self.tickTime * self.syncLimit);
         zk.startup();
+        /*
+         * Update the election vote here to ensure that all members of the
+         * ensemble report the same vote to new servers that start up and
+         * send leader election notifications to the ensemble.
+         * 
+         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+         */
+        self.updateElectionVote(newEpoch);
+
         // We need to log the stuff that came in between the snapshot and the uptodate
         if (zk instanceof FollowerZooKeeperServer) {
             FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;

Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java (original)
+++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java Mon Oct 21 21:35:12 2013
@@ -1194,4 +1194,21 @@ public class QuorumPeer extends Thread i
 		acceptedEpoch = e;
 		writeLongToFile(ACCEPTED_EPOCH_FILENAME, e);
 	}
+
+    /**
+     * Updates leader election info to avoid inconsistencies when
+     * a new server tries to join the ensemble.
+     * See ZOOKEEPER-1732 for more info.
+     */
+    protected void updateElectionVote(long newEpoch) {
+        Vote currentVote = getCurrentVote();
+        if (currentVote != null) {
+            setCurrentVote(new Vote(currentVote.getId(),
+                currentVote.getZxid(),
+                currentVote.getElectionEpoch(),
+                newEpoch,
+                currentVote.getState()));
+        }
+    }
+
 }

Modified: zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java
URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java?rev=1534388&r1=1534387&r2=1534388&view=diff
==============================================================================
--- zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java (original)
+++ zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java Mon Oct 21 21:35:12 2013
@@ -403,4 +403,67 @@ public class FLETest extends ZKTestCase 
             }
         }
     }
+
+    /*
+     * For ZOOKEEPER-1732 verify that it is possible to join an ensemble with
+     * inconsistent election round information.
+     */
+    @Test
+    public void testJoinInconsistentEnsemble() throws Exception {
+        int sid;
+        QuorumPeer peer;
+        int waitTime = 10 * 1000;
+        ArrayList<QuorumPeer> peerList = new ArrayList<QuorumPeer>();
+        for(sid = 0; sid < 3; sid++) {
+            peers.put(Long.valueOf(sid),
+                    new QuorumServer(sid,
+                            new InetSocketAddress(PortAssignment.unique()),
+                    new InetSocketAddress(PortAssignment.unique())));
+            tmpdir[sid] = ClientBase.createTmpDir();
+            port[sid] = PortAssignment.unique();
+        }
+        // start 2 peers and verify if they form the cluster
+        for (sid = 0; sid < 2; sid++) {
+            peer = new QuorumPeer(peers, tmpdir[sid], tmpdir[sid],
+                                             port[sid], 3, sid, 2000, 2, 2);
+            LOG.info("Starting peer " + peer.getId());
+            peer.start();
+            peerList.add(sid, peer);
+        }
+        peer = peerList.get(0);
+        VerifyState v1 = new VerifyState(peerList.get(0));
+        v1.start();
+        v1.join(waitTime);
+        Assert.assertFalse("Unable to form cluster in " +
+            waitTime + " ms",
+            !v1.isSuccess());
+        // Change the election round for one of the members of the ensemble
+        long leaderSid = peer.getCurrentVote().getId();
+        long zxid = peer.getCurrentVote().getZxid();
+        long electionEpoch = peer.getCurrentVote().getElectionEpoch();
+        ServerState state = peer.getCurrentVote().getState();
+        long peerEpoch = peer.getCurrentVote().getPeerEpoch();
+        Vote newVote = new Vote(leaderSid, zxid+100, electionEpoch+100, peerEpoch, state);
+        peer.setCurrentVote(newVote);
+        // Start 3rd peer and check if it joins the quorum
+        peer = new QuorumPeer(peers, tmpdir[2], tmpdir[2],
+                 port[2], 3, 2, 2000, 2, 2);
+        LOG.info("Starting peer " + peer.getId());
+        peer.start();
+        peerList.add(sid, peer);
+        v1 = new VerifyState(peer);
+        v1.start();
+        v1.join(waitTime);
+        if (v1.isAlive()) {
+               Assert.fail("Peer " + peer.getId() + " failed to join the cluster " +
+                "within " + waitTime + " ms");
+        }
+        // cleanup
+        for (int id = 0; id < 3; id++) {
+            peer = peerList.get(id);
+            if (peer != null) {
+                peer.shutdown();
+            }
+        }
+    }
 }