You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by an...@apache.org on 2019/12/17 12:48:32 UTC

[zookeeper] branch master updated: ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less than the currentEpoch

This is an automated email from the ASF dual-hosted git repository.

andor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 19d8567  ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
19d8567 is described below

commit 19d85670a3c87565bc63a5f84169e4c6c72e2915
Author: Mohammad Arshad <ar...@apache.org>
AuthorDate: Tue Dec 17 13:48:25 2019 +0100

    ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
    
    Update acceptedEpoch and currentEpoch in file first then in memory.
    
    Author: Mohammad Arshad <ar...@apache.org>
    
    Reviewers: andor@apache.org
    
    Closes #1145 from arshadmohammad/ZOOKEEPER-2307-epochUpdate and squashes the following commits:
    
    b05bc1f1c [Mohammad Arshad] review comment fix
    c8d620f39 [Mohammad Arshad] ZOOKEEPER-2307:ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
---
 .../apache/zookeeper/server/quorum/QuorumPeer.java |   7 +-
 .../server/quorum/EpochWriteFailureTest.java       | 152 +++++++++++++++++++++
 .../apache/zookeeper/server/quorum/Zab1_0Test.java |  23 +++-
 3 files changed, 177 insertions(+), 5 deletions(-)

diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 72d06ec..dba5b27 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -2074,7 +2074,8 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
      * @param value the long value to write to the named file
      * @throws IOException if the file cannot be written atomically
      */
-    private void writeLongToFile(String name, final long value) throws IOException {
+    // visibleForTest
+     void writeLongToFile(String name, final long value) throws IOException {
         File file = new File(logFactory.getSnapDir(), name);
         new AtomicFileWritingIdiom(file, new WriterStatement() {
             @Override
@@ -2099,14 +2100,14 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
     }
 
     public void setCurrentEpoch(long e) throws IOException {
-        currentEpoch = e;
         writeLongToFile(CURRENT_EPOCH_FILENAME, e);
+        currentEpoch = e;
 
     }
 
     public void setAcceptedEpoch(long e) throws IOException {
-        acceptedEpoch = e;
         writeLongToFile(ACCEPTED_EPOCH_FILENAME, e);
+        acceptedEpoch = e;
     }
 
     public boolean processReconfig(QuorumVerifier qv, Long suggestedLeaderId, Long zxid, boolean restartLE) {
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java
new file mode 100644
index 0000000..526dc0b
--- /dev/null
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.PortAssignment;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.ClientBase.CountdownWatcher;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class EpochWriteFailureTest extends QuorumPeerTestBase {
+    private static int SERVER_COUNT = 3;
+    private static int[] clientPorts = new int[SERVER_COUNT];
+    private static MainThread[] mt = new MainThread[SERVER_COUNT];
+    private static ZooKeeper zk;
+
+    /*
+     * Test case for https://issues.apache.org/jira/browse/ZOOKEEPER-2307
+     * Expectation: During leader election when accepted epoch write to file
+     * fails, it should not complete leader election, also it should not update
+     * run time values of acceptedEpoch,
+     */
+    @Test(timeout = 120000)
+    public void testAcceptedEpochWriteFailure() throws Exception {
+        StringBuilder sb = new StringBuilder();
+        sb.append("admin.enableServer=false");
+        sb.append("\n");
+        String server;
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            clientPorts[i] = PortAssignment.unique();
+            server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":"
+                    + PortAssignment.unique() + ":participant;127.0.0.1:" + clientPorts[i];
+            sb.append(server);
+            sb.append("\n");
+        }
+        String currentQuorumCfgSection = sb.toString();
+        for (int i = 0; i < SERVER_COUNT - 1; i++) {
+            mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection, false);
+            mt[i].start();
+        }
+
+        // ensure two servers started
+        for (int i = 0; i < SERVER_COUNT - 1; i++) {
+            Assert.assertTrue("waiting for server " + i + " being up",
+                    ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[i], CONNECTION_TIMEOUT));
+        }
+
+        CountdownWatcher watch1 = new CountdownWatcher();
+        zk = new ZooKeeper("127.0.0.1:" + clientPorts[0], ClientBase.CONNECTION_TIMEOUT,
+                watch1);
+        watch1.waitForConnected(ClientBase.CONNECTION_TIMEOUT);
+
+        String data = "originalData";
+        zk.create("/epochIssue", data.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+
+        //initialize third server
+        mt[2] = new MainThread(2, clientPorts[2], currentQuorumCfgSection, false) {
+
+            @Override
+            public TestQPMain getTestQPMain() {
+                return new MockTestQPMain();
+            }
+        };
+
+        //This server has problem it fails while writing acceptedEpoch.
+        mt[2].start();
+
+        /*
+         * Verify that problematic server does not start as acceptedEpoch update
+         * failure is injected and it keeps on trying to join the quorum
+         */
+
+        Assert.assertFalse("verify server 2 not started",
+                ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[2], CONNECTION_TIMEOUT / 2));
+
+        QuorumPeer quorumPeer = mt[2].getQuorumPeer();
+
+        Assert.assertEquals("acceptedEpoch must not have changed", 0,
+                quorumPeer.getAcceptedEpoch());
+        Assert.assertEquals("currentEpoch must not have changed", 0,
+                quorumPeer.getCurrentEpoch());
+    }
+
+    static class CustomQuorumPeer extends QuorumPeer {
+        CustomQuorumPeer(Map<Long, QuorumServer> quorumPeers, File snapDir, File logDir, int clientPort,
+                         int electionAlg, long myid, int tickTime, int initLimit, int syncLimit,
+                         int connectToLearnerMasterLimit) throws IOException {
+            super(quorumPeers, snapDir, logDir, clientPort, electionAlg, myid, tickTime, initLimit, syncLimit, connectToLearnerMasterLimit);
+        }
+
+        @Override
+        protected void writeLongToFile(String name, long value) throws IOException {
+            // initial epoch writing should be successful
+            if (0 != value) {
+                throw new IOException("Input/output error");
+            }
+        }
+    }
+
+    private static class MockTestQPMain extends TestQPMain {
+        @Override
+        public void runFromConfig(QuorumPeerConfig config)
+                throws IOException {
+            quorumPeer = new CustomQuorumPeer(config.getQuorumVerifier().getAllMembers(),
+                    config.getDataDir(), config.getDataLogDir(),
+                    config.getClientPortAddress().getPort(), config.getElectionAlg(),
+                    config.getServerId(), config.getTickTime(), config.getInitLimit(),
+                    config.getSyncLimit(), config.getSyncLimit());
+            quorumPeer.start();
+            try {
+                quorumPeer.join();
+            } catch (InterruptedException e) {
+                LOG.warn("Quorum Peer interrupted", e);
+            }
+        }
+    }
+
+    @AfterClass
+    public static void tearDownAfterClass() throws InterruptedException {
+        for (int i = 0; i < SERVER_COUNT; i++) {
+            if (mt[i] != null) {
+                mt[i].shutdown();
+            }
+        }
+        if (zk != null) {
+            zk.close();
+        }
+    }
+}
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
index 5302416..9f8486e 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
@@ -60,6 +60,7 @@ import org.apache.zookeeper.server.ZKDatabase;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
 import org.apache.zookeeper.server.util.ZxidUtils;
+import org.apache.zookeeper.test.ClientBase;
 import org.apache.zookeeper.test.TestUtils;
 import org.apache.zookeeper.txn.CreateSessionTxn;
 import org.apache.zookeeper.txn.CreateTxn;
@@ -852,7 +853,7 @@ public class Zab1_0Test extends ZKTestCase {
                 assertEquals(Leader.NEWLEADER, qp.getType());
                 assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
                 assertEquals(1, l.self.getAcceptedEpoch());
-                assertEquals(1, l.self.getCurrentEpoch());
+                assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);
 
                 qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
                 oa.writeRecord(qp, null);
@@ -893,7 +894,7 @@ public class Zab1_0Test extends ZKTestCase {
                 assertEquals(Leader.NEWLEADER, qp.getType());
                 assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
                 assertEquals(1, l.self.getAcceptedEpoch());
-                assertEquals(1, l.self.getCurrentEpoch());
+                assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);
 
                 qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
                 oa.writeRecord(qp, null);
@@ -1212,4 +1213,22 @@ public class Zab1_0Test extends ZKTestCase {
         }
     }
 
+    /*
+     * Epoch is first written to file then updated in memory. Give some time to
+     * write the epoch in file and then go for assert.
+     */
+    private void assertCurrentEpochGotUpdated(int expected, QuorumPeer self, long timeout)
+            throws IOException {
+        long elapsedTime = 0;
+        long waitInterval = 10;
+        while (self.getCurrentEpoch() != expected && elapsedTime < timeout) {
+            try {
+                Thread.sleep(waitInterval);
+            } catch (InterruptedException e) {
+                fail("CurrentEpoch update failed");
+            }
+            elapsedTime = elapsedTime + waitInterval;
+        }
+        assertEquals("CurrentEpoch update failed", expected, self.getCurrentEpoch());
+    }
 }