You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by an...@apache.org on 2019/12/17 12:48:32 UTC
[zookeeper] branch master updated: ZOOKEEPER-2307: ZooKeeper not
starting because acceptedEpoch is less than the currentEpoch
This is an automated email from the ASF dual-hosted git repository.
andor pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git
The following commit(s) were added to refs/heads/master by this push:
new 19d8567 ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
19d8567 is described below
commit 19d85670a3c87565bc63a5f84169e4c6c72e2915
Author: Mohammad Arshad <ar...@apache.org>
AuthorDate: Tue Dec 17 13:48:25 2019 +0100
ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
Update acceptedEpoch and currentEpoch in file first then in memory.
Author: Mohammad Arshad <ar...@apache.org>
Reviewers: andor@apache.org
Closes #1145 from arshadmohammad/ZOOKEEPER-2307-epochUpdate and squashes the following commits:
b05bc1f1c [Mohammad Arshad] review comment fix
c8d620f39 [Mohammad Arshad] ZOOKEEPER-2307:ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
---
.../apache/zookeeper/server/quorum/QuorumPeer.java | 7 +-
.../server/quorum/EpochWriteFailureTest.java | 152 +++++++++++++++++++++
.../apache/zookeeper/server/quorum/Zab1_0Test.java | 23 +++-
3 files changed, 177 insertions(+), 5 deletions(-)
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 72d06ec..dba5b27 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -2074,7 +2074,8 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
* @param value the long value to write to the named file
* @throws IOException if the file cannot be written atomically
*/
- private void writeLongToFile(String name, final long value) throws IOException {
+ // visibleForTest
+ void writeLongToFile(String name, final long value) throws IOException {
File file = new File(logFactory.getSnapDir(), name);
new AtomicFileWritingIdiom(file, new WriterStatement() {
@Override
@@ -2099,14 +2100,14 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
}
public void setCurrentEpoch(long e) throws IOException {
- currentEpoch = e;
writeLongToFile(CURRENT_EPOCH_FILENAME, e);
+ currentEpoch = e;
}
public void setAcceptedEpoch(long e) throws IOException {
- acceptedEpoch = e;
writeLongToFile(ACCEPTED_EPOCH_FILENAME, e);
+ acceptedEpoch = e;
}
public boolean processReconfig(QuorumVerifier qv, Long suggestedLeaderId, Long zxid, boolean restartLE) {
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java
new file mode 100644
index 0000000..526dc0b
--- /dev/null
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/EpochWriteFailureTest.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.PortAssignment;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.ClientBase.CountdownWatcher;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class EpochWriteFailureTest extends QuorumPeerTestBase {
+ private static int SERVER_COUNT = 3;
+ private static int[] clientPorts = new int[SERVER_COUNT];
+ private static MainThread[] mt = new MainThread[SERVER_COUNT];
+ private static ZooKeeper zk;
+
+ /*
+ * Test case for https://issues.apache.org/jira/browse/ZOOKEEPER-2307
+ * Expectation: During leader election when accepted epoch write to file
+ * fails, it should not complete leader election, also it should not update
+ * run time values of acceptedEpoch,
+ */
+ @Test(timeout = 120000)
+ public void testAcceptedEpochWriteFailure() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ sb.append("admin.enableServer=false");
+ sb.append("\n");
+ String server;
+ for (int i = 0; i < SERVER_COUNT; i++) {
+ clientPorts[i] = PortAssignment.unique();
+ server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":"
+ + PortAssignment.unique() + ":participant;127.0.0.1:" + clientPorts[i];
+ sb.append(server);
+ sb.append("\n");
+ }
+ String currentQuorumCfgSection = sb.toString();
+ for (int i = 0; i < SERVER_COUNT - 1; i++) {
+ mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection, false);
+ mt[i].start();
+ }
+
+ // ensure two servers started
+ for (int i = 0; i < SERVER_COUNT - 1; i++) {
+ Assert.assertTrue("waiting for server " + i + " being up",
+ ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[i], CONNECTION_TIMEOUT));
+ }
+
+ CountdownWatcher watch1 = new CountdownWatcher();
+ zk = new ZooKeeper("127.0.0.1:" + clientPorts[0], ClientBase.CONNECTION_TIMEOUT,
+ watch1);
+ watch1.waitForConnected(ClientBase.CONNECTION_TIMEOUT);
+
+ String data = "originalData";
+ zk.create("/epochIssue", data.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+
+ //initialize third server
+ mt[2] = new MainThread(2, clientPorts[2], currentQuorumCfgSection, false) {
+
+ @Override
+ public TestQPMain getTestQPMain() {
+ return new MockTestQPMain();
+ }
+ };
+
+ //This server has problem it fails while writing acceptedEpoch.
+ mt[2].start();
+
+ /*
+ * Verify that problematic server does not start as acceptedEpoch update
+ * failure is injected and it keeps on trying to join the quorum
+ */
+
+ Assert.assertFalse("verify server 2 not started",
+ ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[2], CONNECTION_TIMEOUT / 2));
+
+ QuorumPeer quorumPeer = mt[2].getQuorumPeer();
+
+ Assert.assertEquals("acceptedEpoch must not have changed", 0,
+ quorumPeer.getAcceptedEpoch());
+ Assert.assertEquals("currentEpoch must not have changed", 0,
+ quorumPeer.getCurrentEpoch());
+ }
+
+ static class CustomQuorumPeer extends QuorumPeer {
+ CustomQuorumPeer(Map<Long, QuorumServer> quorumPeers, File snapDir, File logDir, int clientPort,
+ int electionAlg, long myid, int tickTime, int initLimit, int syncLimit,
+ int connectToLearnerMasterLimit) throws IOException {
+ super(quorumPeers, snapDir, logDir, clientPort, electionAlg, myid, tickTime, initLimit, syncLimit, connectToLearnerMasterLimit);
+ }
+
+ @Override
+ protected void writeLongToFile(String name, long value) throws IOException {
+ // initial epoch writing should be successful
+ if (0 != value) {
+ throw new IOException("Input/output error");
+ }
+ }
+ }
+
+ private static class MockTestQPMain extends TestQPMain {
+ @Override
+ public void runFromConfig(QuorumPeerConfig config)
+ throws IOException {
+ quorumPeer = new CustomQuorumPeer(config.getQuorumVerifier().getAllMembers(),
+ config.getDataDir(), config.getDataLogDir(),
+ config.getClientPortAddress().getPort(), config.getElectionAlg(),
+ config.getServerId(), config.getTickTime(), config.getInitLimit(),
+ config.getSyncLimit(), config.getSyncLimit());
+ quorumPeer.start();
+ try {
+ quorumPeer.join();
+ } catch (InterruptedException e) {
+ LOG.warn("Quorum Peer interrupted", e);
+ }
+ }
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws InterruptedException {
+ for (int i = 0; i < SERVER_COUNT; i++) {
+ if (mt[i] != null) {
+ mt[i].shutdown();
+ }
+ }
+ if (zk != null) {
+ zk.close();
+ }
+ }
+}
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
index 5302416..9f8486e 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java
@@ -60,6 +60,7 @@ import org.apache.zookeeper.server.ZKDatabase;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
import org.apache.zookeeper.server.util.ZxidUtils;
+import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.TestUtils;
import org.apache.zookeeper.txn.CreateSessionTxn;
import org.apache.zookeeper.txn.CreateTxn;
@@ -852,7 +853,7 @@ public class Zab1_0Test extends ZKTestCase {
assertEquals(Leader.NEWLEADER, qp.getType());
assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
assertEquals(1, l.self.getAcceptedEpoch());
- assertEquals(1, l.self.getCurrentEpoch());
+ assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);
qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
oa.writeRecord(qp, null);
@@ -893,7 +894,7 @@ public class Zab1_0Test extends ZKTestCase {
assertEquals(Leader.NEWLEADER, qp.getType());
assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
assertEquals(1, l.self.getAcceptedEpoch());
- assertEquals(1, l.self.getCurrentEpoch());
+ assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);
qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
oa.writeRecord(qp, null);
@@ -1212,4 +1213,22 @@ public class Zab1_0Test extends ZKTestCase {
}
}
+ /*
+ * Epoch is first written to file then updated in memory. Give some time to
+ * write the epoch in file and then go for assert.
+ */
+ private void assertCurrentEpochGotUpdated(int expected, QuorumPeer self, long timeout)
+ throws IOException {
+ long elapsedTime = 0;
+ long waitInterval = 10;
+ while (self.getCurrentEpoch() != expected && elapsedTime < timeout) {
+ try {
+ Thread.sleep(waitInterval);
+ } catch (InterruptedException e) {
+ fail("CurrentEpoch update failed");
+ }
+ elapsedTime = elapsedTime + waitInterval;
+ }
+ assertEquals("CurrentEpoch update failed", expected, self.getCurrentEpoch());
+ }
}