You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by ar...@apache.org on 2021/03/31 21:21:07 UTC

[zookeeper] branch branch-3.7 updated: ZOOKEEPER-4269: acceptedEpoch.tmp rename failure will cause server startup error

This is an automated email from the ASF dual-hosted git repository.

arshad pushed a commit to branch branch-3.7
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/branch-3.7 by this push:
     new e892430  ZOOKEEPER-4269: acceptedEpoch.tmp rename failure will cause server startup error
e892430 is described below

commit e892430a2c72188193300e44be9e0724caee41e9
Author: Mohammad Arshad <ar...@apache.org>
AuthorDate: Thu Apr 1 02:52:49 2021 +0530

    ZOOKEEPER-4269: acceptedEpoch.tmp rename failure will cause server startup error
    
    Using accepted epoch from acceptedEpoch.tmp if it is available
    
    Author: Mohammad Arshad <ar...@apache.org>
    
    Reviewers: Enrico Olivelli <eo...@apache.org>,Damien Diederen <dd...@crosstwine.com>
    
    Closes #1664 from arshadmohammad/ZOOKEEPER-4269-master and squashes the following commits:
    
    d8ae084c1 [Mohammad Arshad] Handled review comments
    a2cc9a5b6 [Mohammad Arshad] Added test cases
    b56837e55 [Mohammad Arshad] ZOOKEEPER-4269: acceptedEpoch.tmp rename failure will cause server startup error Using accepted epoch from acceptedEpoch.tmp if it is available
    
    (cherry picked from commit cdddda4c55acf29d4e0b2bc8f3de7b5c676e8ffc)
    Signed-off-by: Mohammad Arshad <ar...@apache.org>
---
 .../zookeeper/common/AtomicFileOutputStream.java   |   2 +-
 .../apache/zookeeper/server/quorum/QuorumPeer.java |  17 ++-
 .../quorum/CurrentEpochWriteFailureTest.java       | 118 +++++++++++++++++++++
 3 files changed, 132 insertions(+), 5 deletions(-)

diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/common/AtomicFileOutputStream.java b/zookeeper-server/src/main/java/org/apache/zookeeper/common/AtomicFileOutputStream.java
index 1574815..d6b7cf6 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/common/AtomicFileOutputStream.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/common/AtomicFileOutputStream.java
@@ -45,7 +45,7 @@ import org.slf4j.LoggerFactory;
  */
 public class AtomicFileOutputStream extends FilterOutputStream {
 
-    private static final String TMP_EXTENSION = ".tmp";
+    public static final String TMP_EXTENSION = ".tmp";
 
     private static final Logger LOG = LoggerFactory.getLogger(AtomicFileOutputStream.class);
 
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 3102c63..48d2afd 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -52,6 +52,7 @@ import java.util.stream.IntStream;
 import javax.security.sasl.SaslException;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.zookeeper.KeeperException.BadArgumentsException;
+import org.apache.zookeeper.common.AtomicFileOutputStream;
 import org.apache.zookeeper.common.AtomicFileWritingIdiom;
 import org.apache.zookeeper.common.AtomicFileWritingIdiom.WriterStatement;
 import org.apache.zookeeper.common.QuorumX509Util;
@@ -1162,10 +1163,18 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
                 writeLongToFile(CURRENT_EPOCH_FILENAME, currentEpoch);
             }
             if (epochOfZxid > currentEpoch) {
-                throw new IOException("The current epoch, "
-                                      + ZxidUtils.zxidToString(currentEpoch)
-                                      + ", is older than the last zxid, "
-                                      + lastProcessedZxid);
+                // currentEpoch.tmp file in snapshot directory
+                File currentTmp = new File(getTxnFactory().getSnapDir(),
+                    CURRENT_EPOCH_FILENAME + AtomicFileOutputStream.TMP_EXTENSION);
+                if (currentTmp.exists()) {
+                    long epochOfTmp = readLongFromFile(currentTmp.getName());
+                    LOG.info("{} found. Setting current epoch to {}.", currentTmp, epochOfTmp);
+                    setCurrentEpoch(epochOfTmp);
+                } else {
+                    throw new IOException(
+                        "The current epoch, " + ZxidUtils.zxidToString(currentEpoch)
+                            + ", is older than the last zxid, " + lastProcessedZxid);
+                }
             }
             try {
                 acceptedEpoch = readLongFromFile(ACCEPTED_EPOCH_FILENAME);
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CurrentEpochWriteFailureTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CurrentEpochWriteFailureTest.java
new file mode 100644
index 0000000..ccd4d49
--- /dev/null
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/CurrentEpochWriteFailureTest.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.io.File;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.common.AtomicFileOutputStream;
+import org.apache.zookeeper.test.ClientBase;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CurrentEpochWriteFailureTest extends QuorumPeerTestBase {
+    protected static final Logger LOG = LoggerFactory.getLogger(CurrentEpochWriteFailureTest.class);
+    private Servers servers;
+    private int clientPort;
+
+    @AfterEach
+    public void tearDown() throws InterruptedException {
+        if (servers != null) {
+            servers.shutDownAllServers();
+        }
+    }
+
+    /*
+     * ZOOKEEPER-4269:
+     * The current epoch is first written to the temporary file currentEpoch.tmp, and that
+     * file is then renamed to currentEpoch.
+     * If the rename fails, either because of an exception or a power-off, currentEpoch keeps
+     * the old epoch and server startup fails with the message "The current epoch, x, is older
+     * than the last zxid, y".
+     * To handle this scenario the current epoch is also read from this temp file.
+     */
+    @Test
+    public void testReadCurrentEpochFromAcceptedEpochTmpFile() throws Exception {
+        startServers();
+        writeSomeData();
+
+        restartServers();
+        writeSomeData();
+
+        MainThread firstServer = servers.mt[0];
+
+        // The servers have been started twice, so the current epoch must be 2
+        long currentEpoch = firstServer.getQuorumPeer().getCurrentEpoch();
+        assertEquals(2, currentEpoch);
+
+        // Initialize files for later use
+        File snapDir = firstServer.getQuorumPeer().getTxnFactory().getSnapDir();
+        File currentEpochFile = new File(snapDir, QuorumPeer.CURRENT_EPOCH_FILENAME);
+        File currentEpochTempFile = new File(snapDir,
+            QuorumPeer.CURRENT_EPOCH_FILENAME + AtomicFileOutputStream.TMP_EXTENSION);
+
+        // Shutdown servers
+        servers.shutDownAllServers();
+        waitForAll(servers, ZooKeeper.States.CONNECTING);
+
+        // Simulate a failed rename of currentEpoch.tmp to currentEpoch.
+        // In this case the currentEpoch file holds the old epoch and currentEpoch.tmp holds the latest epoch
+        FileUtils.write(currentEpochFile, Long.toString(currentEpoch - 1), "UTF-8");
+        FileUtils.write(currentEpochTempFile, Long.toString(currentEpoch), "UTF-8");
+
+        // Restart the servers; all servers should restart successfully.
+        servers.restartAllServersAndClients(this);
+
+        // Check the first server where problem was injected.
+        assertTrue(ClientBase
+                .waitForServerUp("127.0.0.1:" + firstServer.getClientPort(), CONNECTION_TIMEOUT),
+            "server " + firstServer.getMyid()
+                + " is not up as file currentEpoch.tmp rename to currentEpoch file was failed"
+                + " which lead current epoch inconsistent state.");
+    }
+
+    private void restartServers() throws InterruptedException, IOException {
+        servers.shutDownAllServers();
+        waitForAll(servers, ZooKeeper.States.CONNECTING);
+        servers.restartAllServersAndClients(this);
+        waitForAll(servers, ZooKeeper.States.CONNECTED);
+    }
+
+    private void writeSomeData() throws Exception {
+        ZooKeeper client = ClientBase.createZKClient("127.0.0.1:" + clientPort);
+        String path = "/somePath" + System.currentTimeMillis();
+        String data = "someData";
+        client.create(path, data.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
+        byte[] data1 = client.getData(path, false, null);
+        assertEquals(data, new String(data1));
+        client.close();
+    }
+
+    private void startServers() throws Exception {
+        servers = LaunchServers(3);
+        clientPort = servers.clientPorts[0];
+    }
+}