You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by nk...@apache.org on 2020/01/28 16:26:36 UTC

[zookeeper] branch branch-3.6 updated: ZOOKEEPER-3701: Split brain on log disk full

This is an automated email from the ASF dual-hosted git repository.

nkalmar pushed a commit to branch branch-3.6
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/branch-3.6 by this push:
     new 9b6badb  ZOOKEEPER-3701: Split brain on log disk full
9b6badb is described below

commit 9b6badb5b43d6b6d2ef5c84ad5659d48cf968558
Author: Andor Molnar <an...@apache.org>
AuthorDate: Tue Jan 28 17:26:14 2020 +0100

    ZOOKEEPER-3701: Split brain on log disk full
    
    Issue described here:
    https://issues.apache.org/jira/browse/ZOOKEEPER-3701
    
    Proposing a fix with catching `IOException` within the truncate method to properly return with `false` if truncate fails.
    
    Author: Andor Molnar <an...@apache.org>
    Author: Andor Molnar <an...@cloudera.com>
    
    Reviewers: Ivan Kelly <iv...@apache.org>, Enrico Olivelli <eo...@apache.org>, Norbert Kalmar <nk...@apache.org>
    
    Closes #1233 from anmolnar/ZOOKEEPER-3701
    
    (cherry picked from commit a4bc9857e19b6fa3832813d7e47f683b582565b8)
    Signed-off-by: Norbert Kalmar <nk...@apache.org>
---
 .../zookeeper/server/persistence/FileSnap.java     |  2 +
 .../zookeeper/server/persistence/FileTxnLog.java   |  3 +-
 .../server/persistence/FileTxnSnapLog.java         | 49 ++++++++++++--------
 .../zookeeper/server/SnapshotDigestTest.java       |  2 +
 .../zookeeper/server/quorum/LearnerTest.java       | 53 ++++++++++++++++++++++
 .../org/apache/zookeeper/test/TruncateTest.java    |  7 ++-
 6 files changed, 92 insertions(+), 24 deletions(-)

diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileSnap.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileSnap.java
index fde577a..b608c21 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileSnap.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileSnap.java
@@ -260,6 +260,8 @@ public class FileSnap implements SnapShot {
                     Util.getZxidFromName(snapShot.getName(), SNAPSHOT_FILE_PREFIX),
                     snapShot.lastModified() / 1000);
             }
+        } else {
+            throw new IOException("FileSnap has already been closed");
         }
     }
 
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
index 973e741..62969ba 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
@@ -20,6 +20,7 @@ package org.apache.zookeeper.server.persistence;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
+import java.io.Closeable;
 import java.io.EOFException;
 import java.io.File;
 import java.io.FileInputStream;
@@ -93,7 +94,7 @@ import org.slf4j.LoggerFactory;
  *     0 padded to EOF (filled during preallocation stage)
  * </pre></blockquote>
  */
-public class FileTxnLog implements TxnLog {
+public class FileTxnLog implements TxnLog, Closeable {
 
     private static final Logger LOG;
 
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
index b6014ef..0ec12d8 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
@@ -499,23 +499,28 @@ public class FileTxnSnapLog {
      * @return true if able to truncate the log, false if not
      * @throws IOException
      */
-    public boolean truncateLog(long zxid) throws IOException {
-        // close the existing txnLog and snapLog
-        close();
-
-        // truncate it
-        FileTxnLog truncLog = new FileTxnLog(dataDir);
-        boolean truncated = truncLog.truncate(zxid);
-        truncLog.close();
-
-        // re-open the txnLog and snapLog
-        // I'd rather just close/reopen this object itself, however that
-        // would have a big impact outside ZKDatabase as there are other
-        // objects holding a reference to this object.
-        txnLog = new FileTxnLog(dataDir);
-        snapLog = new FileSnap(snapDir);
-
-        return truncated;
+    public boolean truncateLog(long zxid) {
+        try {
+            // close the existing txnLog and snapLog
+            close();
+
+            // truncate it
+            try (FileTxnLog truncLog = new FileTxnLog(dataDir)) {
+                boolean truncated = truncLog.truncate(zxid);
+
+                // re-open the txnLog and snapLog
+                // I'd rather just close/reopen this object itself, however that
+                // would have a big impact outside ZKDatabase as there are other
+                // objects holding a reference to this object.
+                txnLog = new FileTxnLog(dataDir);
+                snapLog = new FileSnap(snapDir);
+
+                return truncated;
+            }
+        } catch (IOException e) {
+            LOG.error("Unable to truncate Txn log", e);
+            return false;
+        }
     }
 
     /**
@@ -594,8 +599,14 @@ public class FileTxnSnapLog {
      * @throws IOException
      */
     public void close() throws IOException {
-        txnLog.close();
-        snapLog.close();
+        if (txnLog != null) {
+            txnLog.close();
+            txnLog = null;
+        }
+        if (snapLog != null) {
+            snapLog.close();
+            snapLog = null;
+        }
     }
 
     @SuppressWarnings("serial")
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/SnapshotDigestTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/SnapshotDigestTest.java
index 40f07a8..d40debf 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/SnapshotDigestTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/SnapshotDigestTest.java
@@ -194,6 +194,8 @@ public class SnapshotDigestTest extends ClientBase {
         startServer();
         QuorumPeerMainTest.waitForOne(zk, States.CONNECTED);
 
+        server = serverFactory.getZooKeeperServer();
+
         // Snapshot digests always match
         assertEquals(0L, ServerMetrics.getMetrics().DIGEST_MISMATCHES_COUNT.get());
 
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/LearnerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/LearnerTest.java
index 1d121ac..63381fc 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/LearnerTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/LearnerTest.java
@@ -20,7 +20,10 @@ package org.apache.zookeeper.server.quorum;
 
 import static java.util.Arrays.asList;
 import static java.util.Collections.emptySet;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.mockito.Mockito.mock;
@@ -36,17 +39,20 @@ import java.net.Socket;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.function.Consumer;
 import org.apache.jute.BinaryInputArchive;
 import org.apache.jute.BinaryOutputArchive;
 import org.apache.zookeeper.ZKTestCase;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.common.X509Exception;
 import org.apache.zookeeper.data.ACL;
+import org.apache.zookeeper.server.ExitCode;
 import org.apache.zookeeper.server.ZKDatabase;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.test.TestUtils;
 import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.TxnHeader;
+import org.apache.zookeeper.util.ServiceUtils;
 import org.junit.Test;
 
 public class LearnerTest extends ZKTestCase {
@@ -300,4 +306,51 @@ public class LearnerTest extends ZKTestCase {
         }
     }
 
+    @Test
+    public void truncFailTest() throws Exception {
+        final boolean[] exitProcCalled = {false};
+
+        ServiceUtils.setSystemExitProcedure(new Consumer<Integer>() {
+            @Override
+            public void accept(Integer exitCode) {
+                exitProcCalled[0] = true;
+                assertThat("System.exit() was called with invalid exit code", exitCode, equalTo(ExitCode.QUORUM_PACKET_ERROR.getValue()));
+            }
+        });
+
+        File tmpFile = File.createTempFile("test", ".dir", testData);
+        tmpFile.delete();
+        try {
+            FileTxnSnapLog txnSnapLog = new FileTxnSnapLog(tmpFile, tmpFile);
+            SimpleLearner sl = new SimpleLearner(txnSnapLog);
+            long startZxid = sl.zk.getLastProcessedZxid();
+
+            // Set up bogus streams
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            BinaryOutputArchive oa = BinaryOutputArchive.getArchive(baos);
+            sl.leaderOs = BinaryOutputArchive.getArchive(new ByteArrayOutputStream());
+
+            // make streams and socket do something innocuous
+            sl.bufferedOutput = new BufferedOutputStream(System.out);
+            sl.sock = new Socket();
+
+            // fake messages from the server
+            QuorumPacket qp = new QuorumPacket(Leader.TRUNC, 0, null, null);
+            oa.writeRecord(qp, null);
+
+            // setup the messages to be streamed to follower
+            sl.leaderIs = BinaryInputArchive.getArchive(new ByteArrayInputStream(baos.toByteArray()));
+
+            try {
+                sl.syncWithLeader(3);
+            } catch (EOFException e) {
+            }
+
+            sl.zk.shutdown();
+
+            assertThat("System.exit() should have been called", exitProcCalled[0], is(true));
+        } finally {
+            TestUtils.deleteFileRecursively(tmpFile);
+        }
+    }
 }
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/test/TruncateTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/test/TruncateTest.java
index f7a448e..089a764 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/test/TruncateTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/test/TruncateTest.java
@@ -18,7 +18,9 @@
 
 package org.apache.zookeeper.test;
 
+import static org.hamcrest.CoreMatchers.is;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import java.io.File;
@@ -125,10 +127,7 @@ public class TruncateTest extends ZKTestCase {
             assertTrue("Failed to delete log file: " + logs[i].getName(), logs[i].delete());
         }
         try {
-            zkdb.truncateLog(1);
-            assertTrue("Should not get here", false);
-        } catch (IOException e) {
-            assertTrue("Should have received an IOException", true);
+            assertThat("truncateLog() should return false if truncation fails instead of throwing exception", zkdb.truncateLog(1), is(false));
         } catch (NullPointerException npe) {
             fail("This should not throw NPE!");
         }