You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by eo...@apache.org on 2019/12/13 07:51:53 UTC

[zookeeper] branch master updated: ZOOKEEPER-3644: Data loss after upgrading standalone ZK server 3.4.14 to 3.5.6 with snapshot.trust.empty=true

This is an automated email from the ASF dual-hosted git repository.

eolivelli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new a5d67c8  ZOOKEEPER-3644: Data loss after upgrading standalone ZK server 3.4.14 to 3.5.6 with snapshot.trust.empty=true
a5d67c8 is described below

commit a5d67c8ffcc18289aa89c892d948ad7c75a317b4
Author: Michael Han <ha...@apache.org>
AuthorDate: Fri Dec 13 08:51:38 2019 +0100

    ZOOKEEPER-3644: Data loss after upgrading standalone ZK server 3.4.14 to 3.5.6 with snapshot.trust.empty=true
    
    We didn't replay transactions during recovery when snapshot files are missing and snapshot.trust.empty is set to true; this leads to no actual data being applied to the deserialized in memory data tree. We should make sure always replay the transactions after snapshot loading is done.
    
    Author: Michael Han <ha...@apache.org>
    
    Reviewers: Enrico Olivelli <eo...@apache.org>
    
    Closes #1177 from hanm/ZOOKEEPER-3644
---
 .../server/persistence/FileTxnSnapLog.java         | 45 ++++++++++++++--------
 .../test/EmptiedSnapshotRecoveryTest.java          |  4 +-
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
index 6b4dd2a..51f2dbb 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java
@@ -85,9 +85,19 @@ public class FileTxnSnapLog {
      * restored.
      */
     public interface PlayBackListener {
-
         void onTxnLoaded(TxnHeader hdr, Record rec);
+    }
 
+    /**
+     * Finalizing restore of data tree through
+     * a set of operations (replaying transaction logs,
+     * calculating data tree digests, and so on.).
+     */
+    private interface RestoreFinalizer {
+        /**
+         * @return the highest zxid of restored data tree.
+         */
+        long run() throws IOException;
     }
 
     /**
@@ -241,6 +251,23 @@ public class FileTxnSnapLog {
             trustEmptyDB = autoCreateDB;
         }
 
+        RestoreFinalizer finalizer = () -> {
+            long highestZxid = fastForwardFromEdits(dt, sessions, listener);
+            // The snapshotZxidDigest will reset after replaying the txn of the
+            // zxid in the snapshotZxidDigest, if it's not reset to null after
+            // restoring, it means either there are not enough txns to cover that
+            // zxid or that txn is missing
+            DataTree.ZxidDigest snapshotZxidDigest = dt.getDigestFromLoadedSnapshot();
+            if (snapshotZxidDigest != null) {
+                LOG.warn(
+                        "Highest txn zxid 0x{} is not covering the snapshot digest zxid 0x{}, "
+                                + "which might lead to inconsistent state",
+                        Long.toHexString(highestZxid),
+                        Long.toHexString(snapshotZxidDigest.getZxid()));
+            }
+            return highestZxid;
+        };
+
         if (-1L == deserializeResult) {
             /* this means that we couldn't find any snapshot, so we need to
              * initialize an empty database (reported in ZOOKEEPER-2325) */
@@ -251,6 +278,7 @@ public class FileTxnSnapLog {
                     throw new IOException(EMPTY_SNAPSHOT_WARNING + "Something is broken!");
                 } else {
                     LOG.warn("{}This should only be allowed during upgrading.", EMPTY_SNAPSHOT_WARNING);
+                    return finalizer.run();
                 }
             }
 
@@ -269,20 +297,7 @@ public class FileTxnSnapLog {
             }
         }
 
-        long highestZxid = fastForwardFromEdits(dt, sessions, listener);
-        // The snapshotZxidDigest will reset after replaying the txn of the
-        // zxid in the snapshotZxidDigest, if it's not reset to null after
-        // restoring, it means either there are not enough txns to cover that
-        // zxid or that txn is missing
-        DataTree.ZxidDigest snapshotZxidDigest = dt.getDigestFromLoadedSnapshot();
-        if (snapshotZxidDigest != null) {
-            LOG.warn(
-                "Highest txn zxid 0x{} is not covering the snapshot digest zxid 0x{}, "
-                    + "which might lead to inconsistent state",
-                Long.toHexString(highestZxid),
-                Long.toHexString(snapshotZxidDigest.getZxid()));
-        }
-        return highestZxid;
+        return finalizer.run();
     }
 
     /**
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/test/EmptiedSnapshotRecoveryTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/test/EmptiedSnapshotRecoveryTest.java
index a8d7169..da50a9c 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/test/EmptiedSnapshotRecoveryTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/test/EmptiedSnapshotRecoveryTest.java
@@ -18,6 +18,7 @@
 
 package org.apache.zookeeper.test;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import java.io.File;
@@ -100,10 +101,11 @@ public class EmptiedSnapshotRecoveryTest extends ZKTestCase implements Watcher {
         zks = new ZooKeeperServer(tmpSnapDir, tmpLogDir, 3000);
         try {
             zks.startdata();
-            zxid = zks.getZKDatabase().loadDataBase();
+            long currentZxid = zks.getZKDatabase().getDataTreeLastProcessedZxid();
             if (!trustEmptySnap) {
                 fail("Should have gotten exception for corrupted database");
             }
+            assertEquals("zxid mismatch after restoring database", currentZxid, zxid);
         } catch (IOException e) {
             // expected behavior
             if (trustEmptySnap) {