You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by iv...@apache.org on 2013/08/13 15:02:00 UTC

svn commit: r1513467 - in /zookeeper/bookkeeper/branches/branch-4.2: ./ bookkeeper-server/src/main/java/org/apache/bookkeeper/client/ bookkeeper-server/src/test/java/org/apache/bookkeeper/client/

Author: ivank
Date: Tue Aug 13 13:01:59 2013
New Revision: 1513467

URL: http://svn.apache.org/r1513467
Log:
BOOKKEEPER-667: Client write will fail with BadMetadataVersion in case of multiple Bookie failures with AutoRecovery enabled (sijie via ivank)

Modified:
    zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt
    zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java
    zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java
    zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java

Modified: zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt?rev=1513467&r1=1513466&r2=1513467&view=diff
==============================================================================
--- zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt (original)
+++ zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt Tue Aug 13 13:01:59 2013
@@ -56,6 +56,8 @@ Release 4.2.2 - Unreleased
 
         BOOKKEEPER-604: Ledger storage can log an exception if GC happens concurrently. (sijie & ivank via ivank)
 
+        BOOKKEEPER-667: Client write will fail with BadMetadataVersion in case of multiple Bookie failures with AutoRecovery enabled (sijie via ivank)
+
       hedwig-server:
 
         BOOKKEEPER-579: TestSubAfterCloseSub was put in a wrong package (sijie via ivank)

Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java?rev=1513467&r1=1513466&r2=1513467&view=diff
==============================================================================
--- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java (original)
+++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java Tue Aug 13 13:01:59 2013
@@ -805,6 +805,24 @@ public class LedgerHandle {
                 return false;
             }
 
+            // We should check number of ensembles since there are two kinds of metadata conflicts:
+            // - Case 1: Multiple bookies involved in ensemble change.
+            //           Number of ensembles should be same in this case.
+            // - Case 2: Recovery (Auto/Manually) replaced ensemble and ensemble changed.
+            //           The metadata changed due to ensemble change would have one more ensemble
+            //           than the metadata changed by recovery.
+            int diff = newMeta.getEnsembles().size() - metadata.getEnsembles().size();
+            if (0 != diff) {
+                if (-1 == diff) {
+                    // Case 1: metadata is changed by other ones (e.g. Recovery)
+                    return updateMetadataIfPossible(newMeta);
+                }
+                return false;
+            }
+
+            //
+            // Case 2:
+            //
             // If the failed the bookie is still existed in the metadata (in zookeeper), it means that
             // the ensemble change of the failed bookie is failed due to metadata conflicts. so try to
             // update the ensemble change metadata again. Otherwise, it means that the ensemble change
@@ -816,26 +834,7 @@ public class LedgerHandle {
                 // update ensemble changed metadata again.
                 if (!metadata.currentEnsemble.get(ensembleInfo.bookieIndex)
                         .equals(ensembleInfo.addr)) {
-                    // if the local metadata is newer than zookeeper metadata, it means that metadata is updated
-                    // again when it was trying re-reading the metatada, re-kick the reread again
-                    if (metadata.isNewerThan(newMeta)) {
-                        rereadMetadata(this);
-                        return true;
-                    }
-                    // make sure the metadata doesn't changed by other ones.
-                    if (metadata.isConflictWith(newMeta)) {
-                        return false;
-                    }
-                    LOG.info("Resolve ledger metadata conflict "
-                            + "while changing ensemble to: "
-                            + ensembleInfo.newEnsemble
-                            + ", old meta data is \n"
-                            + new String(metadata.serialize())
-                            + "\n, new meta data is \n"
-                            + new String(newMeta.serialize()));
-                    // update znode version
-                    metadata.setVersion(newMeta.getVersion());
-                    writeLedgerConfig(new ChangeEnsembleCb(ensembleInfo));
+                    return updateMetadataIfPossible(newMeta);
                 }
             } else {
                 // the failed bookie has been replaced
@@ -845,6 +844,29 @@ public class LedgerHandle {
             return true;
         }
 
+        private boolean updateMetadataIfPossible(LedgerMetadata newMeta) {
+            // if the local metadata is newer than zookeeper metadata, it means that metadata is updated
+            // again when it was trying re-reading the metatada, re-kick the reread again
+            if (metadata.isNewerThan(newMeta)) {
+                rereadMetadata(this);
+                return true;
+            }
+            // make sure the metadata doesn't changed by other ones.
+            if (metadata.isConflictWith(newMeta)) {
+                return false;
+            }
+            LOG.info("Resolve ledger metadata conflict while changing ensemble to: {},"
+                    + " old meta data is \n {} \n, new meta data is \n {}.", new Object[] {
+                    ensembleInfo.newEnsemble, metadata, newMeta });
+            // update znode version
+            metadata.setVersion(newMeta.getVersion());
+            // merge ensemble infos from new meta except last ensemble
+            // since they might be modified by recovery tool.
+            metadata.mergeEnsembles(newMeta.getEnsembles());
+            writeLedgerConfig(new ChangeEnsembleCb(ensembleInfo));
+            return true;
+        }
+
     };
 
     void unsetSuccessAndSendWriteRequest(final int bookieIndex) {

Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java?rev=1513467&r1=1513466&r2=1513467&view=diff
==============================================================================
--- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java (original)
+++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java Tue Aug 13 13:01:59 2013
@@ -1,5 +1,3 @@
-package org.apache.bookkeeper.client;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -17,6 +15,7 @@ package org.apache.bookkeeper.client;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.bookkeeper.client;
 
 import static com.google.common.base.Charsets.UTF_8;
 
@@ -478,8 +477,10 @@ public class LedgerMetadata {
         }
         // if ledger is closed, we can just take the new ensembles
         if (newMeta.state != LedgerMetadataFormat.State.CLOSED) {
-            // ensemble size should be same
-            if (ensembles.size() != newMeta.ensembles.size()) {
+            // allow new metadata to be one ensemble less than current metadata
+            // since ensemble change might kick in when recovery changed metadata
+            int diff = ensembles.size() - newMeta.ensembles.size();
+            if (0 != diff && 1 != diff) {
                 return true;
             }
             // ensemble distribution should be same
@@ -487,7 +488,7 @@ public class LedgerMetadata {
             // using recovery tool.
             Iterator<Long> keyIter = ensembles.keySet().iterator();
             Iterator<Long> newMetaKeyIter = newMeta.ensembles.keySet().iterator();
-            for (int i=0; i<ensembles.size(); i++) {
+            for (int i=0; i<newMeta.ensembles.size(); i++) {
                 Long curKey = keyIter.next();
                 Long newMetaKey = newMetaKeyIter.next();
                 if (!curKey.equals(newMetaKey)) {
@@ -497,4 +498,32 @@ public class LedgerMetadata {
         }
         return false;
     }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("(meta:").append(new String(serialize(), UTF_8)).append(", version:").append(version).append(")");
+        return sb.toString();
+    }
+
+    void mergeEnsembles(SortedMap<Long, ArrayList<InetSocketAddress>> newEnsembles) {
+        // allow new metadata to be one ensemble less than current metadata
+        // since ensemble change might kick in when recovery changed metadata
+        int diff = ensembles.size() - newEnsembles.size();
+        if (0 != diff && 1 != diff) {
+            return;
+        }
+        int i = 0;
+        for (Entry<Long, ArrayList<InetSocketAddress>> entry : newEnsembles.entrySet()) {
+            ++i;
+            if (ensembles.size() != i) {
+                // we should use last ensemble from current metadata
+                // not the new metadata read from zookeeper
+                long key = entry.getKey();
+                ArrayList<InetSocketAddress> ensemble = entry.getValue();
+                ensembles.put(key, ensemble);
+            }
+        }
+    }
+
 }

Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java?rev=1513467&r1=1513466&r2=1513467&view=diff
==============================================================================
--- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java (original)
+++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java Tue Aug 13 13:01:59 2013
@@ -228,6 +228,45 @@ public class BookieRecoveryTest extends 
     }
 
     /**
+     * This tests the bookie recovery functionality with ensemble changes.
+     * We'll verify that:
+     * - bookie recovery should not affect ensemble change.
+     * - ensemble change should not erase changes made by recovery.
+     *
+     * {@link https://issues.apache.org/jira/browse/BOOKKEEPER-667}
+     */
+    @Test(timeout = 60000)
+    public void testMetadataConflictWithRecovery() throws Exception {
+        int numEntries = 10;
+        byte[] data = "testMetadataConflictWithRecovery".getBytes();
+
+        LedgerHandle lh = bkc.createLedger(2, 2, digestType, baseClientConf.getBookieRecoveryPasswd());
+        for (int i = 0; i < numEntries; i++) {
+            lh.addEntry(data);
+        }
+        InetSocketAddress bookieToKill = lh.getLedgerMetadata().getEnsemble(numEntries - 1).get(1);
+        killBookie(bookieToKill);
+        startNewBookie();
+        for (int i = 0; i < numEntries; i++) {
+            lh.addEntry(data);
+        }
+        bkAdmin.recoverBookieData(bookieToKill, null);
+        // fail another bookie to cause ensemble change again
+        bookieToKill = lh.getLedgerMetadata().getEnsemble(2 * numEntries - 1).get(1);
+        ServerConfiguration confOfKilledBookie = killBookie(bookieToKill);
+        startNewBookie();
+        for (int i = 0; i < numEntries; i++) {
+            lh.addEntry(data);
+        }
+        // start the killed bookie again
+        bsConfs.add(confOfKilledBookie);
+        bs.add(startBookie(confOfKilledBookie));
+        // all ensembles should be fully replicated since it is recovered
+        assertTrue("Not fully replicated", verifyFullyReplicated(lh, 3 * numEntries));
+        lh.close();
+    }
+
+    /**
      * This tests the asynchronous bookie recovery functionality by writing
      * entries into 3 bookies, killing one bookie, starting up a new one to
      * replace it, and then recovering the ledger entries from the killed bookie