You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by si...@apache.org on 2016/11/09 02:02:54 UTC

bookkeeper git commit: BOOKKEEPER-946: Provide an option to delay auto recovery of lost bookies

Repository: bookkeeper
Updated Branches:
  refs/heads/master 4383b0b17 -> 0abf37c64


BOOKKEEPER-946: Provide an option to delay auto recovery of lost bookies

If auto recovery is enabled and a bookie goes down for an upgrade, or even just loses its ZooKeeper
connection intermittently, the auditor detects it as a lost bookie and starts under-replication
detection, and the replication workers on other bookie nodes start replicating the under-replicated
ledgers. All of this stops once the bookie comes back up, but by then a few ledgers will already
have been replicated. Given that we have multiple copies of the data, it is probably not necessary
to start the recovery as soon as a bookie goes down. We can wait for an hour or so and then start
recovery. This should cover cases like planned upgrades, intermittent network connectivity loss, etc.

This change:
    1) Provides a bookie option 'lostBookieRecoveryDelay', in seconds, which when set to a non-zero
       value delays the start of recovery by that number of seconds. By default this option is set
       to 0, which means the audit is not delayed.
    2) If another bookie goes down during this interval, recovery is started immediately and the
       audit scheduled for the future is canceled.
    3) Adds counters to track how many audits were delayed (#1) and how many scheduled audits were
       canceled due to multiple bookie failures (#2).
    4) Adds three JUnit tests to verify the new feature.

Author: Rithin <ri...@salesforce.com>

Reviewers: siddharth.boobna@gmail.com <si...@gmail.com>, Enrico Olivelli <eo...@gmail.com>

Closes #58 from rithin-shetty/audit_delay


Project: http://git-wip-us.apache.org/repos/asf/bookkeeper/repo
Commit: http://git-wip-us.apache.org/repos/asf/bookkeeper/commit/0abf37c6
Tree: http://git-wip-us.apache.org/repos/asf/bookkeeper/tree/0abf37c6
Diff: http://git-wip-us.apache.org/repos/asf/bookkeeper/diff/0abf37c6

Branch: refs/heads/master
Commit: 0abf37c64ced0fe49a6470bc0e2be632e47902d6
Parents: 4383b0b
Author: Rithin <ri...@salesforce.com>
Authored: Tue Nov 8 18:02:34 2016 -0800
Committer: Sijie Guo <si...@apache.org>
Committed: Tue Nov 8 18:02:34 2016 -0800

----------------------------------------------------------------------
 bookkeeper-server/conf/bk_server.conf           |   3 +
 .../bookkeeper/conf/ServerConfiguration.java    |  17 ++
 .../apache/bookkeeper/replication/Auditor.java  | 124 +++++++---
 .../replication/ReplicationStats.java           |   2 +
 .../replication/AuditorLedgerCheckerTest.java   | 232 ++++++++++++++++++-
 5 files changed, 348 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bookkeeper/blob/0abf37c6/bookkeeper-server/conf/bk_server.conf
----------------------------------------------------------------------
diff --git a/bookkeeper-server/conf/bk_server.conf b/bookkeeper-server/conf/bk_server.conf
index b3f1637..678018c 100644
--- a/bookkeeper-server/conf/bk_server.conf
+++ b/bookkeeper-server/conf/bk_server.conf
@@ -270,6 +270,9 @@ zkTimeout=10000
 # The interval is specified in seconds.
 #auditorPeriodicBookieCheckInterval=86400
 
+# How long to wait, in seconds, before starting auto recovery of a lost bookie (0 = no delay)
+#lostBookieRecoveryDelay=0
+
 # number of threads that should handle write requests. if zero, the writes would
 # be handled by netty threads directly.
 # numAddWorkerThreads=1

http://git-wip-us.apache.org/repos/asf/bookkeeper/blob/0abf37c6/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
----------------------------------------------------------------------
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
index 67e81cc..92644da 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ServerConfiguration.java
@@ -103,6 +103,7 @@ public class ServerConfiguration extends AbstractConfiguration {
     protected final static String AUDITOR_PERIODIC_CHECK_INTERVAL = "auditorPeriodicCheckInterval";
     protected final static String AUDITOR_PERIODIC_BOOKIE_CHECK_INTERVAL = "auditorPeriodicBookieCheckInterval";
     protected final static String AUTO_RECOVERY_DAEMON_ENABLED = "autoRecoveryDaemonEnabled";
+    protected final static String LOST_BOOKIE_RECOVERY_DELAY = "lostBookieRecoveryDelay";
 
     // Worker Thread parameters.
     protected final static String NUM_ADD_WORKER_THREADS = "numAddWorkerThreads";
@@ -1340,6 +1341,22 @@ public class ServerConfiguration extends AbstractConfiguration {
     }
 
     /**
+     * Get how long to delay the recovery of ledgers of a lost bookie.
+     *
+     * @return delay interval in seconds; defaults to 0 when the option is not set
+     */
+    public int getLostBookieRecoveryDelay() {
+        return getInt(LOST_BOOKIE_RECOVERY_DELAY, 0);
+    }
+
+    /**
+     * Set the delay interval, in seconds, before recovery of a lost bookie is started.
+     */
+    public void setLostBookieRecoveryDelay(int interval) {
+        setProperty(LOST_BOOKIE_RECOVERY_DELAY, interval);
+    }
+
+    /**
      * Sets that whether force start a bookie in readonly mode
      *
      * @param enabled

http://git-wip-us.apache.org/repos/asf/bookkeeper/blob/0abf37c6/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
----------------------------------------------------------------------
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
index 835b34f..5a4bdcc 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
@@ -98,6 +98,10 @@ public class Auditor implements BookiesListener {
     private final Counter numLedgersChecked;
     private final OpStatsLogger numFragmentsPerLedger;
     private final OpStatsLogger numBookiesPerLedger;
+    private final Counter numBookieAuditsDelayed;
+    private final Counter numDelayedBookieAuditsCancelled;
+    private volatile Future<?> auditTask;
+    private Set<String> bookiesToBeAudited = Sets.newHashSet();
 
     public Auditor(final String bookieIdentifier, ServerConfiguration conf,
                    ZooKeeper zkc, StatsLogger statsLogger) throws UnavailableException {
@@ -106,12 +110,17 @@ public class Auditor implements BookiesListener {
         this.statsLogger = statsLogger;
 
         numUnderReplicatedLedger = this.statsLogger.getOpStatsLogger(ReplicationStats.NUM_UNDER_REPLICATED_LEDGERS);
-        uRLPublishTimeForLostBookies = this.statsLogger.getOpStatsLogger(ReplicationStats.URL_PUBLISH_TIME_FOR_LOST_BOOKIE);
-        bookieToLedgersMapCreationTime = this.statsLogger.getOpStatsLogger(ReplicationStats.BOOKIE_TO_LEDGERS_MAP_CREATION_TIME);
+        uRLPublishTimeForLostBookies = this.statsLogger
+                .getOpStatsLogger(ReplicationStats.URL_PUBLISH_TIME_FOR_LOST_BOOKIE);
+        bookieToLedgersMapCreationTime = this.statsLogger
+                .getOpStatsLogger(ReplicationStats.BOOKIE_TO_LEDGERS_MAP_CREATION_TIME);
         checkAllLedgersTime = this.statsLogger.getOpStatsLogger(ReplicationStats.CHECK_ALL_LEDGERS_TIME);
         numLedgersChecked = this.statsLogger.getCounter(ReplicationStats.NUM_LEDGERS_CHECKED);
         numFragmentsPerLedger = statsLogger.getOpStatsLogger(ReplicationStats.NUM_FRAGMENTS_PER_LEDGER);
         numBookiesPerLedger = statsLogger.getOpStatsLogger(ReplicationStats.NUM_BOOKIES_PER_LEDGER);
+        numBookieAuditsDelayed = this.statsLogger.getCounter(ReplicationStats.NUM_BOOKIE_AUDITS_DELAYED);
+        numDelayedBookieAuditsCancelled = this.statsLogger
+                .getCounter(ReplicationStats.NUM_DELAYED_BOOKIE_AUDITS_DELAYES_CANCELLED);
 
         initialize(conf, zkc);
 
@@ -189,27 +198,64 @@ public class Auditor implements BookiesListener {
                         Collection<String> newBookies = CollectionUtils.subtract(
                                 availableBookies, knownBookies);
                         knownBookies.addAll(newBookies);
+                        if (!bookiesToBeAudited.isEmpty() && knownBookies.containsAll(bookiesToBeAudited)) {
+                            // the bookie, which went down earlier and had an audit scheduled for,
+                            // has come up. So let us stop tracking it and cancel the audit. Since
+                            // we allow delaying of audit when there is only one failed bookie,
+                            // bookiesToBeAudited should just have 1 element and hence containsAll
+                            // check should be ok
+                            if (auditTask != null && auditTask.cancel(false)) {
+                                auditTask = null;
+                                numDelayedBookieAuditsCancelled.inc();
+                            }
+                            bookiesToBeAudited.clear();
+                        }
 
                         // find lost bookies(if any)
-                        Collection<String> lostBookies = CollectionUtils.subtract(
-                                knownBookies, availableBookies);
-
-                        if (lostBookies.size() > 0) {
-                            knownBookies.removeAll(lostBookies);
+                        bookiesToBeAudited.addAll(CollectionUtils.subtract(knownBookies, availableBookies));
+                        if (bookiesToBeAudited.size() == 0) {
+                            return;
+                        }
 
-                            auditBookies();
+                        knownBookies.removeAll(bookiesToBeAudited);
+                        if (conf.getLostBookieRecoveryDelay() == 0) {
+                            startAudit(false);
+                            bookiesToBeAudited.clear();
+                            return;
+                        }
+                        if (bookiesToBeAudited.size() > 1) {
+                            // if more than one bookie is down, start the audit immediately;
+                            LOG.info("Multiple bookie failure; not delaying bookie audit. Bookies lost now: "
+                                     + CollectionUtils.subtract(knownBookies, availableBookies)
+                                     +"; All lost bookies: " + bookiesToBeAudited.toString());
+                            if (auditTask != null && auditTask.cancel(false)) {
+                                auditTask = null;
+                                numDelayedBookieAuditsCancelled.inc();
+                            }
+                            startAudit(false);
+                            bookiesToBeAudited.clear();
+                            return;
+                        }
+                        if (auditTask == null) {
+                            // if there is no scheduled audit, schedule one
+                            auditTask = executor.schedule( new Runnable() {
+                                public void run() {
+                                    startAudit(false);
+                                    auditTask = null;
+                                    bookiesToBeAudited.clear();
+                                }
+                            }, conf.getLostBookieRecoveryDelay(), TimeUnit.SECONDS);
+                            numBookieAuditsDelayed.inc();
+                            LOG.info("Delaying bookie audit by " + conf.getLostBookieRecoveryDelay()
+                                     + "secs for " + bookiesToBeAudited.toString());
                         }
                     } catch (BKException bke) {
                         LOG.error("Exception getting bookie list", bke);
                     } catch (InterruptedException ie) {
                         Thread.currentThread().interrupt();
                         LOG.error("Interrupted while watching available bookies ", ie);
-                    } catch (BKAuditException bke) {
-                        LOG.error("Exception while watching available bookies", bke);
                     } catch (UnavailableException ue) {
                         LOG.error("Exception while watching available bookies", ue);
-                    } catch (KeeperException ke) {
-                        LOG.error("Exception reading bookie list", ke);
                     }
                 }
             });
@@ -231,8 +277,6 @@ public class Auditor implements BookiesListener {
                          + " 'auditorPeriodicCheckInterval' {} seconds", interval);
                 executor.scheduleAtFixedRate(new Runnable() {
                         public void run() {
-                            LOG.info("Running periodic check");
-
                             try {
                                 if (!ledgerUnderreplicationManager.isLedgerReplicationEnabled()) {
                                     LOG.info("Ledger replication disabled, skipping");
@@ -310,6 +354,35 @@ public class Auditor implements BookiesListener {
         admin.notifyReadOnlyBookiesChanged(this);
     }
 
+    /**
+     * Run the bookie audit now, logging (rather than propagating) any checked
+     * exception thrown by {@link #auditBookies()}.
+     *
+     * @param shutDownTask
+     *      whether to submit the shutdown task when the audit fails with one
+     *      of the handled exceptions; a successful audit never triggers it
+     */
+    private void startAudit(boolean shutDownTask) {
+        // stays true unless auditBookies() returns normally
+        boolean auditFailed = true;
+        try {
+            auditBookies();
+            auditFailed = false;
+        } catch (BKException bke) {
+            LOG.error("Exception getting bookie list", bke);
+        } catch (InterruptedException ie) {
+            Thread.currentThread().interrupt();
+            LOG.error("Interrupted while watching available bookies ", ie);
+        } catch (BKAuditException bke) {
+            LOG.error("Exception while watching available bookies", bke);
+        } catch (KeeperException ke) {
+            LOG.error("Exception reading bookie list", ke);
+        }
+        if (shutDownTask && auditFailed) {
+            submitShutdownTask();
+        }
+    }
+
     @SuppressWarnings("unchecked")
     private void auditBookies()
             throws BKAuditException, KeeperException,
@@ -585,21 +658,14 @@ public class Auditor implements BookiesListener {
 
     private final Runnable BOOKIE_CHECK = new Runnable() {
             public void run() {
-                try {
-                    auditBookies();
-                } catch (BKException bke) {
-                    LOG.error("Couldn't get bookie list, exiting", bke);
-                    submitShutdownTask();
-                } catch (KeeperException ke) {
-                    LOG.error("Exception while watching available bookies", ke);
-                    submitShutdownTask();
-                } catch (InterruptedException ie) {
-                    Thread.currentThread().interrupt();
-                    LOG.error("Interrupted while watching available bookies ", ie);
-                    submitShutdownTask();
-                } catch (BKAuditException bke) {
-                    LOG.error("Exception while watching available bookies", bke);
-                    submitShutdownTask();
+                if (auditTask == null) {
+                    startAudit(true);
+                } else {
+                    // if due to a lost bookie an audit task was scheduled,
+                    // let us not run this periodic bookie check now, if we
+                    // went ahead, we'll report under replication and the user
+                    // wanted to avoid that(with lostBookieRecoveryDelay option)
+                    LOG.info("Audit already scheduled; skipping periodic bookie check");
                 }
             }
         };

http://git-wip-us.apache.org/repos/asf/bookkeeper/blob/0abf37c6/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationStats.java
----------------------------------------------------------------------
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationStats.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationStats.java
index 231ec01..fb7de20 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationStats.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationStats.java
@@ -33,6 +33,8 @@ public interface ReplicationStats {
     public final static String NUM_FRAGMENTS_PER_LEDGER = "NUM_FRAGMENTS_PER_LEDGER";
     public final static String NUM_BOOKIES_PER_LEDGER = "NUM_BOOKIES_PER_LEDGER";
     public final static String NUM_LEDGERS_CHECKED = "NUM_LEDGERS_CHECKED";
+    public final static String NUM_BOOKIE_AUDITS_DELAYED = "NUM_BOOKIE_AUDITS_DELAYED";
+    public final static String NUM_DELAYED_BOOKIE_AUDITS_DELAYES_CANCELLED = "NUM_DELAYED_BOOKIE_AUDITS_CANCELLED";
 
     public final static String REPLICATION_WORKER_SCOPE = "replication_worker";
     public final static String REREPLICATE_OP = "rereplicate";

http://git-wip-us.apache.org/repos/asf/bookkeeper/blob/0abf37c6/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/AuditorLedgerCheckerTest.java
----------------------------------------------------------------------
diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/AuditorLedgerCheckerTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/AuditorLedgerCheckerTest.java
index 8b0c344..f0a4fed 100644
--- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/AuditorLedgerCheckerTest.java
+++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/AuditorLedgerCheckerTest.java
@@ -26,6 +26,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -39,7 +40,6 @@ import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.client.BookKeeper.DigestType;
 import org.apache.bookkeeper.client.LedgerHandle;
 import org.apache.bookkeeper.conf.ServerConfiguration;
-
 import org.apache.bookkeeper.meta.ZkLedgerUnderreplicationManager;
 import org.apache.bookkeeper.proto.BookieServer;
 import org.apache.bookkeeper.proto.DataFormats.UnderreplicatedLedgerFormat;
@@ -49,6 +49,7 @@ import org.apache.bookkeeper.test.MultiLedgerManagerTestCase;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -79,6 +80,7 @@ public class AuditorLedgerCheckerTest extends MultiLedgerManagerTestCase {
     private HashMap<String, AuditorElector> auditorElectors = new HashMap<String, AuditorElector>();
     private ZkLedgerUnderreplicationManager urLedgerMgr;
     private Set<Long> urLedgerList;
+    private String electionPath;
 
     private List<Long> ledgerList;
 
@@ -93,6 +95,8 @@ public class AuditorLedgerCheckerTest extends MultiLedgerManagerTestCase {
         baseConf.setLedgerManagerFactoryClassName(ledgerManagerFactoryClass);
         baseClientConf
                 .setLedgerManagerFactoryClassName(ledgerManagerFactoryClass);
+        electionPath = baseConf.getZkLedgersRootPath()
+                + "/underreplication/auditorelection";
     }
 
     @Before
@@ -321,6 +325,188 @@ public class AuditorLedgerCheckerTest extends MultiLedgerManagerTestCase {
                 data.contains(shutdownBookie));
     }
 
+    /**
+     * Verify that a lost bookie is audited only after lostBookieRecoveryDelay has elapsed.
+     */
+    public void _testDelayedAuditOfLostBookies() throws Exception {
+        LedgerHandle lh1 = createAndAddEntriesToLedger();
+        Long ledgerId = lh1.getId();
+        LOG.debug("Created ledger : " + ledgerId);
+        ledgerList.add(ledgerId);
+        lh1.close();
+
+        final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());
+
+        // wait for 5 seconds before starting the recovery work when a bookie fails
+        baseConf.setLostBookieRecoveryDelay(5);
+
+        // shutdown a non auditor bookie; choosing non-auditor to avoid another election
+        String shutdownBookie = shutDownNonAuditorBookie();
+
+        LOG.debug("Waiting for ledgers to be marked as under replicated");
+        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(4, TimeUnit.SECONDS));
+        assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size());
+
+        // wait up to 2 more seconds (6 in total, past the 5 second delay) for the ledger to be reported
+        assertTrue("lost bookie audit didn't run after the delay", underReplicaLatch.await(2, TimeUnit.SECONDS));
+
+        assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId));
+        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
+        String data = urLedgerData.get(ledgerId);
+        assertTrue("Bookie " + shutdownBookie
+                + " is not listed in the ledger as missing replica :" + data,
+                data.contains(shutdownBookie));
+    }
+
+    /**
+     * Test publishing of under replicated ledgers by the auditor
+     * bookie is delayed if LostBookieRecoveryDelay option is set
+     */
+    @Test(timeout=60000)
+    public void testDelayedAuditOfLostBookies() throws Exception {
+        _testDelayedAuditOfLostBookies();
+    }
+
+    /**
+     * Test that publishing of under replicated ledgers by the auditor
+     * bookie is delayed if the LostBookieRecoveryDelay option is set,
+     * and that it continues to be delayed even when the periodic bookie
+     * check is set to run every 2 secs, i.e. the periodic bookie check
+     * doesn't override the delay.
+     */
+    @Test(timeout=60000)
+    public void testDelayedAuditWithPeriodicBookieCheck() throws Exception {
+        // enable periodic bookie check on a cadence of every 2 seconds.
+        // this requires us to stop the auditor/auditorElectors, set the
+        // periodic check interval and restart the auditorElectors
+        stopAuditorElectors();
+        baseConf.setAuditorPeriodicBookieCheckInterval(2);
+        startAuditorElectors();
+
+        // wait for a second so that the initial periodic check finishes
+        Thread.sleep(1000);
+
+        // the delaying of audit should just work despite the fact
+        // we have enabled periodic bookie check
+        _testDelayedAuditOfLostBookies();
+    }
+
+    /**
+     * Test audit of bookies is delayed when one bookie is down. But when
+     * another one goes down, the audit is started immediately.
+     */
+    @Test(timeout=60000)
+    public void testDelayedAuditWithMultipleBookieFailures() throws Exception {
+        // wait for the periodic bookie check to finish
+        Thread.sleep(1000);
+
+        // create a ledger with a bunch of entries
+        LedgerHandle lh1 = createAndAddEntriesToLedger();
+        Long ledgerId = lh1.getId();
+        LOG.debug("Created ledger : " + ledgerId);
+        ledgerList.add(ledgerId);
+        lh1.close();
+
+        CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());
+
+        // wait for 10 seconds before starting the recovery work when a bookie fails
+        baseConf.setLostBookieRecoveryDelay(10);
+
+        // shutdown a non auditor bookie to avoid an election
+        String shutdownBookie1 = shutDownNonAuditorBookie();
+
+        // wait for 3 seconds and there shouldn't be any under replicated ledgers
+        // because we have delayed the start of audit by 10 seconds
+        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(3, TimeUnit.SECONDS));
+        assertEquals("under replicated ledgers identified when it was not expected", 0,
+                urLedgerList.size());
+
+        // Now shutdown the second non auditor bookie; We want to make sure that
+        // the history about having delayed recovery remains. Hence we make sure
+        // we bring down a non auditor bookie. This should cause the audit to take
+        // place immediately and not wait for the remaining 7 seconds to elapse
+        String shutdownBookie2 = shutDownNonAuditorBookie();
+
+        // 2 second grace period for the ledgers to get reported as under replicated
+        Thread.sleep(2000);
+
+        // If the following checks pass, it means that audit happened
+        // within 2 seconds of second bookie going down and it didn't
+        // wait for 7 more seconds. Hence the second bookie failure doesn't
+        // delay the audit
+        assertTrue("Ledger is not marked as underreplicated:" + ledgerId,
+                urLedgerList.contains(ledgerId));
+        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
+        String data = urLedgerData.get(ledgerId);
+        assertTrue("Bookie " + shutdownBookie1 + shutdownBookie2
+                + " are not listed in the ledger as missing replicas :" + data,
+                data.contains(shutdownBookie1) && data.contains(shutdownBookie2));
+    }
+
+    /**
+     * Test that the audit of bookies is delayed during a rolling upgrade scenario:
+     * a bookie goes down and comes up, then the next bookie goes down and up, and so on.
+     * At any time only one bookie is down.
+     */
+    @Test(timeout=60000)
+    public void testDelayedAuditWithRollingUpgrade() throws Exception {
+        // wait for the periodic bookie check to finish
+        Thread.sleep(1000);
+
+        // create a ledger with a bunch of entries
+        LedgerHandle lh1 = createAndAddEntriesToLedger();
+        Long ledgerId = lh1.getId();
+        LOG.debug("Created ledger : " + ledgerId);
+        ledgerList.add(ledgerId);
+        lh1.close();
+
+        CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());
+
+        // wait for 5 seconds before starting the recovery work when a bookie fails
+        baseConf.setLostBookieRecoveryDelay(5);
+
+        // shutdown a non auditor bookie to avoid an election
+        int idx1 = getShutDownNonAuditorBookieIdx("");
+        ServerConfiguration conf1 = bsConfs.get(idx1);
+        String shutdownBookie1 = shutdownBookie(idx1);
+
+        // wait for 2 seconds and there shouldn't be any under replicated ledgers
+        // because we have delayed the start of audit by 5 seconds
+        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
+        assertEquals("under replicated ledgers identified when it was not expected", 0,
+                urLedgerList.size());
+
+        // restart the bookie we shut down above
+        bs.add(startBookie(conf1));
+
+        // Now to simulate the rolling upgrade, bring down a bookie different from
+        // the one we brought down/up above.
+        String shutdownBookie2 = shutDownNonAuditorBookie(shutdownBookie1);
+
+        // since the first bookie that was brought down/up has come up, there is only
+        // one bookie down at this time. Hence the lost bookie check shouldn't start
+        // immediately; it will start 5 seconds after the second bookie went down
+        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
+        assertEquals("under replicated ledgers identified when it was not expected", 0,
+                urLedgerList.size());
+
+        // wait for a total of 6 seconds(2+4) for the ledgers to get reported as under replicated
+        Thread.sleep(4000);
+
+        // If the following checks pass, it means that auditing happened
+        // after lostBookieRecoveryDelay during rolling upgrade as expected
+        assertTrue("Ledger is not marked as underreplicated:" + ledgerId,
+                urLedgerList.contains(ledgerId));
+        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
+        String data = urLedgerData.get(ledgerId);
+        assertTrue("Bookie " + shutdownBookie1 + "wrongly listed as missing the ledger: " + data,
+                   !data.contains(shutdownBookie1));
+        assertTrue("Bookie " + shutdownBookie2
+                   + " is not listed in the ledger as missing replicas :" + data,
+                   data.contains(shutdownBookie2));
+        LOG.info("*****************Test Complete");
+    }
+
     /**
      * Wait for ledger to be underreplicated, and to be missing all replicas specified
      */
@@ -442,4 +628,48 @@ public class AuditorLedgerCheckerTest extends MultiLedgerManagerTestCase {
             underReplicaLatch.countDown();
         }
     }
+
+    private BookieServer getAuditorBookie() throws Exception {
+        List<BookieServer> auditors = new LinkedList<BookieServer>();
+        byte[] data = zkc.getData(electionPath, false, null);
+        Assert.assertNotNull("Auditor election failed", data);
+        String auditorData = new String(data, "UTF-8"); // decoded once; only ASCII port digits are matched
+        for (BookieServer bks : bs) {
+            if (auditorData.contains(String.valueOf(bks.getLocalAddress().getPort()))) {
+                auditors.add(bks);
+            }
+        }
+        Assert.assertEquals("Multiple Bookies acting as Auditor!", 1, auditors.size());
+        return auditors.get(0);
+    }
+
+    private String  shutDownNonAuditorBookie() throws Exception {
+        // shutdown bookie which is not an auditor
+        int indexOf = bs.indexOf(getAuditorBookie());
+        int bkIndexDownBookie;
+        if (indexOf < bs.size() - 1) {
+            bkIndexDownBookie = indexOf + 1;
+        } else {
+            bkIndexDownBookie = indexOf - 1;
+        }
+        return shutdownBookie(bkIndexDownBookie);
+    }
+
+    private int getShutDownNonAuditorBookieIdx(String exclude) throws Exception {
+        // pick the first bookie that is neither the auditor nor 'exclude'; 0 if there is none
+        int indexOf = bs.indexOf(getAuditorBookie());
+        int bkIndexDownBookie = 0;
+        for (int i = 0; i < bs.size(); i++) {
+            if (i == indexOf || bs.get(i).getLocalAddress().toString().equals(exclude)) {
+                continue;
+            }
+            bkIndexDownBookie = i;
+            break;
+        }
+        return bkIndexDownBookie;
+    }
+
+    private String shutDownNonAuditorBookie(String exclude) throws Exception {
+        return shutdownBookie(getShutDownNonAuditorBookieIdx(exclude));
+    }
 }