You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by si...@apache.org on 2018/02/19 07:54:25 UTC

[bookkeeper] branch master updated: ISSUE #1139: Add debug to replication fencing

This is an automated email from the ASF dual-hosted git repository.

sijie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 546eed4  ISSUE #1139: Add debug to replication fencing
546eed4 is described below

commit 546eed4913e148a8a4f075ee412d1abe28398ffa
Author: JV Jujjuri <vj...@salesforce.com>
AuthorDate: Sun Feb 18 23:54:18 2018 -0800

    ISSUE #1139: Add debug to replication fencing
    
    Descriptions of the changes in this PR:
    
    When a ledger is fenced, the client may get a write error.
    Not having enough logging in this area makes debugging harder.
    Optimised the code in addition to adding more logging in this area.
    
    Signed-off-by: Venkateswararao Jujjuri (JV) <vjujjuri@salesforce.com>
    
    Master Issue: #1139
    
    Author: JV Jujjuri <vj...@salesforce.com>
    
    Reviewers: Sijie Guo <si...@apache.org>
    
    This closes #1140 from jvrao/bk-issue-1139, closes #1139
---
 .../bookkeeper/replication/ReplicationWorker.java  | 50 ++++++++++++++--------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
index 65c0c2c..f6ed7d1 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
@@ -339,6 +339,10 @@ public class ReplicationWorker implements Runnable {
         Collection<BookieSocketAddress> available = admin.getAvailableBookies();
         for (BookieSocketAddress b : finalEnsemble) {
             if (!available.contains(b)) {
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug("Bookie {} is missing from the list of Available Bookies. ledger {}:ensemble {}.",
+                            b, lh.getId(), finalEnsemble);
+                }
                 return true;
             }
         }
@@ -366,32 +370,45 @@ public class ReplicationWorker implements Runnable {
         TimerTask timerTask = new TimerTask() {
             @Override
             public void run() {
+                boolean isRecoveryOpen = false;
                 LedgerHandle lh = null;
                 try {
                     lh = admin.openLedgerNoRecovery(ledgerId);
                     if (isLastSegmentOpenAndMissingBookies(lh)) {
+                        // Need recovery open, close the old ledger handle.
+                        lh.close();
+                        // Recovery open could result in client write failure.
+                        LOG.warn("Missing bookie(s) from last segment. Opening Ledger{} for Recovery.", ledgerId);
                         lh = admin.openLedger(ledgerId);
+                        isRecoveryOpen = true;
                     }
-
-                    Set<LedgerFragment> fragments =
-                        getUnderreplicatedFragments(lh, conf.getAuditorLedgerVerificationPercentage());
-                    for (LedgerFragment fragment : fragments) {
-                        if (!fragment.isClosed()) {
-                            lh = admin.openLedger(ledgerId);
-                            break;
+                    if (!isRecoveryOpen){
+                        Set<LedgerFragment> fragments =
+                            getUnderreplicatedFragments(lh, conf.getAuditorLedgerVerificationPercentage());
+                        for (LedgerFragment fragment : fragments) {
+                            if (!fragment.isClosed()) {
+                                // Need recovery open, close the old ledger handle.
+                                lh.close();
+                                // Recovery open could result in client write failure.
+                                LOG.warn("Open Fragment{}. Opening Ledger{} for Recovery.",
+                                        fragment.getEnsemble(), ledgerId);
+                                lh = admin.openLedger(ledgerId);
+                                isRecoveryOpen = true;
+                                break;
+                            }
                         }
                     }
                 } catch (InterruptedException e) {
                     Thread.currentThread().interrupt();
-                    LOG.info("InterruptedException "
-                            + "while replicating fragments", e);
+                    LOG.info("InterruptedException while fencing the ledger {}"
+                            + " for rereplication of postponed ledgers", ledgerId, e);
                 } catch (BKNoSuchLedgerExistsException bknsle) {
                     if (LOG.isDebugEnabled()) {
-                        LOG.debug("Ledger was deleted, safe to continue", bknsle);
+                        LOG.debug("Ledger {} was deleted, safe to continue", ledgerId, bknsle);
                     }
                 } catch (BKException e) {
-                    LOG.error("BKException while fencing the ledger"
-                            + " for rereplication of postponed ledgers", e);
+                    LOG.error("BKException while fencing the ledger {}"
+                            + " for rereplication of postponed ledgers", ledgerId, e);
                 } finally {
                     try {
                         if (lh != null) {
@@ -399,20 +416,19 @@ public class ReplicationWorker implements Runnable {
                         }
                     } catch (InterruptedException e) {
                         Thread.currentThread().interrupt();
-                        LOG.info("InterruptedException while closing "
-                                + "ledger", e);
+                        LOG.info("InterruptedException while closing ledger {}", ledgerId, e);
                     } catch (BKException e) {
                         // Lets go ahead and release the lock. Catch actual
                         // exception in normal replication flow and take
                         // action.
-                        LOG.warn("BKException while closing ledger ", e);
+                        LOG.warn("BKException while closing ledger {} ", ledgerId, e);
                     } finally {
                         try {
                             underreplicationManager
                                     .releaseUnderreplicatedLedger(ledgerId);
                         } catch (UnavailableException e) {
-                            LOG.error("UnavailableException "
-                                    + "while replicating fragments", e);
+                            LOG.error("UnavailableException while replicating fragments of ledger {}",
+                                    ledgerId, e);
                             shutdown();
                         }
                     }

-- 
To stop receiving notification emails like this one, please contact
sijie@apache.org.