You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by eo...@apache.org on 2019/05/15 06:35:23 UTC

[bookkeeper] branch master updated: [BK-GC] avoid blocking call in gc-thread

This is an automated email from the ASF dual-hosted git repository.

eolivelli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new f5ddc36  [BK-GC] avoid blocking call in gc-thread
f5ddc36 is described below

commit f5ddc36cc30c5bab5d2e8ebc5fa552c2ad0d6eea
Author: Rajan Dhabalia <rd...@apache.org>
AuthorDate: Tue May 14 23:35:18 2019 -0700

    [BK-GC] avoid blocking call in gc-thread
    
    ### Motivation
    
    Right now, we have below 3 issues because of which gc thread gets blocked forever and it can't perform gc-task further. Below issues are mainly related to blocking call while doing zk-operation without timeout.
    
    bug-fixes:
    1. right now, [GC - ScanAndCompareGarbageCollector](https://github.com/apache/bookkeeper/blob/master/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/ScanAndCompareGarbageCollector.java#L142) passes timeout in millisecond to [LedgerManager](https://github.com/apache/bookkeeper/blob/master/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LongHierarchicalLedgerManager.java#L166) but it
    takes it as second and again try to convert it in millis so, 30Kms timeout becomes [30M ms timeout](https://github.com/apache/bookkeeper/blob/master/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java#L245). Sp, fix timeout unit during gc.
    
    2. Right now, GC makes blocking call to get list of children on ledger znode and sometime zk-call back doesn't comeback which blocks the gc-thread forever. However, recently we added the timeout on the [object-waiting-lock](https://github.com/apache/bookkeeper/blob/master/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java#L243-L248) which doesn't work because it's in while loop and `object.wait(timeout)` completes without any exception and GC threads keep running  [...]
    
    3. add zk-timeout during delete ledgers in bookie else it can also block the GC thread.
    
    
    
    
    
    ### Changes
    
    add timeout while bk-gc makes zk-call to verify deleted ledgers.
    
    Reviewers: Enrico Olivelli <eo...@gmail.com>, Sijie Guo <si...@apache.org>, Rajan Dhabalia <rd...@apache.org>
    
    This closes #1940 from rdhabalia/verify_gc
---
 .../bookie/ScanAndCompareGarbageCollector.java     | 23 ++++++++++++++++------
 .../bookkeeper/meta/CleanupLedgerManager.java      |  4 ++--
 .../apache/bookkeeper/meta/FlatLedgerManager.java  |  4 ++--
 .../bookkeeper/meta/HierarchicalLedgerManager.java |  6 +++---
 .../org/apache/bookkeeper/meta/LedgerManager.java  |  4 ++--
 .../meta/LegacyHierarchicalLedgerManager.java      | 12 +++++------
 .../meta/LongHierarchicalLedgerManager.java        | 12 +++++------
 .../bookkeeper/meta/MSLedgerManagerFactory.java    |  2 +-
 .../java/org/apache/bookkeeper/util/ZkUtils.java   | 12 ++++++++---
 .../apache/bookkeeper/bookie/CompactionTest.java   |  2 +-
 .../client/ParallelLedgerRecoveryTest.java         |  4 ++--
 .../apache/bookkeeper/meta/MockLedgerManager.java  |  2 +-
 .../metadata/etcd/EtcdLedgerManager.java           |  2 +-
 13 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/ScanAndCompareGarbageCollector.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/ScanAndCompareGarbageCollector.java
index 247b610..02e47ae 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/ScanAndCompareGarbageCollector.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/ScanAndCompareGarbageCollector.java
@@ -32,6 +32,9 @@ import java.util.SortedMap;
 import java.util.TreeSet;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
 import org.apache.bookkeeper.client.BKException;
 import org.apache.bookkeeper.conf.ServerConfiguration;
 import org.apache.bookkeeper.meta.LedgerManager;
@@ -139,8 +142,9 @@ public class ScanAndCompareGarbageCollector implements GarbageCollector {
             }
 
             // Iterate over all the ledger on the metadata store
-            long zkOpTimeout = this.conf.getZkTimeout() * 2;
-            LedgerRangeIterator ledgerRangeIterator = ledgerManager.getLedgerRanges(zkOpTimeout);
+            long zkOpTimeoutMs = this.conf.getZkTimeout() * 2;
+            LedgerRangeIterator ledgerRangeIterator = ledgerManager
+                    .getLedgerRanges(zkOpTimeoutMs);
             Set<Long> ledgersInMetadata = null;
             long start;
             long end = -1;
@@ -167,13 +171,20 @@ public class ScanAndCompareGarbageCollector implements GarbageCollector {
                         if (verifyMetadataOnGc) {
                             int rc = BKException.Code.OK;
                             try {
-                                result(ledgerManager.readLedgerMetadata(bkLid));
-                            } catch (BKException e) {
-                                rc = e.getCode();
+                                result(ledgerManager.readLedgerMetadata(bkLid), zkOpTimeoutMs, TimeUnit.MILLISECONDS);
+                            } catch (BKException | TimeoutException e) {
+                                if (e instanceof BKException) {
+                                    rc = ((BKException) e).getCode();
+                                } else {
+                                    LOG.warn("Time-out while fetching metadata for Ledger {} : {}.", bkLid,
+                                            e.getMessage());
+
+                                    continue;
+                                }
                             }
                             if (rc != BKException.Code.NoSuchLedgerExistsOnMetadataServerException) {
                                 LOG.warn("Ledger {} Missing in metadata list, but ledgerManager returned rc: {}.",
-                                         bkLid, rc);
+                                        bkLid, rc);
                                 continue;
                             }
                         }
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/CleanupLedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/CleanupLedgerManager.java
index 3c7ddc5..a3ce432 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/CleanupLedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/CleanupLedgerManager.java
@@ -210,13 +210,13 @@ public class CleanupLedgerManager implements LedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
         closeLock.readLock().lock();
         try {
             if (closed) {
                 return new ClosedLedgerRangeIterator();
             }
-            return underlying.getLedgerRanges(zkOpTimeoutSec);
+            return underlying.getLedgerRanges(zkOpTimeoutMs);
         } finally {
             closeLock.readLock().unlock();
         }
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/FlatLedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/FlatLedgerManager.java
index 8c7f05f..7c89326 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/FlatLedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/FlatLedgerManager.java
@@ -89,7 +89,7 @@ class FlatLedgerManager extends AbstractZkLedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeOutSec) {
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
         return new LedgerRangeIterator() {
             // single iterator, can visit only one time
             boolean nextCalled = false;
@@ -103,7 +103,7 @@ class FlatLedgerManager extends AbstractZkLedgerManager {
 
                 try {
                     zkActiveLedgers = ledgerListToSet(
-                            ZkUtils.getChildrenInSingleNode(zk, ledgerRootPath, zkOpTimeOutSec),
+                            ZkUtils.getChildrenInSingleNode(zk, ledgerRootPath, zkOpTimeoutMs),
                             ledgerRootPath);
                     nextRange = new LedgerRange(zkActiveLedgers);
                 } catch (KeeperException.NoNodeException e) {
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/HierarchicalLedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/HierarchicalLedgerManager.java
index 079fcfa..9cd6aed 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/HierarchicalLedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/HierarchicalLedgerManager.java
@@ -87,9 +87,9 @@ class HierarchicalLedgerManager extends AbstractHierarchicalLedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
-        LedgerRangeIterator legacyLedgerRangeIterator = legacyLM.getLedgerRanges(zkOpTimeoutSec);
-        LedgerRangeIterator longLedgerRangeIterator = longLM.getLedgerRanges(zkOpTimeoutSec);
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
+        LedgerRangeIterator legacyLedgerRangeIterator = legacyLM.getLedgerRanges(zkOpTimeoutMs);
+        LedgerRangeIterator longLedgerRangeIterator = longLM.getLedgerRanges(zkOpTimeoutMs);
         return new HierarchicalLedgerRangeIterator(legacyLedgerRangeIterator, longLedgerRangeIterator);
     }
 
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LedgerManager.java
index 15da14b..56c447d 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LedgerManager.java
@@ -149,12 +149,12 @@ public interface LedgerManager extends Closeable {
     /**
      * Loop to scan a range of metadata from metadata storage.
      *
-     * @param zkOpTimeOutSec
+     * @param zkOpTimeOutMs
      *            Iterator considers timeout while fetching ledger-range from
      *            zk.
      * @return will return a iterator of the Ranges
      */
-    LedgerRangeIterator getLedgerRanges(long zkOpTimeOutSec);
+    LedgerRangeIterator getLedgerRanges(long zkOpTimeOutMs);
 
     /**
      * Used to represent the Ledgers range returned from the
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LegacyHierarchicalLedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LegacyHierarchicalLedgerManager.java
index 1a63407..9828719 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LegacyHierarchicalLedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LegacyHierarchicalLedgerManager.java
@@ -153,8 +153,8 @@ class LegacyHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
-        return new LegacyHierarchicalLedgerRangeIterator(zkOpTimeoutSec);
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
+        return new LegacyHierarchicalLedgerRangeIterator(zkOpTimeoutMs);
     }
 
     /**
@@ -166,10 +166,10 @@ class LegacyHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager
         private String curL1Nodes = "";
         private boolean iteratorDone = false;
         private LedgerRange nextRange = null;
-        private final long zkOpTimeoutSec;
+        private final long zkOpTimeoutMs;
 
-        public LegacyHierarchicalLedgerRangeIterator(long zkOpTimeoutSec) {
-            this.zkOpTimeoutSec = zkOpTimeoutSec;
+        public LegacyHierarchicalLedgerRangeIterator(long zkOpTimeoutMs) {
+            this.zkOpTimeoutMs = zkOpTimeoutMs;
         }
 
         /**
@@ -266,7 +266,7 @@ class LegacyHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager
             String nodePath = nodeBuilder.toString();
             List<String> ledgerNodes = null;
             try {
-                ledgerNodes = ZkUtils.getChildrenInSingleNode(zk, nodePath, zkOpTimeoutSec);
+                ledgerNodes = ZkUtils.getChildrenInSingleNode(zk, nodePath, zkOpTimeoutMs);
             } catch (KeeperException.NoNodeException e) {
                 /* If the node doesn't exist, we must have raced with a recursive node removal, just
                  * return an empty list. */
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LongHierarchicalLedgerManager.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LongHierarchicalLedgerManager.java
index 95f8e48..7ec2f5a 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LongHierarchicalLedgerManager.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/LongHierarchicalLedgerManager.java
@@ -139,8 +139,8 @@ class LongHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
-        return new LongHierarchicalLedgerRangeIterator(zkOpTimeoutSec);
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
+        return new LongHierarchicalLedgerRangeIterator(zkOpTimeoutMs);
     }
 
 
@@ -149,7 +149,7 @@ class LongHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager {
      */
     private class LongHierarchicalLedgerRangeIterator implements LedgerRangeIterator {
         LedgerRangeIterator rootIterator;
-        final long zkOpTimeoutSec;
+        final long zkOpTimeoutMs;
 
         /**
          * Returns all children with path as a parent.  If path is non-existent,
@@ -163,7 +163,7 @@ class LongHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager {
          */
         List<String> getChildrenAt(String path) throws IOException {
             try {
-                List<String> children = ZkUtils.getChildrenInSingleNode(zk, path, zkOpTimeoutSec);
+                List<String> children = ZkUtils.getChildrenInSingleNode(zk, path, zkOpTimeoutMs);
                 Collections.sort(children);
                 return children;
             } catch (KeeperException.NoNodeException e) {
@@ -285,8 +285,8 @@ class LongHierarchicalLedgerManager extends AbstractHierarchicalLedgerManager {
             }
         }
 
-        private LongHierarchicalLedgerRangeIterator(long zkOpTimeoutSec) {
-            this.zkOpTimeoutSec = zkOpTimeoutSec;
+        private LongHierarchicalLedgerRangeIterator(long zkOpTimeoutMs) {
+            this.zkOpTimeoutMs = zkOpTimeoutMs;
         }
 
         private void bootstrap() throws IOException {
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/MSLedgerManagerFactory.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/MSLedgerManagerFactory.java
index 201a0a3..9b21c87 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/MSLedgerManagerFactory.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/meta/MSLedgerManagerFactory.java
@@ -642,7 +642,7 @@ public class MSLedgerManagerFactory extends AbstractZkLedgerManagerFactory {
         }
 
         @Override
-        public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
+        public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
             return new MSLedgerRangeIterator();
         }
 
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java
index 23d0b42..cc0612f 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/util/ZkUtils.java
@@ -25,7 +25,6 @@ import java.io.File;
 import java.io.IOException;
 import java.util.List;
 import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.bookkeeper.conf.AbstractConfiguration;
@@ -222,7 +221,7 @@ public class ZkUtils {
      * @throws InterruptedException
      * @throws IOException
      */
-    public static List<String> getChildrenInSingleNode(final ZooKeeper zk, final String node, long timeOutSec)
+    public static List<String> getChildrenInSingleNode(final ZooKeeper zk, final String node, long zkOpTimeoutMs)
             throws InterruptedException, IOException, KeeperException.NoNodeException {
         final GetChildrenCtx ctx = new GetChildrenCtx();
         getChildrenInSingleNode(zk, node, new GenericCallback<List<String>>() {
@@ -240,13 +239,20 @@ public class ZkUtils {
         });
 
         synchronized (ctx) {
+            long startTime = System.currentTimeMillis();
             while (!ctx.done) {
                 try {
-                    ctx.wait(timeOutSec > 0 ? TimeUnit.SECONDS.toMillis(timeOutSec) : 0);
+                    ctx.wait(zkOpTimeoutMs > 0 ? zkOpTimeoutMs : 0);
                 } catch (InterruptedException e) {
                     ctx.rc = Code.OPERATIONTIMEOUT.intValue();
                     ctx.done = true;
                 }
+                // timeout the process if get-children response not received
+                // zkOpTimeoutMs.
+                if (zkOpTimeoutMs > 0 && (System.currentTimeMillis() - startTime) >= zkOpTimeoutMs) {
+                    ctx.rc = Code.OPERATIONTIMEOUT.intValue();
+                    ctx.done = true;
+                }
             }
         }
         if (Code.NONODE.intValue() == ctx.rc) {
diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/CompactionTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/CompactionTest.java
index 0d6e5d6..adef116 100644
--- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/CompactionTest.java
+++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/CompactionTest.java
@@ -970,7 +970,7 @@ public abstract class CompactionTest extends BookKeeperClusterTestCase {
                 }
 
                 @Override
-                public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
+                public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
                     final AtomicBoolean hasnext = new AtomicBoolean(true);
                     return new LedgerManager.LedgerRangeIterator() {
                         @Override
diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/ParallelLedgerRecoveryTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/ParallelLedgerRecoveryTest.java
index e6c3c22..177cbe3 100644
--- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/ParallelLedgerRecoveryTest.java
+++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/client/ParallelLedgerRecoveryTest.java
@@ -112,8 +112,8 @@ public class ParallelLedgerRecoveryTest extends BookKeeperClusterTestCase {
         }
 
         @Override
-        public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
-            return lm.getLedgerRanges(zkOpTimeoutSec);
+        public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
+            return lm.getLedgerRanges(zkOpTimeoutMs);
         }
 
         @Override
diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/meta/MockLedgerManager.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/meta/MockLedgerManager.java
index 0f818b6..952837b 100644
--- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/meta/MockLedgerManager.java
+++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/meta/MockLedgerManager.java
@@ -188,7 +188,7 @@ public class MockLedgerManager implements LedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutSec) {
+    public LedgerRangeIterator getLedgerRanges(long zkOpTimeoutMs) {
         return null;
     }
 
diff --git a/metadata-drivers/etcd/src/main/java/org/apache/bookkeeper/metadata/etcd/EtcdLedgerManager.java b/metadata-drivers/etcd/src/main/java/org/apache/bookkeeper/metadata/etcd/EtcdLedgerManager.java
index a40a4bc..f91f2f7 100644
--- a/metadata-drivers/etcd/src/main/java/org/apache/bookkeeper/metadata/etcd/EtcdLedgerManager.java
+++ b/metadata-drivers/etcd/src/main/java/org/apache/bookkeeper/metadata/etcd/EtcdLedgerManager.java
@@ -420,7 +420,7 @@ class EtcdLedgerManager implements LedgerManager {
     }
 
     @Override
-    public LedgerRangeIterator getLedgerRanges(long opTimeOutSec) {
+    public LedgerRangeIterator getLedgerRanges(long opTimeOutMs) {
         KeyStream<Long> ks = new KeyStream<>(
             kvClient,
             ByteSequence.fromString(EtcdUtils.getLedgerKey(scope, 0L)),