You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/06/12 14:58:26 UTC

[hbase] branch branch-2 updated: HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new a4e1d07  HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)
a4e1d07 is described below

commit a4e1d073f488befd554477289148248657445702
Author: Michael Stack <sa...@users.noreply.github.com>
AuthorDate: Fri Jun 12 07:57:07 2020 -0700

    HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)
    
    Signed-off-by: Duo Zhang <zh...@apache.org>
---
 .../org/apache/hadoop/hbase/master/SplitLogManager.java | 17 ++++++++++++++++-
 .../hbase/master/procedure/ServerCrashProcedure.java    |  3 ++-
 .../apache/hadoop/hbase/master/TestSplitLogManager.java | 11 ++++++++++-
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 2bdfa03..5ac75d9 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -314,6 +314,21 @@ public class SplitLogManager {
     return false;
   }
 
+  /**
+   * Get the amount of time in milliseconds to wait till next check.
+   * Check less frequently if there is still a bunch of work to do. At a max, check every minute.
+   * At a minimum, check every 100ms. This is to alleviate the case where perhaps there are a bunch
+   * of threads waiting on a completion. For example, in the zk-based implementation, we will scan
+   * the '/hbase/splitWAL' dir every time through this loop. If there are lots of WALs to
+   * split -- could be tens of thousands if big cluster -- then it will take a while. If
+   * the Master has many SCPs waiting on wal splitting -- could be up to 10 x the configured
+   * PE thread count (default would be 160) -- then the Master will be putting up a bunch of
+   * load on zk.
+   */
+  static int getBatchWaitTimeMillis(int remainingTasks) {
+    return remainingTasks < 10? 100: remainingTasks < 100? 1000: 60_000;
+  }
+
   private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
     synchronized (batch) {
       while ((batch.done + batch.error) != batch.installed) {
@@ -338,7 +353,7 @@ public class SplitLogManager {
               return;
             }
           }
-          batch.wait(100);
+          batch.wait(getBatchWaitTimeMillis(remainingTasks));
           if (server.isStopped()) {
             LOG.warn("Stopped while waiting for log splits to be completed");
             return;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 6ca8c0c..9267b8a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -308,7 +308,8 @@ public class ServerCrashProcedure
     MasterWalManager mwm = env.getMasterServices().getMasterWalManager();
     AssignmentManager am = env.getMasterServices().getAssignmentManager();
     // TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
-    // PROBLEM!!! WE BLOCK HERE.
+    // PROBLEM!!! WE BLOCK HERE. Can block for hours if there are hundreds of WALs to split and
+    // hundreds of SCPs running because a big cluster crashed.
     am.getRegionStates().logSplitting(this.serverName);
     mwm.splitLog(this.serverName);
     if (!carryingMeta) {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
index 8e40fc4..3dbe86b 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -156,6 +156,15 @@ public class TestSplitLogManager {
     TEST_UTIL.shutdownMiniZKCluster();
   }
 
+  @Test
+  public void testBatchWaitMillis() {
+    assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(0));
+    assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(1));
+    assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(10));
+    assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(101));
+    assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(1011));
+  }
+
   private interface Expr {
     long eval();
   }