You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/06/12 14:58:26 UTC
[hbase] branch branch-2 updated: HBASE-24545 Add backoff to SCP
check on WAL split completion (#1891)
This is an automated email from the ASF dual-hosted git repository.
stack pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new a4e1d07 HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)
a4e1d07 is described below
commit a4e1d073f488befd554477289148248657445702
Author: Michael Stack <sa...@users.noreply.github.com>
AuthorDate: Fri Jun 12 07:57:07 2020 -0700
HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)
Signed-off-by: Duo Zhang <zh...@apache.org>
---
.../org/apache/hadoop/hbase/master/SplitLogManager.java | 17 ++++++++++++++++-
.../hbase/master/procedure/ServerCrashProcedure.java | 3 ++-
.../apache/hadoop/hbase/master/TestSplitLogManager.java | 11 ++++++++++-
3 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 2bdfa03..5ac75d9 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -314,6 +314,21 @@ public class SplitLogManager {
return false;
}
+ /**
+ * Get the amount of time in milliseconds to wait until the next completion check.
+ * Check less frequently while a lot of work remains: every 100ms when fewer than 10 tasks
+ * remain, every second when fewer than 100 remain, and once a minute otherwise. This is to
+ * alleviate the case where many threads are waiting on a completion. For example, in the
+ * zk-based implementation, we scan the '/hbase/splitWAL' dir every time through this loop.
+ * If there are lots of WALs to split -- could be tens of thousands on a big cluster -- then
+ * it will take a while. If the Master has many SCPs waiting on wal splitting -- could be up
+ * to 10 x the configured PE thread count (default would be 160) -- then the Master will be
+ * putting a bunch of load on zk.
+ */
+ static int getBatchWaitTimeMillis(int remainingTasks) {
+ return remainingTasks < 10? 100: remainingTasks < 100? 1000: 60_000;
+ }
+
private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
synchronized (batch) {
while ((batch.done + batch.error) != batch.installed) {
@@ -338,7 +353,7 @@ public class SplitLogManager {
return;
}
}
- batch.wait(100);
+ batch.wait(getBatchWaitTimeMillis(remainingTasks));
if (server.isStopped()) {
LOG.warn("Stopped while waiting for log splits to be completed");
return;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 6ca8c0c..9267b8a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -308,7 +308,8 @@ public class ServerCrashProcedure
MasterWalManager mwm = env.getMasterServices().getMasterWalManager();
AssignmentManager am = env.getMasterServices().getAssignmentManager();
// TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
- // PROBLEM!!! WE BLOCK HERE.
+ // PROBLEM!!! WE BLOCK HERE. Can block for hours if there are hundreds of WALs to split
+ // and hundreds of SCPs running because a big cluster crashed.
am.getRegionStates().logSplitting(this.serverName);
mwm.splitLog(this.serverName);
if (!carryingMeta) {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
index 8e40fc4..3dbe86b 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -156,6 +156,15 @@ public class TestSplitLogManager {
TEST_UTIL.shutdownMiniZKCluster();
}
+ @Test
+ public void testBatchWaitMillis() {
+ assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(0)); // fewer than 10 tasks: 100ms
+ assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(1));
+ assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(10)); // 10 to 99 tasks: one second
+ assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(101)); // 100 or more: one minute
+ assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(1011));
+ }
+
private interface Expr {
long eval();
}