Posted to commits@hbase.apache.org by st...@apache.org on 2020/05/29 17:02:34 UTC
[hbase] branch branch-2.3 updated: HBASE-22287 inifinite retries on failed server in RSProcedureDispatcher (#1800)
This is an automated email from the ASF dual-hosted git repository.
stack pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.3 by this push:
new 41096a8 HBASE-22287 inifinite retries on failed server in RSProcedureDispatcher (#1800)
41096a8 is described below
commit 41096a83a92ae0e1e3bab0b57df298388b59d61d
Author: Michael Stack <sa...@users.noreply.github.com>
AuthorDate: Fri May 29 10:00:53 2020 -0700
HBASE-22287 inifinite retries on failed server in RSProcedureDispatcher (#1800)
Adds backoff in place of retry every 100ms.
Signed-off-by: Duo Zhang <zh...@apache.org>
---
.../master/procedure/RSProcedureDispatcher.java | 37 ++++++++++++++--------
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index 9072c3a..74e00f5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
@@ -256,31 +257,33 @@ public class RSProcedureDispatcher
}
private boolean scheduleForRetry(IOException e) {
- LOG.debug("request to {} failed, try={}", serverName, numberOfAttemptsSoFar, e);
+ LOG.debug("Request to {} failed, try={}", serverName, numberOfAttemptsSoFar, e);
// Should we wait a little before retrying? If the server is starting it's yes.
if (e instanceof ServerNotRunningYetException) {
long remainingTime = getMaxWaitTime() - EnvironmentEdgeManager.currentTime();
if (remainingTime > 0) {
- LOG.warn("waiting a little before trying on the same server={}," +
- " try={}, can wait up to {}ms", serverName, numberOfAttemptsSoFar, remainingTime);
+ LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
+ serverName, numberOfAttemptsSoFar, remainingTime);
numberOfAttemptsSoFar++;
+ // Retry every 100ms up to maximum wait time.
submitTask(this, 100, TimeUnit.MILLISECONDS);
return true;
}
- LOG.warn("server {} is not up for a while; try a new one", serverName);
+ LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying another server",
+ serverName, getMaxWaitTime());
return false;
}
if (e instanceof DoNotRetryIOException) {
- LOG.warn("server {} tells us do not retry due to {}, try={}, give up", serverName,
+ LOG.warn("{} tells us DoNotRetry due to {}, try={}, give up", serverName,
e.toString(), numberOfAttemptsSoFar);
return false;
}
- // this exception is thrown in the rpc framework, where we can make sure that the call has not
+ // This exception is thrown in the rpc framework, where we can make sure that the call has not
// been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd
- // better choose another region server
- // notice that, it is safe to quit only if this is the first time we send request to region
- // server. Maybe the region server has accept our request the first time, and then there is a
- // network error which prevents we receive the response, and the second time we hit a
+ // better choose another region server.
+ // Notice that, it is safe to quit only if this is the first time we send request to region
+ // server. Maybe the region server has accepted our request the first time, and then there is
+ // a network error which prevents we receive the response, and the second time we hit a
// CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a
// double assign...
if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) {
@@ -290,7 +293,7 @@ public class RSProcedureDispatcher
}
// Always retry for other exception types if the region server is not dead yet.
if (!master.getServerManager().isServerOnline(serverName)) {
- LOG.warn("request to {} failed due to {}, try={}, and the server is dead, give up",
+ LOG.warn("Request to {} failed due to {}, try={} and the server is not online, give up",
serverName, e.toString(), numberOfAttemptsSoFar);
return false;
}
@@ -299,14 +302,20 @@ public class RSProcedureDispatcher
// background task to check whether the region server is dead. And if it is dead, call
// remoteCallFailed to tell the upper layer. Keep retrying here does not lead to incorrect
// result, but waste some resources.
- LOG.warn("server {} is aborted or stopped, for safety we still need to" +
+ LOG.warn("{} is aborted or stopped, for safety we still need to" +
" wait until it is fully dead, try={}", serverName, numberOfAttemptsSoFar);
} else {
- LOG.warn("request to server {} failed due to {}, try={}, retrying...", serverName,
+ LOG.warn("request to {} failed due to {}, try={}, retrying...", serverName,
e.toString(), numberOfAttemptsSoFar);
}
numberOfAttemptsSoFar++;
- submitTask(this, 100, TimeUnit.MILLISECONDS);
+ // Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
+ // with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
+ // -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
+ // up to max of 10 seconds (don't want to back off too much in case of situation change).
+ submitTask(this,
+ Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
+ TimeUnit.MILLISECONDS);
return true;
}
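
For readers skimming the diff above, here is a minimal standalone sketch of the backoff schedule the patch puts in place of the flat 100ms retry. BackoffSketch and backoffMillis are illustrative names invented for this example, not part of RSProcedureDispatcher; only the 100ms base, the retry^2 growth, and the 10-second cap are taken from the patch.

    /**
     * Minimal sketch of the backoff schedule introduced by HBASE-22287, assuming the
     * dispatcher re-submits itself after each failed attempt. The class and method
     * names are illustrative, not the actual RSProcedureDispatcher API.
     */
    public class BackoffSketch {

      /** Base pause of 100ms, as in the patch. */
      private static final long BASE_PAUSE_MS = 100;
      /** Cap the pause at 10 seconds so a stuck server is still re-checked reasonably often. */
      private static final long MAX_PAUSE_MS = 10 * 1000;

      /**
       * Delay before the next retry: 100ms * attempts^2, capped at 10s. 'attempts' is the
       * count after it has been incremented for the failed try, mirroring the patch where
       * numberOfAttemptsSoFar++ runs before submitTask.
       */
      static long backoffMillis(int attempts) {
        return Math.min(BASE_PAUSE_MS * attempts * attempts, MAX_PAUSE_MS);
      }

      public static void main(String[] args) {
        // Prints 100, 400, 900, 1600, ... reaching the 10000ms cap at the 10th attempt.
        for (int attempts = 1; attempts <= 12; attempts++) {
          System.out.printf("try=%d -> wait %dms%n", attempts, backoffMillis(attempts));
        }
      }
    }

With this schedule the waits grow 100ms, 400ms, 900ms, ..., hitting the 10-second cap at the 10th attempt, so a server that keeps failing generates a retry (and a log line) at most every 10 seconds rather than ten per second, while still re-probing often enough to notice when the situation changes.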