You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by nd...@apache.org on 2020/05/18 19:09:17 UTC

[hbase] branch branch-2.3 updated: HBASE-24360 RollingBatchRestartRsAction loses track of dead servers

This is an automated email from the ASF dual-hosted git repository.

ndimiduk pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new ce9051e  HBASE-24360 RollingBatchRestartRsAction loses track of dead servers
ce9051e is described below

commit ce9051ea915f80b4dc07c1edbc8ccb25edbfb9de
Author: Nick Dimiduk <nd...@apache.org>
AuthorDate: Mon May 18 12:08:52 2020 -0700

    HBASE-24360 RollingBatchRestartRsAction loses track of dead servers
    
    `RollingBatchRestartRsAction` doesn't handle failure cases when
    tracking its list of dead servers. The original author believed that a
    failure to restart would result in a retry. However, because the dead
    server is removed from the dead list before the restart is attempted,
    that state is lost when the restart fails, and the retry never
    occurs. Because this action doesn't ever look back to the current
    state of the cluster, relying only on its local state for the current
    action invocation, it never realizes the abandoned server is still
    dead. Instead, be more careful to only remove the dead server from the
    list when the `startRs` invocation claims to have been successful.
    
    Signed-off-by: stack <st...@apache.org>
---
 .../java/org/apache/hadoop/hbase/chaos/actions/Action.java |  2 +-
 .../hbase/chaos/actions/RollingBatchRestartRsAction.java   | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 13b67ae..113f1aa 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -190,7 +190,7 @@ public abstract class Action {
     getLogger().info("Stopping regionserver {}", server);
     cluster.stopRegionServer(server);
     cluster.waitForRegionServerToStop(server, killRsTimeout);
-    getLogger().info("Stoppiong regionserver {}. Reported num of rs:{}", server,
+    getLogger().info("Stopping regionserver {}. Reported num of rs:{}", server,
       cluster.getClusterMetrics().getLiveServerMetrics().size());
   }
 
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
index bd136bb..c25a6b3 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
@@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Objects;
 import java.util.Queue;
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.hbase.ServerName;
@@ -70,7 +72,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
     List<ServerName> selectedServers = selectServers();
 
     Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
-    Queue<ServerName> deadServers = new LinkedList<>();
+    LinkedList<ServerName> deadServers = new LinkedList<>();
 
     // loop while there are servers to be killed or dead servers to be restarted
     while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty())  && !context.isStopping()) {
@@ -103,13 +105,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
           deadServers.add(server);
           break;
         case START:
+          server = Objects.requireNonNull(deadServers.peek());
           try {
-            server = deadServers.remove();
             startRs(server);
+            // only remove the server from the known dead list if `startRs` succeeds.
+            deadServers.remove(server);
           } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
             // The start may fail but better to just keep going though we may lose server.
-            //
-            getLogger().info("Problem starting, will retry; code={}", e.getExitCode(), e);
+            // Shuffle the dead list to avoid getting stuck on a single stubborn host.
+            Collections.shuffle(deadServers);
+            getLogger().info(
+              "Problem starting {}, will retry; code={}", server, e.getExitCode(), e);
           }
           break;
       }