Posted to commits@hbase.apache.org by bh...@apache.org on 2020/06/22 17:29:20 UTC

[hbase] branch branch-1 updated (655658c -> 0224dcc)

This is an automated email from the ASF dual-hosted git repository.

bharathv pushed a change to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git.


    from 655658c  HBASE-24550 Passing '-h' or '--help' to bin/hbase doesn't do as expected
     new 186373b  HBASE-22982: region server suspend/resume (#592)
     new 0224dcc  HBASE-24360 RollingBatchRestartRsAction loses track of dead servers

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../hadoop/hbase/DistributedHBaseCluster.java      |  32 ++++--
 .../apache/hadoop/hbase/chaos/actions/Action.java  |  78 ++++++++++----
 .../chaos/actions/RestartActionBaseAction.java     |  29 +++++
 .../chaos/actions/RollingBatchRestartRsAction.java |  14 ++-
 .../actions/RollingBatchSuspendResumeRsAction.java | 120 +++++++++++++++++++++
 .../hbase/chaos/factories/MonkeyConstants.java     |   6 ++
 .../ServerAndDependenciesKillingMonkeyFactory.java |  18 +++-
 .../factories/ServerKillingMonkeyFactory.java      |  18 +++-
 .../factories/SlowDeterministicMonkeyFactory.java  |  37 ++++++-
 .../StressAssignmentManagerMonkeyFactory.java      |  38 ++++++-
 .../chaos/monkies/PolicyBasedChaosMonkey.java      |  19 ++--
 .../java/org/apache/hadoop/hbase/HBaseCluster.java |  14 +++
 .../org/apache/hadoop/hbase/MiniHBaseCluster.java  |  36 +++++++
 13 files changed, 411 insertions(+), 48 deletions(-)
 create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java


[hbase] 01/02: HBASE-22982: region server suspend/resume (#592)

Posted by bh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bharathv pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 186373bea43925c7bd35df83ef068d9378fc2426
Author: BukrosSzabolcs <45...@users.noreply.github.com>
AuthorDate: Thu Sep 26 10:07:38 2019 +0200

    HBASE-22982: region server suspend/resume (#592)
    
    * Add chaos monkey action for suspend/resume region servers
    * Add these to relevant chaos monkeys
    
    branch-1-backport-note: The graceful regionserver restart action wasn't
    backported because it depends on the "RegionMover" script. It can be
    added later if needed.
    
    Signed-off-by: Balazs Meszaros <me...@apache.org>
    Signed-off-by: Peter Somogyi <ps...@apache.org>
---
 .../hadoop/hbase/DistributedHBaseCluster.java      |  32 ++++--
 .../apache/hadoop/hbase/chaos/actions/Action.java  |  78 ++++++++++----
 .../chaos/actions/RestartActionBaseAction.java     |  29 +++++
 .../actions/RollingBatchSuspendResumeRsAction.java | 120 +++++++++++++++++++++
 .../hbase/chaos/factories/MonkeyConstants.java     |   6 ++
 .../ServerAndDependenciesKillingMonkeyFactory.java |  18 +++-
 .../factories/ServerKillingMonkeyFactory.java      |  18 +++-
 .../factories/SlowDeterministicMonkeyFactory.java  |  37 ++++++-
 .../StressAssignmentManagerMonkeyFactory.java      |  38 ++++++-
 .../chaos/monkies/PolicyBasedChaosMonkey.java      |  19 ++--
 .../java/org/apache/hadoop/hbase/HBaseCluster.java |  14 +++
 .../org/apache/hadoop/hbase/MiniHBaseCluster.java  |  36 +++++++
 12 files changed, 401 insertions(+), 44 deletions(-)
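
As background for the patch below, the new action can also be driven on its own, outside of any monkey factory. A minimal sketch, assuming Action.ActionContext wraps an IntegrationTestingUtility the way the other chaos actions use it; the cluster size and timings are illustrative:

    import org.apache.hadoop.hbase.IntegrationTestingUtility;
    import org.apache.hadoop.hbase.chaos.actions.Action;
    import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;

    public class SuspendResumeSmokeTest {
      public static void main(String[] args) throws Exception {
        IntegrationTestingUtility util = new IntegrationTestingUtility();
        util.initializeCluster(3); // expects a running (or mini) cluster with 3 region servers
        // Suspend/resume half of the region servers, at most 2 suspended at a time,
        // sleeping 30s between steps.
        RollingBatchSuspendResumeRsAction action =
            new RollingBatchSuspendResumeRsAction(30 * 1000, 0.5f, 2);
        action.init(new Action.ActionContext(util));
        action.perform();
        util.restoreCluster();
      }
    }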

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index b477f76..27751ea 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -130,6 +130,20 @@ public class DistributedHBaseCluster extends HBaseCluster {
   }
 
   @Override
+  public void suspendRegionServer(ServerName serverName) throws IOException {
+    LOG.info("Suspend RS: " + serverName.getServerName());
+    clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
+        serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void resumeRegionServer(ServerName serverName) throws IOException {
+    LOG.info("Resume RS: " + serverName.getServerName());
+    clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
+        serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
   public void startZkNode(String hostname, int port) throws IOException {
     LOG.info("Starting Zookeeper node on: " + hostname);
     clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
@@ -206,7 +220,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   @Override
   public void stopNameNode(ServerName serverName) throws IOException {
-    LOG.info("Stopping name node on: " + serverName.getServerName());
+    LOG.info(String.format("Stopping name node on: %s", serverName.getServerName()));
     clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
       serverName.getPort());
   }
@@ -223,7 +237,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
-    LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
+    LOG.info(
+        String.format("Waiting for service: %s to stop: %s", service, serverName.getServerName()));
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
@@ -237,7 +252,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
-    LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
+    LOG.info(String.format(
+        "Waiting for service: %s to start: %s", service, serverName.getServerName()));
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
@@ -258,7 +274,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
 
   @Override
   public void startMaster(String hostname, int port) throws IOException {
-    LOG.info("Starting Master on: " + hostname + ":" + port);
+    LOG.info(String.format("Starting Master on: %s:%s", hostname, port));
     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
   }
 
@@ -437,8 +453,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
       }
     }
     if (!deferred.isEmpty()) {
-      LOG.warn("Restoring cluster - restoring region servers reported "
-              + deferred.size() + " errors:");
+      LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
+          deferred.size()));
       for (int i=0; i<deferred.size() && i < 3; i++) {
         LOG.warn(deferred.get(i));
       }
@@ -500,8 +516,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
       }
     }
     if (!deferred.isEmpty()) {
-      LOG.warn("Restoring cluster - restoring region servers reported "
-              + deferred.size() + " errors:");
+      LOG.warn(String.format("Restoring cluster - restoring region servers reported %s errors:",
+          deferred.size()));
       for (int i=0; i<deferred.size() && i < 3; i++) {
         LOG.warn(deferred.get(i));
       }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index a751b92..1e00659 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -32,6 +32,7 @@ import org.apache.hadoop.hbase.ClusterStatus;
 import org.apache.hadoop.hbase.HBaseCluster;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.IntegrationTestingUtility;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerLoad;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
@@ -154,65 +155,93 @@ public class Action {
     LOG.info("Started master: " + server);
   }
 
+  protected void stopRs(ServerName server) throws IOException {
+    LOG.info("Stopping regionserver " + server);
+    cluster.stopRegionServer(server);
+    cluster.waitForRegionServerToStop(server, killRsTimeout);
+    LOG.info(String.format("Stoppiong regionserver %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
+  protected void suspendRs(ServerName server) throws IOException {
+    LOG.info("Suspending regionserver %s" + server);
+    cluster.suspendRegionServer(server);
+    if(!(cluster instanceof MiniHBaseCluster)){
+      cluster.waitForRegionServerToStop(server, killRsTimeout);
+    }
+    LOG.info(String.format("Suspending regionserver %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
+  protected void resumeRs(ServerName server) throws IOException {
+    LOG.info("Resuming regionserver " + server);
+    cluster.resumeRegionServer(server);
+    if(!(cluster instanceof MiniHBaseCluster)){
+      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
+    }
+    LOG.info(String.format("Resuming regionserver %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
+  }
+
   protected void killRs(ServerName server) throws IOException {
-    LOG.info("Killing region server:" + server);
+    LOG.info("Killing regionserver " + server);
     cluster.killRegionServer(server);
     cluster.waitForRegionServerToStop(server, killRsTimeout);
-    LOG.info("Killed region server:" + server + ". Reported num of rs:"
-        + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed regionserver %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startRs(ServerName server) throws IOException {
-    LOG.info("Starting region server:" + server.getHostname());
+    LOG.info("Starting regionserver " + server.getAddress());
     cluster.startRegionServer(server.getHostname(), server.getPort());
     cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
-    LOG.info("Started region server:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Started regionserver %s. Reported num of rs: %s", server.getAddress(),
+      cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void killZKNode(ServerName server) throws IOException {
-    LOG.info("Killing zookeeper node:" + server);
+    LOG.info("Killing zookeeper node " + server);
     cluster.killZkNode(server);
     cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
-    LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed zookeeper node %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startZKNode(ServerName server) throws IOException {
-    LOG.info("Starting zookeeper node:" + server.getHostname());
+    LOG.info("Starting zookeeper node " + server.getHostname());
     cluster.startZkNode(server.getHostname(), server.getPort());
     cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
-    LOG.info("Started zookeeper node:" + server);
+    LOG.info("Started zookeeper node " + server);
   }
 
   protected void killDataNode(ServerName server) throws IOException {
-    LOG.info("Killing datanode:" + server);
+    LOG.info("Killing datanode " + server);
     cluster.killDataNode(server);
     cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
-    LOG.info("Killed datanode:" + server + ". Reported num of rs:"
-      + cluster.getClusterStatus().getServersSize());
+    LOG.info(String.format("Killed datanode %s. Reported num of rs: %s", server,
+        cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
   protected void startDataNode(ServerName server) throws IOException {
-    LOG.info("Starting datanode:" + server.getHostname());
+    LOG.info("Starting datanode " + server.getHostname());
     cluster.startDataNode(server);
     cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
-    LOG.info("Started datanode:" + server);
+    LOG.info("Started datanode " + server);
   }
 
   protected void killNameNode(ServerName server) throws IOException {
-    LOG.info("Killing namenode :-" + server.getHostname());
+    LOG.info("Killing namenode : " + server.getHostname());
     cluster.killNameNode(server);
     cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
-    LOG.info("Killed namenode:" + server + ". Reported num of rs:"
+    LOG.info("Killed namenode: " + server + ". Reported num of rs:"
         + cluster.getClusterStatus().getServersSize());
   }
 
   protected void startNameNode(ServerName server) throws IOException {
-    LOG.info("Starting Namenode :-" + server.getHostname());
+    LOG.info("Starting Namenode : " + server.getHostname());
     cluster.startNameNode(server);
     cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
-    LOG.info("Started namenode:" + server);
+    LOG.info("Started namenode: " +  server);
   }
 
   protected void unbalanceRegions(ClusterStatus clusterStatus,
@@ -259,6 +288,15 @@ public class Action {
     }
   }
 
+  protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception {
+    Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
+    try {
+      admin.setBalancerRunning(onOrOff, synchronous);
+    } catch (Exception e) {
+      LOG.warn("Got exception while switching balance ", e);
+    }
+  }
+
   public Configuration getConf() {
     return cluster.getConf();
   }
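
The suspendRs/resumeRs helpers added above are protected, so custom actions can compose them directly. A hedged sketch of such an action follows; the class name is hypothetical and not part of this patch:

    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.chaos.actions.Action;

    // Hypothetical action: suspend a single region server for a fixed window, then resume it.
    public class SuspendOneRsAction extends Action {
      private final long pauseMs;

      public SuspendOneRsAction(long pauseMs) {
        this.pauseMs = pauseMs;
      }

      @Override
      public void perform() throws Exception {
        ServerName[] servers = getCurrentServers(); // live region servers, provided by Action
        if (servers.length == 0) {
          return;
        }
        ServerName victim = servers[0];
        suspendRs(victim);        // distributed clusters go through ClusterManager.suspend
        Thread.sleep(pauseMs);
        resumeRs(victim);
      }
    }
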
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
index 8376f51..54836d2 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
@@ -49,19 +49,42 @@ public class RestartActionBaseAction extends Action {
       return;
     }
 
+    LOG.info("Killing master: " + server);
     killMaster(server);
     sleep(sleepTime);
+    LOG.info("Starting master: " + server);
     startMaster(server);
   }
 
+  /**
+   * Stop and then restart the region server instead of killing it.
+   * @param server hostname to restart the regionserver on
+   * @param sleepTime number of milliseconds between stop and restart
+   * @throws IOException if something goes wrong
+   */
+  void gracefulRestartRs(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    // Don't try the stop if we're stopping already
+    if (context.isStopping()) {
+      return;
+    }
+    LOG.info("Stopping region server: " + server);
+    stopRs(server);
+    sleep(sleepTime);
+    LOG.info("Starting region server: " + server);
+    startRs(server);
+  }
+
   void restartRs(ServerName server, long sleepTime) throws IOException {
     sleepTime = Math.max(sleepTime, 1000);
     // Don't try the kill if we're stopping
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing region server: " + server);
     killRs(server);
     sleep(sleepTime);
+    LOG.info("Starting region server: " + server);
     startRs(server);
   }
 
@@ -71,8 +94,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing zookeeper node: " + server);
     killZKNode(server);
     sleep(sleepTime);
+    LOG.info("Starting zookeeper node: " + server);
     startZKNode(server);
   }
 
@@ -82,8 +107,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing data node: " + server);
     killDataNode(server);
     sleep(sleepTime);
+    LOG.info("Starting data node: " + server);
     startDataNode(server);
   }
 
@@ -93,8 +120,10 @@ public class RestartActionBaseAction extends Action {
     if (context.isStopping()) {
       return;
     }
+    LOG.info("Killing name node: " + server);
     killNameNode(server);
     sleep(sleepTime);
+    LOG.info("Starting name node: " + server);
     startNameNode(server);
   }
 }
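
gracefulRestartRs lands here even though the graceful-restart chaos action itself was not backported (see the backport note in the commit message). If wired up later, an action would call it much like restartRs; the class below is a hypothetical illustration only:

    package org.apache.hadoop.hbase.chaos.actions;

    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;

    // Hypothetical action built on the gracefulRestartRs helper above.
    public class GracefulRestartRandomRsAction extends RestartActionBaseAction {
      public GracefulRestartRandomRsAction(long sleepTime) {
        super(sleepTime);
      }

      @Override
      public void perform() throws Exception {
        ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
        gracefulRestartRs(server, sleepTime); // stop, sleep, start instead of kill/start
      }
    }
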
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
new file mode 100644
index 0000000..d3c9bce
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+
+import org.apache.commons.lang.math.RandomUtils;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.util.Shell;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
+ * suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
+ * maxSuspendedServers limits the maximum number of servers that can be down at the same time
+ * during rolling restarts.
+ */
+public class RollingBatchSuspendResumeRsAction extends Action {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
+  private float ratio;
+  private long sleepTime;
+  private int maxSuspendedServers; // number of maximum suspended servers at any given time.
+
+  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
+    this(sleepTime, ratio, 5);
+  }
+
+  public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
+    this.ratio = ratio;
+    this.sleepTime = sleepTime;
+    this.maxSuspendedServers = maxSuspendedServers;
+  }
+
+  enum SuspendOrResume {
+    SUSPEND, RESUME
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
+        (int) (ratio * 100)));
+    List<ServerName> selectedServers = selectServers();
+
+    Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
+    Queue<ServerName> suspendedServers = new LinkedList<>();
+
+    // loop while there are servers to be suspended or suspended servers to be resumed
+    while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
+        .isStopping()) {
+      SuspendOrResume action;
+
+      if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
+        action = SuspendOrResume.RESUME;
+      } else if (suspendedServers.isEmpty()) {
+        action = SuspendOrResume.SUSPEND; // no more servers to resume
+      } else if (suspendedServers.size() >= maxSuspendedServers) {
+        // we have too many suspended servers. Don't suspend any more
+        action = SuspendOrResume.RESUME;
+      } else {
+        // do a coin toss
+        action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
+      }
+
+      ServerName server;
+      switch (action) {
+        case SUSPEND:
+          server = serversToBeSuspended.remove();
+          try {
+            suspendRs(server);
+          } catch (Shell.ExitCodeException e) {
+            LOG.warn("Problem suspending but presume successful; code=" + e.getExitCode(), e);
+          }
+          suspendedServers.add(server);
+          break;
+        case RESUME:
+          server = suspendedServers.remove();
+          try {
+            resumeRs(server);
+          } catch (Shell.ExitCodeException e) {
+            LOG.info("Problem resuming, will retry; code= " + e.getExitCode(), e);
+          }
+          break;
+        default:
+          throw new IllegalArgumentException(
+              "Encountered unexpected action type: " + action.name());
+      }
+
+      LOG.info("Sleeping for: " + sleepTime);
+      Threads.sleep(sleepTime);
+    }
+  }
+
+  protected List<ServerName> selectServers() throws IOException {
+    return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
+  }
+
+}
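
A usage note on the class above: the two-argument constructor defaults maxSuspendedServers to 5, while the factory changes below pass only sleep time and ratio. To schedule the action on its own, the existing policy classes are enough; a minimal sketch with illustrative timings, where util is the test's IntegrationTestingUtility:

    // Sketch: run the suspend/resume action every minute via the standard policy machinery.
    Policy suspendPolicy = new PeriodicRandomActionPolicy(60 * 1000,
        new RollingBatchSuspendResumeRsAction(30 * 1000, 1.0f, 2));
    ChaosMonkey monkey = new PolicyBasedChaosMonkey(util, suspendPolicy);
    monkey.start();
    // ... run the workload under test ...
    monkey.stop("test finished");
    monkey.waitForStop();
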
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
index 5657d39..9051e98 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
@@ -45,6 +45,9 @@ public interface MonkeyConstants {
   String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period";
   String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs";
   String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time";
+  String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
+  String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
+  String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
 
   /**
    * A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky.
@@ -75,4 +78,7 @@ public interface MonkeyConstants {
   long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000;
   boolean DEFAULT_UNBALANCE_KILL_META_RS = true;
   long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000;
+  long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
+  long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
+  float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
 }
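
The three keys and defaults above are intended to drive the suspend/resume action. A short sketch of reading them from a monkey properties object; note that the factory changes below read the ROLLING_BATCH_RESTART_* keys rather than these new ones, so this is illustrative only:

    // Illustrative only: reading the new suspend/resume keys with their defaults.
    Properties monkeyProps = new Properties();
    monkeyProps.setProperty(MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME, "30000");
    monkeyProps.setProperty(MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO, "0.5");

    long suspendSleepTime = Long.parseLong(monkeyProps.getProperty(
        MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
        MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME + ""));
    float suspendRatio = Float.parseFloat(monkeyProps.getProperty(
        MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
        MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
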
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
index 4faa786..1b1eef0 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -38,8 +39,12 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
  */
 public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
 
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendtRSRatio;
+
   @Override
   public ChaosMonkey build() {
+    loadProperties();
 
     // Destructive actions to mess things around. Cannot run batch restart.
     Action[] actions1 = new Action[]{
@@ -48,7 +53,9 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
       new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
       new ForceBalancerAction(),
       new RestartRandomDataNodeAction(60000),
-      new RestartRandomZKNodeAction(60000)
+      new RestartRandomZKNodeAction(60000),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+          rollingBatchSuspendtRSRatio)
     };
 
     // Action to log more info for debugging
@@ -62,4 +69,13 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
         new PeriodicRandomActionPolicy(60 * 1000, actions1)),
       new PeriodicRandomActionPolicy(60 * 1000, actions2));
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+  }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
index 02b5914..360abb8 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -36,15 +37,21 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
  */
 public class ServerKillingMonkeyFactory extends MonkeyFactory {
 
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendtRSRatio;
+
   @Override
   public ChaosMonkey build() {
+    loadProperties();
 
     // Destructive actions to mess things around. Cannot run batch restart
     Action[] actions1 = new Action[] {
         new RestartRandomRsExceptMetaAction(60000),
         new RestartActiveMasterAction(5000),
         new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead
-        new ForceBalancerAction()
+      new ForceBalancerAction(),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+          rollingBatchSuspendtRSRatio)
     };
 
     // Action to log more info for debugging
@@ -58,4 +65,13 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory {
           new PeriodicRandomActionPolicy(60 * 1000, actions1)),
       new PeriodicRandomActionPolicy(60 * 1000, actions2));
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+  }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
index 093df2a..76e0877 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
@@ -18,7 +18,32 @@
 
 package org.apache.hadoop.hbase.chaos.factories;
 
-import org.apache.hadoop.hbase.chaos.actions.*;
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeBloomFilterAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeCompressionAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeEncodingAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeVersionsAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactTableAction;
+import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
+import org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -44,6 +69,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
   private float compactTableRatio;
   private float compactRandomRegionRatio;
   private long decreaseHFileSizeSleepTime;
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendtRSRatio;
 
   @Override
   public ChaosMonkey build() {
@@ -89,6 +116,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
         new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime),
         new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName),
         new SplitAllRegionOfTableAction(tableName),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+          rollingBatchSuspendtRSRatio)
     };
 
     // Action to log more info for debugging
@@ -158,5 +187,11 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
     decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty(
         MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME,
         MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + ""));
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
   }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
index 03471ab..7f8fb26 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
@@ -17,7 +17,26 @@
  */
 package org.apache.hadoop.hbase.chaos.factories;
 
-import org.apache.hadoop.hbase.chaos.actions.*;
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.BatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.ChangeSplitPolicyAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.CompactTableAction;
+import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
+import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
 import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
@@ -25,9 +44,15 @@ import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
 import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
 
 public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
+
+  private long rollingBatchSuspendRSSleepTime;
+  private float rollingBatchSuspendtRSRatio;
+
   @Override
   public ChaosMonkey build() {
 
+    loadProperties();
+
     // Actions that could slow down region movement.
     // These could also get regions stuck if there are issues.
     Action[] actions1 = new Action[]{
@@ -55,6 +80,8 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
         new SplitAllRegionOfTableAction(tableName),
         new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME,
             tableName),
+      new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
+          rollingBatchSuspendtRSRatio)
     };
 
     // Action to log more info for debugging
@@ -70,4 +97,13 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
         new PeriodicRandomActionPolicy(90 * 1000, actions3)
     );
   }
+
+  private void loadProperties() {
+    rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+    rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
+        MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
+        MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+  }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
index 57f7c83..277e221 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
@@ -18,8 +18,9 @@
 
 package org.apache.hadoop.hbase.chaos.monkies;
 
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.commons.lang.math.RandomUtils;
@@ -90,18 +91,12 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey {
 
   /** Selects and returns ceil(ratio * items.length) random items from the given array */
   public static <T> List<T> selectRandomItems(T[] items, float ratio) {
-    int remaining = (int)Math.ceil(items.length * ratio);
+    int selectedNumber = (int)Math.ceil(items.length * ratio);
 
-    List<T> selectedItems = new ArrayList<T>(remaining);
-
-    for (int i=0; i<items.length && remaining > 0; i++) {
-      if (RandomUtils.nextFloat() < ((float)remaining/(items.length-i))) {
-        selectedItems.add(items[i]);
-        remaining--;
-      }
-    }
-
-    return selectedItems;
+    List<T> originalItems = Arrays.asList(items);
+    Collections.shuffle(originalItems);
+    int startIndex = RandomUtils.nextInt(items.length - selectedNumber);
+    return originalItems.subList(startIndex, startIndex + selectedNumber);
   }
 
   private Policy[] policies;
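
Two observations on the rewritten selectRandomItems above: Arrays.asList returns a view backed by the caller's array, so the shuffle reorders the input in place, and with ratio 1.0 the bound handed to RandomUtils.nextInt is zero, which the commons-lang implementation rejects. A defensive variant under those assumptions, not part of the patch:

    // Defensive variant (illustrative only): shuffle a copy of the input and guard the
    // ratio == 1.0 case, where the random bound above would be zero.
    public static <T> List<T> selectRandomItems(T[] items, float ratio) {
      int selectedNumber = (int) Math.ceil(items.length * ratio);
      List<T> shuffled = new ArrayList<T>(Arrays.asList(items)); // copy; leave items untouched
      Collections.shuffle(shuffled);
      int maxStart = items.length - selectedNumber;
      int startIndex = maxStart > 0 ? RandomUtils.nextInt(maxStart) : 0;
      return shuffled.subList(startIndex, startIndex + selectedNumber);
    }
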
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
index 7de1ecf..001ab2c 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
@@ -161,6 +161,20 @@ public abstract class HBaseCluster implements Closeable, Configurable {
       throws IOException;
 
   /**
+   * Suspend the region server
+   * @param serverName the hostname to suspend the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
+
+  /**
+   * Resume the region server
+   * @param serverName the hostname to resume the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
+
+  /**
    * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
    * silently logs warning message.
    * @param hostname the hostname to start the regionserver on
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index 7792be3..714e72b 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -259,6 +259,16 @@ public class MiniHBaseCluster extends HBaseCluster {
   }
 
   @Override
+  public void suspendRegionServer(ServerName serverName) throws IOException {
+    suspendRegionServer(getRegionServerIndex(serverName));
+  }
+
+  @Override
+  public void resumeRegionServer(ServerName serverName) throws IOException {
+    resumeRegionServer(getRegionServerIndex(serverName));
+  }
+
+  @Override
   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
     //ignore timeout for now
     waitOnRegionServer(getRegionServerIndex(serverName));
@@ -424,6 +434,32 @@ public class MiniHBaseCluster extends HBaseCluster {
   }
 
   /**
+   * Suspend the specified region server
+   * @param serverNumber Used as index into a list.
+   * @return server instance that was suspended.
+   */
+  public JVMClusterUtil.RegionServerThread suspendRegionServer(int serverNumber) {
+    JVMClusterUtil.RegionServerThread server =
+        hbaseCluster.getRegionServers().get(serverNumber);
+    LOG.info("Suspending " + server.toString());
+    server.suspend();
+    return server;
+  }
+
+  /**
+   * Resume the specified region server
+   * @param serverNumber Used as index into a list.
+   * @return server instance that was resumed.
+   */
+  public JVMClusterUtil.RegionServerThread resumeRegionServer(int serverNumber) {
+    JVMClusterUtil.RegionServerThread server =
+        hbaseCluster.getRegionServers().get(serverNumber);
+    LOG.info("Resuming " + server.toString());
+    server.resume();
+    return server;
+  }
+
+  /**
    * Wait for the specified region server to stop. Removes this thread from list
    * of running threads.
    * @param serverNumber
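
The mini-cluster hooks above make the suspend path reachable from plain unit tests; on MiniHBaseCluster the calls come down to Thread.suspend()/resume() on the region server thread. A hedged test fragment, with names illustrative:

    // Illustrative unit-test fragment: pause one region server and later resume it.
    HBaseTestingUtility util = new HBaseTestingUtility();
    util.startMiniCluster(3);
    MiniHBaseCluster cluster = util.getHBaseCluster();

    ServerName rs = cluster.getRegionServer(0).getServerName();
    cluster.suspendRegionServer(rs);   // delegates to suspendRegionServer(int) above
    // ... assert how the master reacts to the unresponsive region server ...
    cluster.resumeRegionServer(rs);    // delegates to resumeRegionServer(int) above

    util.shutdownMiniCluster();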


[hbase] 02/02: HBASE-24360 RollingBatchRestartRsAction loses track of dead servers

Posted by bh...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

bharathv pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 0224dccdb89952395909f65f2e630822023c2582
Author: Nick Dimiduk <nd...@apache.org>
AuthorDate: Mon May 18 12:08:52 2020 -0700

    HBASE-24360 RollingBatchRestartRsAction loses track of dead servers
    
    `RollingBatchRestartRsAction` doesn't handle failure cases when
    tracking its list of dead servers. The original author believed that a
    failure to restart would result in a retry. However, by removing the
    dead server from the failed list, that state is lost, and retry never
    occurs. Because this action doesn't ever look back to the current
    state of the cluster, relying only on its local state for the current
    action invocation, it never realizes the abandoned server is still
    dead. Instead, be more careful to only remove the dead server from the
    list when the `startRs` invocation claims to have been successful.
    
    Signed-off-by: stack <st...@apache.org>
    (cherry picked from commit 0dae377f53db6388a933fd7a1ebe231cf0d997d7)
---
 .../java/org/apache/hadoop/hbase/chaos/actions/Action.java |  2 +-
 .../hbase/chaos/actions/RollingBatchRestartRsAction.java   | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)
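
The pattern the message describes, distilled from the diff that follows (an illustrative restatement only): keep the server on the dead list until startRs succeeds, so a failed start gets retried.

    // Peek first, remove only after a successful start.
    ServerName server = Objects.requireNonNull(deadServers.peek());
    try {
      startRs(server);
      deadServers.remove(server);       // forget the server only once it is back up
    } catch (Shell.ExitCodeException e) {
      // Leave it on the list and shuffle so a stubborn host does not block the others.
      Collections.shuffle(deadServers);
    }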

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 1e00659..7a2016e 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -159,7 +159,7 @@ public class Action {
     LOG.info("Stopping regionserver " + server);
     cluster.stopRegionServer(server);
     cluster.waitForRegionServerToStop(server, killRsTimeout);
-    LOG.info(String.format("Stoppiong regionserver %s. Reported num of rs: %s", server,
+    LOG.info(String.format("Stopping regionserver %s. Reported num of rs: %s", server,
         cluster.getClusterStatus().getLiveServersLoad().size()));
   }
 
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
index 3473407..7db9f5a 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
@@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Objects;
 import java.util.Queue;
 
 import org.apache.commons.lang.math.RandomUtils;
@@ -61,7 +63,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
     List<ServerName> selectedServers = selectServers();
 
     Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
-    Queue<ServerName> deadServers = new LinkedList<ServerName>();
+    LinkedList<ServerName> deadServers = new LinkedList<ServerName>();
 
     // loop while there are servers to be killed or dead servers to be restarted
     while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty())  && !context.isStopping()) {
@@ -94,13 +96,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
           deadServers.add(server);
           break;
         case START:
+          server = Objects.requireNonNull(deadServers.peek());
           try {
-            server = deadServers.remove();
             startRs(server);
+            // only remove the server from the known dead list if `startRs` succeeds.
+            deadServers.remove(server);
           } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
             // The start may fail but better to just keep going though we may lose server.
-            //
-            LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
+            // Shuffle the dead list to avoid getting stuck on a single stubborn host.
+            Collections.shuffle(deadServers);
+            LOG.info(String.format(
+              "Problem starting %s, will retry; code=%s", server, e.getExitCode()), e);
           }
           break;
       }