Posted to commits@hbase.apache.org by me...@apache.org on 2019/11/19 09:16:10 UTC
[hbase] branch branch-2 updated: HBASE-23085 Network and Data related Actions
This is an automated email from the ASF dual-hosted git repository.
meszibalu pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new 54be3d1 HBASE-23085 Network and Data related Actions
54be3d1 is described below
commit 54be3d1d867f390487b9386459797b86c916ee0b
Author: BukrosSzabolcs <bu...@gmail.com>
AuthorDate: Wed Nov 6 07:43:01 2019 -0600
HBASE-23085 Network and Data related Actions
Add monkey actions:
- manipulate network packages with tc (reorder, lose, ...)
- add CPU load
- fill the disk
- corrupt or delete regionserver data files
Extend HBaseClusterManager to allow sudo calls.
Signed-off-by: Josh Elser <el...@apache.org>
Signed-off-by: Balazs Meszaros <me...@apache.org>
---
.../apache/hadoop/hbase/HBaseClusterManager.java | 90 ++++++++++++++-
.../hbase/chaos/actions/AddCPULoadAction.java | 69 ++++++++++++
.../chaos/actions/CorruptDataFilesAction.java | 75 +++++++++++++
.../actions/CorruptPackagesCommandAction.java | 72 ++++++++++++
.../chaos/actions/DelayPackagesCommandAction.java | 71 ++++++++++++
.../hbase/chaos/actions/DeleteDataFilesAction.java | 66 +++++++++++
.../actions/DuplicatePackagesCommandAction.java | 72 ++++++++++++
.../hbase/chaos/actions/FillDiskCommandAction.java | 83 ++++++++++++++
.../chaos/actions/LosePackagesCommandAction.java | 72 ++++++++++++
.../actions/ReorderPackagesCommandAction.java | 76 +++++++++++++
.../hbase/chaos/actions/SudoCommandAction.java | 70 ++++++++++++
.../hbase/chaos/actions/TCCommandAction.java | 33 ++++++
.../chaos/factories/DataIssuesMonkeyFactory.java | 72 ++++++++++++
.../factories/DistributedIssuesMonkeyFactory.java | 121 +++++++++++++++++++++
.../hbase/chaos/factories/MonkeyConstants.java | 29 ++++-
.../hbase/chaos/factories/MonkeyFactory.java | 4 +
.../ServerAndDependenciesKillingMonkeyFactory.java | 8 +-
.../factories/ServerKillingMonkeyFactory.java | 8 +-
.../factories/SlowDeterministicMonkeyFactory.java | 8 +-
.../StressAssignmentManagerMonkeyFactory.java | 12 +-
20 files changed, 1085 insertions(+), 26 deletions(-)
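The two new factories are registered in MonkeyFactory under the names "distributedIssues" and
"dataIssues" (see the MonkeyFactory diff below). A minimal usage sketch, not part of this patch:
it assumes the existing MonkeyFactory builder setters (setUtil, setProperties) and the usual
ChaosMonkey start/stop lifecycle, so treat those calls as assumptions:

    // Illustrative only: look up one of the new chaos monkeys by name and run it.
    IntegrationTestingUtility util = new IntegrationTestingUtility();
    Properties monkeyProps = new Properties();   // new keys are listed in MonkeyConstants below
    ChaosMonkey monkey = MonkeyFactory.getFactory(MonkeyFactory.DISTRIBUTED_ISSUES)
        .setUtil(util)
        .setProperties(monkeyProps)
        .build();
    monkey.start();
    // ... run the workload under chaos ...
    monkey.stop("test finished");
    monkey.waitForStop();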
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index f7c2fc6..2f75c73 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -62,6 +62,15 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
"timeout 30 /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
private String tunnelCmd;
+ /**
+ * The command format that is used to execute the remote command with sudo. Arguments:
+ * 1 SSH options, 2 user name, 3 "@" if username is set, 4 host,
+ * 5 original command, 6 timeout.
+ */
+ private static final String DEFAULT_TUNNEL_SUDO_CMD =
+ "timeout %6$s /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo %5$s\"";
+ private String tunnelSudoCmd;
+
private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
private static final int DEFAULT_RETRY_ATTEMPTS = 5;
@@ -86,6 +95,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
sshOptions = (sshOptions == null) ? "" : sshOptions;
sshUserName = (sshUserName == null) ? "" : sshUserName;
tunnelCmd = conf.get("hbase.it.clustermanager.ssh.cmd", DEFAULT_TUNNEL_CMD);
+ tunnelSudoCmd = conf.get("hbase.it.clustermanager.ssh.sudo.cmd", DEFAULT_TUNNEL_SUDO_CMD);
// Print out ssh special config if any.
if ((sshUserName != null && sshUserName.length() > 0) ||
(sshOptions != null && sshOptions.length() > 0)) {
@@ -152,10 +162,32 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
LOG.info("Executing full command [" + cmd + "]");
return new String[] { "/usr/bin/env", "bash", "-c", cmd };
}
+ }
+
+ /**
+ * Executes commands over SSH using sudo
+ */
+ protected class RemoteSudoShell extends Shell.ShellCommandExecutor {
+ private String hostname;
+
+ public RemoteSudoShell(String hostname, String[] execString, long timeout) {
+ this(hostname, execString, null, null, timeout);
+ }
+
+ public RemoteSudoShell(String hostname, String[] execString, File dir, Map<String, String> env,
+ long timeout) {
+ super(execString, dir, env, timeout);
+ this.hostname = hostname;
+ }
@Override
- public void execute() throws IOException {
- super.execute();
+ public String[] getExecString() {
+ String at = sshUserName.isEmpty() ? "" : "@";
+ String remoteCmd = StringUtils.join(super.getExecString(), " ");
+ String cmd = String.format(tunnelSudoCmd, sshOptions, sshUserName, at, hostname, remoteCmd,
+ timeOutInterval/1000f);
+ LOG.info("Executing full command [" + cmd + "]");
+ return new String[] { "/usr/bin/env", "bash", "-c", cmd };
}
}
@@ -299,7 +331,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
*/
private Pair<Integer, String> exec(String hostname, ServiceType service, String... cmd)
throws IOException {
- LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
+ LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "),
+ hostname);
RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd);
try {
@@ -312,8 +345,8 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
+ ", stdout: " + output);
}
- LOG.info("Executed remote command, exit code:" + shell.getExitCode()
- + " , output:" + shell.getOutput());
+ LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(),
+ shell.getOutput());
return new Pair<>(shell.getExitCode(), shell.getOutput());
}
@@ -331,7 +364,52 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
retryCounter.sleepUntilNextRetry();
} catch (InterruptedException ex) {
// ignore
- LOG.warn("Sleep Interrupted:" + ex);
+ LOG.warn("Sleep Interrupted:", ex);
+ }
+ }
+ }
+
+ /**
+ * Execute the given command on the host using SSH
+ * @return pair of exit code and command output
+ * @throws IOException if something goes wrong.
+ */
+ public Pair<Integer, String> execSudo(String hostname, long timeout, String... cmd)
+ throws IOException {
+ LOG.info("Executing remote command: {} , hostname:{}", StringUtils.join(cmd, " "),
+ hostname);
+
+ RemoteSudoShell shell = new RemoteSudoShell(hostname, cmd, timeout);
+ try {
+ shell.execute();
+ } catch (Shell.ExitCodeException ex) {
+ // capture the stdout of the process as well.
+ String output = shell.getOutput();
+ // add output for the ExitCodeException.
+ throw new Shell.ExitCodeException(ex.getExitCode(), "stderr: " + ex.getMessage()
+ + ", stdout: " + output);
+ }
+
+ LOG.info("Executed remote command, exit code:{} , output:{}", shell.getExitCode(),
+ shell.getOutput());
+
+ return new Pair<>(shell.getExitCode(), shell.getOutput());
+ }
+
+ public Pair<Integer, String> execSudoWithRetries(String hostname, long timeout, String... cmd)
+ throws IOException {
+ RetryCounter retryCounter = retryCounterFactory.create();
+ while (true) {
+ try {
+ return execSudo(hostname, timeout, cmd);
+ } catch (IOException e) {
+ retryOrThrow(retryCounter, e, hostname, cmd);
+ }
+ try {
+ retryCounter.sleepUntilNextRetry();
+ } catch (InterruptedException ex) {
+ // ignore
+ LOG.warn("Sleep Interrupted:", ex);
}
}
}
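The sudo tunnel command is configurable via "hbase.it.clustermanager.ssh.sudo.cmd", mirroring the
existing "hbase.it.clustermanager.ssh.cmd" key. A minimal sketch of driving it directly; the host
name and the remote command are illustrative only, while the property key, the placeholder format
and the execSudoWithRetries signature come from the diff above:

    // Illustrative only: customize the sudo SSH command, then run a remote command with retries.
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.it.clustermanager.ssh.sudo.cmd",
        "timeout %6$s /usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo %5$s\"");
    HBaseClusterManager manager = new HBaseClusterManager();
    manager.setConf(conf);   // setConf picks up the ssh/sudo tunnel settings
    Pair<Integer, String> result =
        manager.execSudoWithRetries("rs-host.example.com", 30000, "tc", "qdisc", "show");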
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java
new file mode 100644
index 0000000..9d6437e
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddCPULoadAction.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Action that adds high CPU load to a random regionserver for a given duration
+ */
+public class AddCPULoadAction extends SudoCommandAction {
+ protected static final Logger LOG = LoggerFactory.getLogger(AddCPULoadAction.class);
+ private static final String CPU_LOAD_COMMAND =
+ "seq 1 %s | xargs -I{} -n 1 -P %s timeout %s dd if=/dev/urandom of=/dev/null bs=1M " +
+ "iflag=fullblock";
+
+ private final long duration;
+ private long processes;
+
+ /**
+ * Add high load to the CPU
+ *
+ * @param duration Duration for which the load is generated, in milliseconds
+ * @param processes The number of parallel processes; should equal the number of CPU threads for maximum load
+ */
+ public AddCPULoadAction(long duration, long processes, long timeout) {
+ super(timeout);
+ this.duration = duration;
+ this.processes = processes;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute AddCPULoadAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudo(hostname, timeout, getCommand());
+ } catch (IOException ex){
+ // This will always happen. We use timeout to kill a continuously running process
+ // after the duration expires.
+ }
+ LOG.info("Finished to execute AddCPULoadAction");
+ }
+
+ private String getCommand(){
+ return String.format(CPU_LOAD_COMMAND, processes, processes, duration/1000f);
+ }
+}
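As a worked example, with the defaults added to MonkeyConstants below (2 processes, a 5 minute
duration), getCommand() renders to roughly:

    seq 1 2 | xargs -I{} -n 1 -P 2 timeout 300.0 dd if=/dev/urandom of=/dev/null bs=1M iflag=fullblock

i.e. two parallel dd readers of /dev/urandom that timeout kills once the duration expires, which
is why the surrounding catch block treats the resulting non-zero exit code as expected.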
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java
new file mode 100644
index 0000000..83e8fe0
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptDataFilesAction.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Action corrupts HFiles with a certain chance.
+ */
+public class CorruptDataFilesAction extends Action {
+ private static final Logger LOG = LoggerFactory.getLogger(CorruptDataFilesAction.class);
+ private float chance;
+
+ /**
+ * Corrupts HFiles with a certain chance
+ * @param chance chance to corrupt any given data file (0.5 => 50%)
+ */
+ public CorruptDataFilesAction(float chance) {
+ this.chance = chance * 100;
+ }
+
+ @Override
+ public void perform() throws Exception {
+ LOG.info("Start corrupting data files");
+
+ FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf());
+ Path rootDir = CommonFSUtils.getRootDir(getConf());
+ Path defaultDir = rootDir.suffix("/data/default");
+ RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true);
+ while (iterator.hasNext()){
+ LocatedFileStatus status = iterator.next();
+ if(!HFile.isHFileFormat(fs, status.getPath())){
+ continue;
+ }
+ if(RandomUtils.nextFloat(0, 100) > chance){
+ continue;
+ }
+
+ FSDataOutputStream out = fs.create(status.getPath(), true);
+ try {
+ out.write(0);
+ } finally {
+ out.close();
+ }
+ LOG.info("Corrupting {}", status.getPath());
+ }
+ LOG.info("Done corrupting data files");
+ }
+
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java
new file mode 100644
index 0000000..a89d558
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CorruptPackagesCommandAction.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Corrupt network packages on a random regionserver.
+ */
+public class CorruptPackagesCommandAction extends TCCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(CorruptPackagesCommandAction.class);
+ private float ratio;
+ private long duration;
+
+ /**
+ * Corrupt network packages on a random regionserver.
+ *
+ * @param ratio the ratio of packages corrupted
+ * @param duration the time this issue persists in milliseconds
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ * @param network network interface the regionserver uses for communication
+ */
+ public CorruptPackagesCommandAction(float ratio, long duration, long timeout, String network) {
+ super(timeout, network);
+ this.ratio = ratio;
+ this.duration = duration;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute CorruptPackagesCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+ }
+
+ LOG.info("Finished to execute CorruptPackagesCommandAction");
+ }
+
+ private String getCommand(String operation){
+ return String.format("tc qdisc %s dev %s root netem corrupt %s%%", operation, network,
+ ratio * 100);
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java
new file mode 100644
index 0000000..e4de0a2
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DelayPackagesCommandAction.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Action adds latency to communication on a random regionserver.
+ */
+public class DelayPackagesCommandAction extends TCCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(DelayPackagesCommandAction.class);
+ private long delay;
+ private long duration;
+
+ /**
+ * Adds latency to communication on a random region server
+ *
+ * @param delay the latency will be delay +/- 50%, in milliseconds
+ * @param duration the time this issue persists in milliseconds
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ * @param network network interface the regionserver uses for communication
+ */
+ public DelayPackagesCommandAction(long delay, long duration, long timeout, String network) {
+ super(timeout, network);
+ this.delay = delay;
+ this.duration = duration;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute DelayPackagesCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+ }
+
+ LOG.info("Finished to execute DelayPackagesCommandAction");
+ }
+
+ private String getCommand(String operation){
+ return String.format("tc qdisc %s dev %s root netem delay %sms %sms",
+ operation, network, delay, delay/2);
+ }
+}
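For illustration, with the MonkeyConstants defaults below (100 ms delay on eth0), the action wraps
the sleep between roughly these two commands (ADD before the sleep, DELETE in the finally block):

    tc qdisc add dev eth0 root netem delay 100ms 50ms
    tc qdisc del dev eth0 root netem delay 100ms 50ms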
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java
new file mode 100644
index 0000000..4919adc
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DeleteDataFilesAction.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hbase.io.hfile.HFile;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Action deletes HFiles with a certain chance.
+ */
+public class DeleteDataFilesAction extends Action {
+ private static final Logger LOG = LoggerFactory.getLogger(DeleteDataFilesAction.class);
+ private float chance;
+
+ /**
+ * Deletes HFiles with a certain chance
+ * @param chance chance to delete any given data file (0.5 => 50%)
+ */
+ public DeleteDataFilesAction(float chance) {
+ this.chance = chance * 100;
+ }
+
+ @Override
+ public void perform() throws Exception {
+ LOG.info("Start deleting data files");
+ FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf());
+ Path rootDir = CommonFSUtils.getRootDir(getConf());
+ Path defaultDir = rootDir.suffix("/data/default");
+ RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(defaultDir, true);
+ while (iterator.hasNext()){
+ LocatedFileStatus status = iterator.next();
+ if(!HFile.isHFileFormat(fs, status.getPath())){
+ continue;
+ }
+ if(RandomUtils.nextFloat(0, 100) > chance){
+ continue;
+ }
+ fs.delete(status.getPath());
+ LOG.info("Deleting {}", status.getPath());
+ }
+ LOG.info("Done deleting data files");
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java
new file mode 100644
index 0000000..f3d54f1
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DuplicatePackagesCommandAction.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Duplicate network packages on a random regionserver.
+ */
+public class DuplicatePackagesCommandAction extends TCCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(DuplicatePackagesCommandAction.class);
+ private float ratio;
+ private long duration;
+
+ /**
+ * Duplicate network packages on a random regionserver.
+ *
+ * @param ratio the ratio of packages duplicated
+ * @param duration the time this issue persists in milliseconds
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ * @param network network interface the regionserver uses for communication
+ */
+ public DuplicatePackagesCommandAction(float ratio, long duration, long timeout, String network) {
+ super(timeout, network);
+ this.ratio = ratio;
+ this.duration = duration;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute DuplicatePackagesCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+ }
+
+ LOG.info("Finished to execute DuplicatePackagesCommandAction");
+ }
+
+ private String getCommand(String operation){
+ return String.format("tc qdisc %s dev %s root netem duplicate %s%%", operation, network,
+ ratio * 100);
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java
new file mode 100644
index 0000000..b7af31f
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FillDiskCommandAction.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Fill the disk on a random regionserver.
+ */
+public class FillDiskCommandAction extends SudoCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(FillDiskCommandAction.class);
+ private long size;
+ private long duration;
+ private String path;
+
+ /**
+ * Fill the disk on a random regionserver.
+ * Please note that the file will be created regardless of the set duration or timeout.
+ * Use a timeout and duration large enough to avoid complications caused by retries.
+ *
+ * @param size size of the generated file in MB, or 0 to fill the disk
+ * @param duration the time this issue persists in milliseconds
+ * @param path the path to the generated file
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ */
+ public FillDiskCommandAction(long size, long duration, String path, long timeout) {
+ super(timeout);
+ this.size = size;
+ this.duration = duration;
+ this.path = path;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute FillDiskCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getFillCommand());
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getClearCommand());
+ }
+
+ LOG.info("Finished to execute FillDiskCommandAction");
+ }
+
+ private String getFillCommand(){
+ if (size == 0){
+ return String.format("dd if=/dev/urandom of=%s/garbage bs=1M iflag=fullblock", path);
+ }
+ return String.format("dd if=/dev/urandom of=%s/garbage bs=1M count=%s iflag=fullblock",
+ path, size);
+ }
+
+ private String getClearCommand(){
+ return String.format("rm -f %s/garbage", path);
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java
new file mode 100644
index 0000000..e44cac7
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/LosePackagesCommandAction.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Lose network packages on a random regionserver.
+ */
+public class LosePackagesCommandAction extends TCCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(LosePackagesCommandAction.class);
+ private float ratio;
+ private long duration;
+
+ /**
+ * Lose network packages on a random regionserver.
+ *
+ * @param ratio the ratio of packages lost
+ * @param duration the time this issue persists in milliseconds
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ * @param network network interface the regionserver uses for communication
+ */
+ public LosePackagesCommandAction(float ratio, long duration, long timeout, String network) {
+ super(timeout, network);
+ this.ratio = ratio;
+ this.duration = duration;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute LosePackagesCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+ }
+
+ LOG.info("Finished to execute LosePackagesCommandAction");
+ }
+
+ private String getCommand(String operation){
+ return String.format("tc qdisc %s dev %s root netem loss %s%%", operation, network,
+ ratio * 100);
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java
new file mode 100644
index 0000000..c1f196e
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ReorderPackagesCommandAction.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ * Reorder network packages on a random regionserver.
+ */
+public class ReorderPackagesCommandAction extends TCCommandAction {
+ private static final Logger LOG = LoggerFactory.getLogger(ReorderPackagesCommandAction.class);
+ private float ratio;
+ private long duration;
+ private long delay;
+
+ /**
+ * Reorder network packages on a random regionserver.
+ *
+ * @param ratio the ratio of packages reordered
+ * @param duration the time this issue persists in milliseconds
+ * @param delay the delay between reordered and non-reordered packages in milliseconds
+ * @param timeout the timeout for executing required commands on the region server in milliseconds
+ * @param network network interface the regionserver uses for communication
+ */
+ public ReorderPackagesCommandAction(float ratio, long duration, long delay, long timeout,
+ String network) {
+ super(timeout, network);
+ this.ratio = ratio;
+ this.duration = duration;
+ this.delay = delay;
+ }
+
+ protected void localPerform() throws IOException {
+ LOG.info("Starting to execute ReorderPackagesCommandAction");
+ ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getCurrentServers());
+ String hostname = server.getHostname();
+
+ try {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(ADD));
+ Thread.sleep(duration);
+ } catch (InterruptedException e) {
+ LOG.debug("Failed to run the command for the full duration", e);
+ } finally {
+ clusterManager.execSudoWithRetries(hostname, timeout, getCommand(DELETE));
+ }
+
+ LOG.info("Finished to execute ReorderPackagesCommandAction");
+ }
+
+ private String getCommand(String operation){
+ return String.format("tc qdisc %s dev %s root netem delay %sms reorder %s%% 50%",
+ operation, network, delay, ratio * 100);
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java
new file mode 100644
index 0000000..6092a5d
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/SudoCommandAction.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.DistributedHBaseCluster;
+import org.apache.hadoop.hbase.HBaseCluster;
+import org.apache.hadoop.hbase.HBaseClusterManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Base class for performing Actions based on Linux commands requiring sudo privileges
+ */
+abstract public class SudoCommandAction extends Action {
+ private static final Logger LOG = LoggerFactory.getLogger(SudoCommandAction.class);
+
+ protected long timeout;
+ protected HBaseClusterManager clusterManager;
+
+ public SudoCommandAction(long timeout) {
+ this.timeout = timeout;
+ }
+
+ @Override
+ public void init(ActionContext context) throws IOException {
+ super.init(context);
+ HBaseCluster cluster = context.getHBaseCluster();
+ if(cluster != null && cluster instanceof DistributedHBaseCluster){
+ Object manager = ((DistributedHBaseCluster)cluster).getClusterManager();
+ if(manager != null && manager instanceof HBaseClusterManager){
+ clusterManager = (HBaseClusterManager) manager;
+ }
+ }
+ }
+
+ @Override
+ public void perform() throws Exception {
+ if(clusterManager == null){
+ LOG.info("Couldn't perform command action, it requires a distributed cluster.");
+ return;
+ }
+
+ // Don't try the modify if we're stopping
+ if (context.isStopping()) {
+ return;
+ }
+
+ localPerform();
+ }
+
+ abstract protected void localPerform() throws IOException;
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java
new file mode 100644
index 0000000..9444f87
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/TCCommandAction.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+/**
+ * Base class for tc command actions
+ */
+abstract public class TCCommandAction extends SudoCommandAction {
+ protected static final String ADD = "add";
+ protected static final String DELETE = "del";
+ protected String network;
+
+ public TCCommandAction(long timeout, String network) {
+ super(timeout);
+ this.network = network;
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java
new file mode 100644
index 0000000..a06a977
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DataIssuesMonkeyFactory.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.factories;
+
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.CorruptDataFilesAction;
+import org.apache.hadoop.hbase.chaos.actions.DeleteDataFilesAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
+
+/**
+ * A chaos monkey to delete and corrupt regionserver data, requires a user with
+ * passwordless ssh access to the cluster and sudo privileges.
+ * Highly destructive
+ */
+public class DataIssuesMonkeyFactory extends MonkeyFactory {
+
+ private long action1Period;
+ private long action2Period;
+
+ private float chanceToAct;
+
+ @Override
+ public ChaosMonkey build() {
+ loadProperties();
+
+ // Highly destructive actions to mess things around.
+ Action[] actions1 = new Action[] {
+ new DeleteDataFilesAction(chanceToAct),
+ new CorruptDataFilesAction(chanceToAct)
+ };
+
+ // Action to log more info for debugging
+ Action[] actions2 = new Action[] {
+ new DumpClusterStatusAction()
+ };
+
+ return new PolicyBasedChaosMonkey(util,
+ new PeriodicRandomActionPolicy(action1Period, actions1),
+ new PeriodicRandomActionPolicy(action2Period, actions2));
+ }
+
+ private void loadProperties() {
+ action1Period = Long.parseLong(this.properties.getProperty(
+ MonkeyConstants.PERIODIC_ACTION1_PERIOD,
+ MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
+ action2Period = Long.parseLong(this.properties.getProperty(
+ MonkeyConstants.PERIODIC_ACTION2_PERIOD,
+ MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD + ""));
+ chanceToAct = Float.parseFloat(this.properties.getProperty(
+ MonkeyConstants.DATA_ISSUE_CHANCE,
+ MonkeyConstants.DEFAULT_DATA_ISSUE_CHANCE+ ""));
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java
new file mode 100644
index 0000000..745f1b9
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/DistributedIssuesMonkeyFactory.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.factories;
+
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.AddCPULoadAction;
+import org.apache.hadoop.hbase.chaos.actions.CorruptPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.DelayPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.DuplicatePackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.FillDiskCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.LosePackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.actions.ReorderPackagesCommandAction;
+import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
+
+/**
+ * A chaos monkey to create distributed cluster related issues, requires a user with
+ * passwordless ssh access to the cluster and sudo privileges.
+ */
+public class DistributedIssuesMonkeyFactory extends MonkeyFactory {
+
+ private long action1Period;
+ private long action2Period;
+
+ private long cpuLoadDuration;
+ private long cpuLoadProcesses;
+ private long networkIssueTimeout;
+ private long networkIssueDuration;
+ private float networkIssueRation;
+ private long networkIssueDelay;
+ private String networkIssueInterface;
+ private long fillDiskTimeout;
+ private String fillDiskPath;
+ private long fillDiskFileSize;
+ private long fillDiskIssueduration;
+
+ @Override public ChaosMonkey build() {
+ loadProperties();
+
+ Action[] actions1 = new Action[] {
+ new AddCPULoadAction(cpuLoadDuration, cpuLoadProcesses, networkIssueTimeout),
+ new CorruptPackagesCommandAction(networkIssueRation, networkIssueDuration,
+ networkIssueTimeout, networkIssueInterface),
+ new DuplicatePackagesCommandAction(networkIssueRation, networkIssueDuration,
+ networkIssueTimeout, networkIssueInterface),
+ new LosePackagesCommandAction(networkIssueRation, networkIssueDuration,
+ networkIssueTimeout, networkIssueInterface),
+ new DelayPackagesCommandAction(networkIssueDelay, networkIssueDuration,
+ networkIssueTimeout, networkIssueInterface),
+ new ReorderPackagesCommandAction(networkIssueRation, networkIssueDuration,
+ networkIssueDelay, networkIssueTimeout, networkIssueInterface),
+ new FillDiskCommandAction(fillDiskFileSize, fillDiskIssueduration, fillDiskPath,
+ fillDiskTimeout)};
+
+ // Action to log more info for debugging
+ Action[] actions2 = new Action[] {new DumpClusterStatusAction()};
+
+ return new PolicyBasedChaosMonkey(util, new PeriodicRandomActionPolicy(action1Period, actions1),
+ new PeriodicRandomActionPolicy(action2Period, actions2));
+ }
+
+ private void loadProperties() {
+ action1Period = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
+ MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
+ action2Period = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.PERIODIC_ACTION2_PERIOD,
+ MonkeyConstants.DEFAULT_PERIODIC_ACTION2_PERIOD + ""));
+ cpuLoadDuration = Long.parseLong(this.properties.getProperty(
+ MonkeyConstants.CPU_LOAD_DURATION,
+ MonkeyConstants.DEFAULT_CPU_LOAD_DURATION + ""));
+ cpuLoadProcesses = Long.parseLong(this.properties.getProperty(
+ MonkeyConstants.CPU_LOAD_PROCESSES,
+ MonkeyConstants.DEFAULT_CPU_LOAD_PROCESSES + ""));
+ networkIssueTimeout = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.NETWORK_ISSUE_COMMAND_TIMEOUT,
+ MonkeyConstants.DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT + ""));
+ networkIssueDuration = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.NETWORK_ISSUE_DURATION,
+ MonkeyConstants.DEFAULT_NETWORK_ISSUE_DURATION + ""));
+ networkIssueRation = Float.parseFloat(this.properties
+ .getProperty(MonkeyConstants.NETWORK_ISSUE_RATIO,
+ MonkeyConstants.DEFAULT_NETWORK_ISSUE_RATIO + ""));
+ networkIssueDelay = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.NETWORK_ISSUE_DELAY,
+ MonkeyConstants.DEFAULT_NETWORK_ISSUE_DELAY + ""));
+ networkIssueInterface = this.properties
+ .getProperty(MonkeyConstants.NETWORK_ISSUE_INTERFACE,
+ MonkeyConstants.DEFAULT_NETWORK_ISSUE_INTERFACE + "");
+ fillDiskTimeout = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.FILL_DISK_COMMAND_TIMEOUT,
+ MonkeyConstants.DEFAULT_FILL_DISK_COMMAND_TIMEOUT + ""));
+ fillDiskPath = this.properties
+ .getProperty(MonkeyConstants.FILL_DISK_PATH,
+ MonkeyConstants.DEFAULT_FILL_DISK_PATH + "");
+ fillDiskFileSize = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.FILL_DISK_FILE_SIZE,
+ MonkeyConstants.DEFAULT_FILL_DISK_FILE_SIZE + ""));
+ fillDiskIssueduration = Long.parseLong(this.properties
+ .getProperty(MonkeyConstants.FILL_DISK_ISSUE_DURATION,
+ MonkeyConstants.DEFAULT_FILL_DISK_ISSUE_DURATION + ""));
+ }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
index 9051e98..f4c34b5 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyConstants.java
@@ -48,13 +48,26 @@ public interface MonkeyConstants {
String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
+ String CPU_LOAD_DURATION = "cpu.load.duration";
+ String CPU_LOAD_PROCESSES = "cpu.load.processes";
+ String NETWORK_ISSUE_COMMAND_TIMEOUT = "network.issue.command.timeout";
+ String NETWORK_ISSUE_DURATION = "network.issue.duration";
+ String NETWORK_ISSUE_RATIO = "network.issue.ratio";
+ String NETWORK_ISSUE_DELAY = "network.issue.delay";
+ String NETWORK_ISSUE_INTERFACE = "network.issue.interface";
+ //should be big enough to create the file
+ String FILL_DISK_COMMAND_TIMEOUT = "fill.disk.command.timeout";
+ String FILL_DISK_PATH = "fill.disk.path";
+ String FILL_DISK_FILE_SIZE = "fill.disk.file.size";
+ String FILL_DISK_ISSUE_DURATION = "fill.disk.issue.duration";
+ String DATA_ISSUE_CHANCE = "data.issue.chance";
/**
* A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky.
*/
Set<String> MONKEY_CONFIGURATION_KEY_PREFIXES = new HashSet<>(
- Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.",
- "unbalance.", "decrease."));
+ Arrays.asList("sdm.", "move.", "restart.", "batch.", "rolling.", "compact.", "unbalance.",
+ "decrease.", "decrease.", "graceful.", "cpu.", "network.", "fill.", "data."));
long DEFAULT_PERIODIC_ACTION1_PERIOD = 60 * 1000;
long DEFAULT_PERIODIC_ACTION2_PERIOD = 90 * 1000;
@@ -81,4 +94,16 @@ public interface MonkeyConstants {
long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
+ long DEFAULT_CPU_LOAD_DURATION = 5 * 60 * 1000;
+ long DEFAULT_CPU_LOAD_PROCESSES = 2;
+ long DEFAULT_NETWORK_ISSUE_COMMAND_TIMEOUT = 30 * 1000;
+ long DEFAULT_NETWORK_ISSUE_DURATION = 60 * 1000;
+ float DEFAULT_NETWORK_ISSUE_RATIO = 0.1f;
+ long DEFAULT_NETWORK_ISSUE_DELAY = 100;
+ String DEFAULT_NETWORK_ISSUE_INTERFACE = "eth0";
+ long DEFAULT_FILL_DISK_COMMAND_TIMEOUT = 5 * 60 * 1000 + 30 * 1000;//duration + timeout
+ String DEFAULT_FILL_DISK_PATH = "/tmp";
+ long DEFAULT_FILL_DISK_FILE_SIZE = 0;
+ long DEFAULT_FILL_DISK_ISSUE_DURATION = 5 * 60 * 1000;
+ float DEFAULT_DATA_ISSUE_CHANCE = 0.01f;
}
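A minimal sketch of supplying the new keys through monkey properties; the values shown simply
repeat the defaults above and are illustrative only:

    // Illustrative only: tuning the new chaos actions via monkey properties.
    Properties monkeyProps = new Properties();
    monkeyProps.setProperty("cpu.load.duration", "300000");      // 5 minutes
    monkeyProps.setProperty("cpu.load.processes", "2");
    monkeyProps.setProperty("network.issue.ratio", "0.1");
    monkeyProps.setProperty("network.issue.interface", "eth0");
    monkeyProps.setProperty("fill.disk.path", "/tmp");
    monkeyProps.setProperty("data.issue.chance", "0.01");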
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
index f4492b3..73f6968 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
@@ -77,6 +77,8 @@ public abstract class MonkeyFactory {
public static final String MOB_NO_KILL = "mobNoKill";
public static final String MOB_SLOW_DETERMINISTIC = "mobSlowDeterministic";
public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling";
+ public static final String DISTRIBUTED_ISSUES = "distributedIssues";
+ public static final String DATA_ISSUES = "dataIssues";
public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String,MonkeyFactory>builder()
.put(CALM, new CalmMonkeyFactory())
@@ -89,6 +91,8 @@ public abstract class MonkeyFactory {
.put(MOB_NO_KILL, new MobNoKillMonkeyFactory())
.put(MOB_SLOW_DETERMINISTIC, new MobNoKillMonkeyFactory())
.put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory())
+ .put(DISTRIBUTED_ISSUES, new DistributedIssuesMonkeyFactory())
+ .put(DATA_ISSUES, new DataIssuesMonkeyFactory())
.build();
public static MonkeyFactory getFactory(String factoryName) {
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
index 2e763ad..5cb2d7f 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
@@ -78,10 +78,10 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
}
}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
index 68d11f9..3f2edcc 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerKillingMonkeyFactory.java
@@ -74,10 +74,10 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory {
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
}
}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
index f29c2a6..cbd492e 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java
@@ -194,10 +194,10 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
}
}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
index 4e304fb..b25bef7 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/StressAssignmentManagerMonkeyFactory.java
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- * <p>
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- * <p>
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -105,10 +105,10 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME+ ""));
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
- MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
- MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
+ MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
+ MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
}
}