You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by co...@apache.org on 2014/02/04 00:28:11 UTC

[2/2] git commit: BIGTOP-1192. Add utilities to facilitate cluster failure testing into bigtop-test-framework

BIGTOP-1192.  Add utilities to facilitate cluster failure testing into bigtop-test-framework


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/40b39855
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/40b39855
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/40b39855

Branch: refs/heads/master
Commit: 40b398553ff4e00b68548e219aa58c3a8fec2232
Parents: 553a6ca
Author: Mikhail Antonov <ol...@gmail.com>
Authored: Mon Feb 3 15:19:21 2014 -0800
Committer: Konstantin Boudnik <co...@apache.org>
Committed: Mon Feb 3 15:19:21 2014 -0800

----------------------------------------------------------------------
 bigtop-test-framework/README                    |  40 +++++
 .../itest/failures/AbstractFailure.groovy       | 179 +++++++++++++++++++
 .../itest/failures/FailureConstants.groovy      |  38 ++++
 .../failures/NetworkShutdownFailure.groovy      |  78 ++++++++
 .../itest/failures/ServiceKilledFailure.groovy  |  69 +++++++
 .../itest/failures/ServiceRestartFailure.groovy |  69 +++++++
 .../itest/failures/ClusterFailuresTest.groovy   | 153 ++++++++++++++++
 7 files changed, 626 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/README
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/README b/bigtop-test-framework/README
index 009972c..0980a6a 100644
--- a/bigtop-test-framework/README
+++ b/bigtop-test-framework/README
@@ -25,3 +25,43 @@ tasks via specific adapters:
 This project contains the experimental foundation of JarRunner and a
 meta-service utilizing it to perform tests execution. These two components
 might be joined together later.
+
+New cluster failures injection capabilities for smoke tests (see BIGTOP-1192 for details)
+imposed certain additional requirements. Smoke tests running cluster failures against real clusters
+expect the following preconditions to be satisfied:
+  - For all hosts in the cluster being smoke-tested there's a dedicated user(like "jenkins" or "bigtop"),
+    which has password-less SSH access to those hosts and permissions to execute certain sudo commands
+    (see below) without password
+  - 2 new environment variables are set:
+      * BIGTOP_SMOKES_USER should contain username of user which will be used to run SSH commands
+      * BIGTOP_SMOKES_CLUSTER_IDENTITY_FILE should point to a file with private key for password-less SSH.
+
+To be able to run new itest framework internal test (see ClusterFailuresTest.groovy), the following change
+is needed on the machine with Bigtop workspace:
+ - if your current user doesn't have password-less sudo, then run 'visudo' and:
+     * add line in /etc/sudoers at the end of file like: (for Redhat):
+       myusername  localhost = NOPASSWD:/usr/sbin/service,/usr/bin/pkill,/usr/sbin/iptables
+
+To be able to run real module smoke tests against a real cluster, more complex setup is needed
+on the machine with Bigtop workspace:
+ - make sure sshd is running on all nodes in the cluster
+ - check the following in /etc/ssh/sshd_config:
+     * PubkeyAuthentication yes
+     * PasswordAuthentication yes
+ - add new user, for example, "bigtop", on local machine AND on each cluster node, set some password,
+   make sure default shell is set
+ - on local machine, su bigtop, do ssh-keygen with empty passphrase
+ - run 'ssh-copy-id bigtop@<each cluster node>'
+ - log back in on your local machine as your regular user which you use to work with your workspace, copy the generated private
+   key for bigtop user somewhere, do chown and make sure it has right permissions (like 600)
+ - export BIGTOP_SMOKES_CLUSTER_IDENTITY_FILE=/full/path/to/private/key
+ - export BIGTOP_SMOKES_USER=bigtop
+ - on each remote node, run 'visudo' and
+     * add line in /etc/sudoers at the end of file like: (for Redhat):
+       bigtop  localhost = NOPASSWD:/usr/sbin/service,/usr/bin/pkill,/usr/sbin/iptables
+     * Comment out line 'Defaults    requiretty', otherwise sudo may complain like
+       "Sorry, you must have a tty to run sudo"
+
+  - run following sample commands from your local machine to verify your setup:
+      * ssh -i /test_bigtop_ssh_key bigtop@<some cluster node> sudo service crond stop
+      * ssh -i /test_bigtop_ssh_key bigtop@<some cluster node> sudo service crond start

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/AbstractFailure.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/AbstractFailure.groovy b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/AbstractFailure.groovy
new file mode 100644
index 0000000..d4f117d
--- /dev/null
+++ b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/AbstractFailure.groovy
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+import org.apache.bigtop.itest.shell.Shell
+import static org.apache.bigtop.itest.LogErrorsUtils.logError
+import static org.apache.bigtop.itest.failures.FailureConstants.PRIVATE_KEY_PATH_ENV_VAR
+import static org.apache.bigtop.itest.failures.FailureConstants.BIGTOP_SMOKES_USER
+
+/**
+ * Abstract class to be subclassed by cluster failures classes of various types:
+ *  - service restart
+ *  - service being killed (kill -9)
+ *  - network shutdown (iptables-based drop).
+ *
+ * Provides means to:
+ *  - run set of "failure" commands against the specified list of hosts
+ *  - restore the correct state.
+ *
+ *  Please see examples of usage in test class ClusterFailuresTest.
+ *
+ *  WARNING:
+ *   - password-less (PKI-based) SSH for user specified in env variable BIGTOP_SMOKES_USER
+ *     to all nodes in cluster being tested is assumed
+ *   - for local tests, like ClusterFailuresTest, this SSH should be setup for localhost
+ *   - env variable BIGTOP_SMOKES_CLUSTER_IDENTITY_FILE should point to the corresponding private key file.
+ */
+public abstract class AbstractFailure implements Runnable {
+  protected static Shell rootShell = new Shell("/bin/bash", "root") // NOTE: one shell shared by all failures
+
+  /**
+   * Used to wrap actual command to be executed over SSH, if running in distributed setup.
+   * First substitution param is path to SSH private key, second - remote server username,
+   * third - remote server host address, fourth - actual command being wrapped.
+   */
+  protected static String SSH_COMMAND_WRAPPER = "ssh -i %s -o StrictHostKeyChecking=no %s@%s '%s'"
+
+  /**
+   * List of hosts to run fail/restore commands against.
+   */
+  protected List<String> hosts = []
+
+  /**
+   * List of failing commands, defined by a subclass, executed in the given sequence.
+   */
+  protected List<String> failCommands = []
+
+  /**
+   * List of restore commands, defined by a subclass, executed in the given sequence.
+   */
+  protected List<String> restoreCommands = []
+
+  /**
+   * How long (in milliseconds) shall we wait before executing first failure.
+   */
+  protected long startDelay = 0
+
+  /**
+   * How long (in milliseconds) the failure thread sleeps between checks whether it was interrupted.
+   */
+  private static final long SLEEP_TIME = 100
+
+  /**
+   * Simple constructor for failures, uses default values.
+   * @param hosts list of hosts this failure will be executed on.
+   */
+  public AbstractFailure(List<String> hosts) {
+    this.hosts = hosts
+  }
+
+  /**
+   * Constructor allowing to set all params.
+   *
+   * @param hosts list of hosts the failure will be running against
+   * @param startDelay how long (in millisecs) failure will wait before starting
+   */
+  public AbstractFailure(List<String> hosts, long startDelay) {
+    this.hosts = hosts
+    this.startDelay = startDelay
+  }
+
+  /**
+   * Runs failure/restore commands in a separate thread; interrupting that thread ends the failure.
+   */
+  @Override
+  public void run() {
+    boolean failureStarted = false
+    try {
+      if (startDelay > 0) {
+        Thread.sleep(startDelay)
+      }
+      failureStarted = true
+      runFailCommands()
+      while (!Thread.currentThread().isInterrupted()) {
+        Thread.sleep(SLEEP_TIME)
+      }
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt()
+    } finally {
+      // Restore with the interrupt flag cleared (re-asserted below); skip it if no fail command ran.
+      boolean interrupted = Thread.interrupted()
+      if (failureStarted) {
+        runRestoreCommands()
+      }
+      if (interrupted) {
+        Thread.currentThread().interrupt()
+      }
+    }
+  }
+
+  // Runs every restore command, even after one of them failed, and only then reports the failures.
+  private void runRestoreCommands() {
+    List<String> failed = restoreCommands.findAll {
+      rootShell.exec(it)
+      logError(rootShell)
+      rootShell.getRet() != 0
+    }
+    assert failed.isEmpty(), "Restore command(s) $failed have returned non-0 error code:"
+  }
+
+  // Runs the fail commands in order. Exit codes are deliberately not checked:
+  // some commands, like pkill over ssh, return 137. It's ok.
+  private void runFailCommands() {
+    failCommands.each {
+      rootShell.exec(it)
+      logError(rootShell)
+    }
+  }
+
+  /**
+   * Reads the full path to private key file from env. variable PRIVATE_KEY_PATH_ENV_VAR.
+   * @return full path to file with private key for SSH connections to cluster.
+   */
+  protected String getIdentityFile() {
+    String identityFile = System.getenv(PRIVATE_KEY_PATH_ENV_VAR)
+    assert identityFile, "Env variable $PRIVATE_KEY_PATH_ENV_VAR is not set:"
+    return identityFile
+  }
+
+  /**
+   * Reads the username used for ssh commands from env. variable BIGTOP_SMOKES_USER.
+   * @return user which will be used to run SSH command on target hosts
+   */
+  protected String getSshUser() {
+    String sshUser = System.getenv(BIGTOP_SMOKES_USER)
+    assert sshUser, "Env variable $BIGTOP_SMOKES_USER is not set:"
+    return sshUser
+  }
+
+  /**
+   * If tests are running in distributed mode, i.e. not itest framework tests,
+   * but real cluster smoke tests, wrapping failure command to go over SSH to node on the cluster.
+   *
+   * @param formattedCommand actual failure command to be executed on the remote node
+   * @param host remote node to run command on
+   * @return full command to be executed in the local shell
+   */
+  protected String getSshWrappedCommand(String formattedCommand, String host) {
+    // The wrapper single-quotes the command, so embedded single quotes are escaped as '\''.
+    String quotedCommand = formattedCommand.replace("'", "'\\''")
+    return String.format(SSH_COMMAND_WRAPPER, getIdentityFile(), getSshUser(), host, quotedCommand)
+  }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/FailureConstants.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/FailureConstants.groovy b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/FailureConstants.groovy
new file mode 100644
index 0000000..0c24681
--- /dev/null
+++ b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/FailureConstants.groovy
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+/**
+ * Constants for cluster failure smoke tests.
+ */
+public final class FailureConstants {
+
+  /**
+   * Name of the env variable which should contain the full local path to the file with the
+   * SSH private key used to remotely login on cluster nodes without password.
+   */
+  public static final String PRIVATE_KEY_PATH_ENV_VAR = "BIGTOP_SMOKES_CLUSTER_IDENTITY_FILE"
+
+  /**
+   * Name of the env variable which should contain the Linux user on the hosts where failures
+   * are running; this user should have password-less SSH enabled and privileges to run
+   * password-less sudo commands: service stop/start, pkill -9, iptables rules editing.
+   */
+  public static final String BIGTOP_SMOKES_USER = "BIGTOP_SMOKES_USER"
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/NetworkShutdownFailure.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/NetworkShutdownFailure.groovy b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/NetworkShutdownFailure.groovy
new file mode 100644
index 0000000..15bf797
--- /dev/null
+++ b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/NetworkShutdownFailure.groovy
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+/**
+ * Can shutdown network connections between specified hosts during tests execution.
+ */
+public class NetworkShutdownFailure extends AbstractFailure {
+
+  private static final String DROP_INPUT_CONNECTIONS = "sudo iptables -A INPUT -s %s -j DROP"
+  private static final String DROP_OUTPUT_CONNECTIONS = "sudo iptables -A OUTPUT -d %s -j DROP"
+  private static final String RESTORE_INPUT_CONNECTIONS = "sudo iptables -D INPUT -s %s -j DROP"
+  private static final String RESTORE_OUTPUT_CONNECTIONS = "sudo iptables -D OUTPUT -d %s -j DROP"
+
+  /**
+   * Prepares a failure which cuts the network connections between the given hosts.
+   *
+   * @param srcHost host whose connections will be cut
+   * @param dstHosts hosts whose traffic to and from srcHost will be dropped.
+   */
+  public NetworkShutdownFailure(String srcHost, List<String> dstHosts) {
+    super(new ArrayList<String>())
+    populateCommandsList(srcHost, dstHosts)
+  }
+
+  /**
+   * Prepares a failure which cuts the network connections between the given hosts
+   * and lets the caller delay the moment it starts.
+   *
+   * @param srcHost host whose connections will be cut
+   * @param dstHosts hosts whose traffic to and from srcHost will be dropped
+   * @param startDelay time (in milliseconds) the failure will wait before it starts
+   */
+  public NetworkShutdownFailure(String srcHost,
+                               List<String> dstHosts,
+                               long startDelay) {
+
+    super(new ArrayList<String>(), startDelay)
+    populateCommandsList(srcHost, dstHosts)
+  }
+
+  /*
+   * Fills the fail/restore command lists: for each destination host, iptables rules dropping
+   * its inbound and outbound traffic, plus the matching rule deletions used for restoring.
+   */
+  private void populateCommandsList(String host, List<String> dstHosts){
+    boolean runLocally = "localhost".equalsIgnoreCase(host)
+
+    // Commands for "localhost" run as they are, all others are sent to the host over SSH.
+    def prepare = { String template, peer ->
+      String command = String.format(template, peer)
+      runLocally ? command : getSshWrappedCommand(command, host)
+    }
+
+    dstHosts.each { dstHost ->
+      failCommands << prepare(DROP_INPUT_CONNECTIONS, dstHost)
+      failCommands << prepare(DROP_OUTPUT_CONNECTIONS, dstHost)
+      restoreCommands << prepare(RESTORE_INPUT_CONNECTIONS, dstHost)
+      restoreCommands << prepare(RESTORE_OUTPUT_CONNECTIONS, dstHost)
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceKilledFailure.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceKilledFailure.groovy b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceKilledFailure.groovy
new file mode 100644
index 0000000..413f171
--- /dev/null
+++ b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceKilledFailure.groovy
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+/**
+ * Can kill (with kill -9) specified service on specified hosts during tests run.
+ */
+public class ServiceKilledFailure extends AbstractFailure {
+
+  private static final String KILL_SERVICE_TEMPLATE = "sudo pkill -9 -f %s"
+  private static final String START_SERVICE_TEMPLATE = "sudo service %s start"
+
+  /**
+   * Prepares a failure which kills the given service on the given hosts.
+   *
+   * @param hosts list of hosts on which specified service will be killed
+   * @param serviceName name of service to be killed.
+   */
+  public ServiceKilledFailure(List<String> hosts, String serviceName) {
+    super(hosts)
+    populateCommandsList(hosts, serviceName)
+  }
+
+  /**
+   * Prepares a failure which kills the given service on the given hosts after a delay.
+   *
+   * @param hosts list of hosts on which specified service will be killed
+   * @param serviceName name of service to be killed
+   * @param startDelay time (in milliseconds) the failure will wait before it starts
+   */
+  public ServiceKilledFailure(List<String> hosts,
+                              String serviceName,
+                              long startDelay) {
+
+    super(hosts, startDelay)
+    populateCommandsList(hosts, serviceName)
+  }
+
+  /*
+   * Fills the command lists: killing the service is the failure, starting it again the restore.
+   * The commands run directly only when "localhost" is the single host, otherwise over SSH.
+   */
+  private void populateCommandsList(List<String> hosts, String serviceName){
+    String killCommand = String.format(KILL_SERVICE_TEMPLATE, serviceName)
+    String startCommand = String.format(START_SERVICE_TEMPLATE, serviceName)
+    boolean localOnly = hosts.size() == 1 && "localhost".equalsIgnoreCase(hosts[0])
+
+    hosts.each { host ->
+      failCommands << (localOnly ? killCommand : getSshWrappedCommand(killCommand, host))
+      restoreCommands << (localOnly ? startCommand : getSshWrappedCommand(startCommand, host))
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceRestartFailure.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceRestartFailure.groovy b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceRestartFailure.groovy
new file mode 100644
index 0000000..6dd1005
--- /dev/null
+++ b/bigtop-test-framework/src/main/groovy/org/apache/bigtop/itest/failures/ServiceRestartFailure.groovy
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+/**
+ * Can restart specified services on specified hosts during tests execution.
+ */
+public class ServiceRestartFailure extends AbstractFailure {
+
+  private static final String STOP_SERVICE_TEMPLATE = "sudo service %s stop"
+  private static final String START_SERVICE_TEMPLATE = "sudo service %s start"
+
+  /**
+   * Prepares a failure which restarts the given service on the given hosts.
+   *
+   * @param hosts list of hosts on which specified service will be restarted
+   * @param serviceName name of service to be restarted.
+   */
+  public ServiceRestartFailure(List<String> hosts, String serviceName) {
+    super(hosts)
+    populateCommandsList(hosts, serviceName)
+  }
+
+  /**
+   * Prepares a failure which gracefully restarts the given service on the given hosts after a delay.
+   *
+   * @param hosts list of hosts on which specified service will be restarted
+   * @param serviceName name of service to be restarted
+   * @param startDelay time (in milliseconds) the failure will wait before it starts
+   */
+  public ServiceRestartFailure(List<String> hosts,
+                               String serviceName,
+                               long startDelay) {
+
+    super(hosts, startDelay)
+    populateCommandsList(hosts, serviceName)
+  }
+
+  /*
+   * Fills the command lists: stopping the service is the failure, starting it again the restore.
+   * The commands run directly only when "localhost" is the single host, otherwise over SSH.
+   */
+  private void populateCommandsList(List<String> hosts, String serviceName){
+    String stopCommand = String.format(STOP_SERVICE_TEMPLATE, serviceName)
+    String startCommand = String.format(START_SERVICE_TEMPLATE, serviceName)
+    boolean localOnly = hosts.size() == 1 && "localhost".equalsIgnoreCase(hosts[0])
+
+    hosts.each { host ->
+      failCommands << (localOnly ? stopCommand : getSshWrappedCommand(stopCommand, host))
+      restoreCommands << (localOnly ? startCommand : getSshWrappedCommand(startCommand, host))
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/40b39855/bigtop-test-framework/src/test/groovy/org/apache/bigtop/itest/failures/ClusterFailuresTest.groovy
----------------------------------------------------------------------
diff --git a/bigtop-test-framework/src/test/groovy/org/apache/bigtop/itest/failures/ClusterFailuresTest.groovy b/bigtop-test-framework/src/test/groovy/org/apache/bigtop/itest/failures/ClusterFailuresTest.groovy
new file mode 100644
index 0000000..af6b36b
--- /dev/null
+++ b/bigtop-test-framework/src/test/groovy/org/apache/bigtop/itest/failures/ClusterFailuresTest.groovy
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.bigtop.itest.failures
+
+import org.apache.bigtop.itest.shell.OS
+import org.junit.Test
+import org.apache.bigtop.itest.shell.Shell
+
+public class ClusterFailuresTest {
+  private Shell rootShell = new Shell("/bin/bash", "root")
+  private final int SLEEP_TIME = 100
+  // Upper bound (in milliseconds) for every polling loop: a failure or restore which never
+  // takes effect then fails the test instead of hanging it forever.
+  private final long MAX_WAIT_TIME = 60000
+  private final String TEST_HOST = "localhost"
+  private final String TEST_REMOTE_HOST = "apache.org"
+  private final String CRON_SERVICE
+
+  {
+    switch (OS.linux_flavor) {
+      case ~/(?is).*(redhat|centos|rhel|fedora|enterpriseenterpriseserver).*/:
+        CRON_SERVICE = "crond"
+        break
+      default:
+        CRON_SERVICE = "cron"
+    }
+  }
+
+  @Test
+  void testServiceRestart() {
+    startCron()
+    assert isCronRunning(), "$CRON_SERVICE service isn't running before the test:"
+
+    def cronKilled = new ServiceRestartFailure([TEST_HOST], "$CRON_SERVICE")
+    Thread t = new Thread(cronKilled)
+    t.start()
+
+    try {
+      waitWhile("$CRON_SERVICE it still running") { isCronRunning() }
+      assert !isCronRunning(), "$CRON_SERVICE hasn't been stopped as expected:"
+      println "$CRON_SERVICE stopped. Good."
+    } finally {
+      stopFailure(t)
+    }
+
+    waitWhile("$CRON_SERVICE it still stopped..") { !isCronRunning() }
+    assert isCronRunning(), "$CRON_SERVICE hasn't been restarted after the test:"
+    println "$CRON_SERVICE is up. Good"
+  }
+
+  @Test
+  void testServiceKilled() {
+    // On Ubuntu services like cron or ssh get restarted automatically if killed,
+    // so for now disabling this test for Ubuntu users.
+    if (OS.linux_flavor ==~ /(?is).*(ubuntu|debian).*/) {
+      println "As you're running on $OS.linux_flavor, testServiceKilled() doesn't run for you."
+      return
+    }
+
+    startCron()
+    assert isCronRunning(), "$CRON_SERVICE service isn't running before the test:"
+
+    def cronKilled = new ServiceKilledFailure([TEST_HOST], "$CRON_SERVICE")
+    Thread t = new Thread(cronKilled)
+    t.start()
+
+    try {
+      waitWhile("$CRON_SERVICE it still running..") { isCronRunning() }
+      assert !isCronRunning(), "$CRON_SERVICE hasn't been killed as expected:"
+      println "$CRON_SERVICE killed. Good."
+    } finally {
+      stopFailure(t)
+    }
+
+    waitWhile("$CRON_SERVICE it still killed...") { !isCronRunning() }
+    assert isCronRunning(), "$CRON_SERVICE hasn't been restarted after the test:"
+    println "$CRON_SERVICE is up. Good."
+  }
+
+  @Test
+  void testNetworkShutdown() {
+    //make sure there are no blocking rules
+    rootShell.exec("iptables -D INPUT -s $TEST_REMOTE_HOST -j DROP")
+    rootShell.exec("iptables -D OUTPUT -d $TEST_REMOTE_HOST -j DROP")
+
+    assert isRemoteHostReachable(), "No ping to $TEST_REMOTE_HOST, which is used for network failures test:"
+
+    def networkShutdown = new NetworkShutdownFailure(TEST_HOST, [TEST_REMOTE_HOST])
+    Thread t = new Thread(networkShutdown)
+    t.start()
+
+    try {
+      waitWhile("$TEST_REMOTE_HOST is still reachable...") { isRemoteHostReachable() }
+      assert !isRemoteHostReachable(), "Connection to $TEST_REMOTE_HOST hasn't been killed as expected:"
+      println "$TEST_REMOTE_HOST isn't reachable. Good."
+    } finally {
+      stopFailure(t)
+    }
+
+    waitWhile("$TEST_REMOTE_HOST isn't reachable...") { !isRemoteHostReachable() }
+    assert isRemoteHostReachable(), "Connection to $TEST_REMOTE_HOST hasn't been restored after the test:"
+    println "$TEST_REMOTE_HOST is reachable again. Good."
+  }
+
+  /**
+   * Polls the condition every SLEEP_TIME ms, printing the message on each round, until the
+   * condition turns false or MAX_WAIT_TIME has elapsed. Callers assert the final state themselves.
+   */
+  private void waitWhile(String message, Closure<Boolean> condition) {
+    long deadline = System.currentTimeMillis() + MAX_WAIT_TIME
+    while (condition() && System.currentTimeMillis() < deadline) {
+      println message
+      Thread.sleep(SLEEP_TIME)
+    }
+  }
+
+  /**
+   * Ends the failure by interrupting its thread, then waits (at most MAX_WAIT_TIME ms)
+   * for that thread to finish, i.e. for its restore commands to complete.
+   */
+  private void stopFailure(Thread failureThread) {
+    failureThread.interrupt()
+    failureThread.join(MAX_WAIT_TIME)
+  }
+
+  private boolean isCronRunning() {
+    return rootShell.exec("pgrep $CRON_SERVICE").ret == 0
+  }
+
+  private void startCron() {
+    rootShell.exec("service $CRON_SERVICE start")
+  }
+
+  private boolean isRemoteHostReachable() {
+    return rootShell.exec("ping -qc 1 $TEST_REMOTE_HOST").ret == 0
+  }
+}