You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/12/05 01:58:27 UTC

[7/7] kudu git commit: KUDU-1782. Fault injection crashes should exit with a specific exit code

KUDU-1782. Fault injection crashes should exit with a specific exit code

This changes the fault injection "crash" to use exit(85) instead of
LOG(FATAL). This means that we can now distinguish between a process
exiting due to fault injection compared to a process exiting due to a
crash or real FATAL-inducing issue.

The WaitForCrash method in ExternalMiniCluster::ExternalDaemon is now
split into WaitForInjectedCrash, which specifically checks that the exit
code matches the expected one, and WaitForFatal, which checks that the
exit code matches a SIGABRT. If the process exits with an unexpected
status, a bad Status is returned, typically resulting in a test failure.

I verified the new functionality by locally reverting back the fault
injection code to use LOG(FATAL) and checking that ts_recovery-itest
reported that the process had crashed unexpectedly.

Change-Id: Ic8d9ae38efa65123ae69097a76c113b9709c3484
Reviewed-on: http://gerrit.cloudera.org:8080/5353
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/80ac8bae
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/80ac8bae
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/80ac8bae

Branch: refs/heads/master
Commit: 80ac8bae335b490c7b75351e6d4c321a58183c73
Parents: f9e5993
Author: Todd Lipcon <to...@apache.org>
Authored: Sun Dec 4 00:14:37 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Mon Dec 5 01:57:10 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/delete_table-test.cc |  3 +-
 .../integration-tests/disk_reservation-itest.cc |  4 +-
 .../integration-tests/external_mini_cluster.cc  | 42 ++++++++++++++++++--
 .../integration-tests/external_mini_cluster.h   | 27 +++++++++++--
 .../integration-tests/raft_consensus-itest.cc   |  8 ++--
 src/kudu/integration-tests/ts_recovery-itest.cc | 17 ++++----
 src/kudu/util/fault_injection.cc                | 13 ++----
 src/kudu/util/fault_injection.h                 |  7 +++-
 8 files changed, 90 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index 33d2e51..7ad0a3c 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -190,7 +190,7 @@ void DeleteTableTest::WaitForTabletDeletedOnTS(int index,
 void DeleteTableTest::WaitForTSToCrash(int index) {
   auto ts = cluster_->tablet_server(index);
   SCOPED_TRACE(ts->instance_id().permanent_uuid());
-  ASSERT_OK(ts->WaitForCrash(MonoDelta::FromSeconds(60)));
+  ASSERT_OK(ts->WaitForInjectedCrash(MonoDelta::FromSeconds(60)));
 }
 
 void DeleteTableTest::WaitForAllTSToCrash() {
@@ -491,6 +491,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterCrashDuringTabletCopy) {
       "--fault_crash_after_tc_files_fetched=1.0");
 
   // Restart TS 0 and add it to the config. It will crash when tablet copy starts.
+  ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
   string leader_uuid = GetLeaderUUID(cluster_->tablet_server(1)->uuid(), replicated_tablet_id);
   TServerDetails* leader = DCHECK_NOTNULL(ts_map_[leader_uuid]);
   TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/disk_reservation-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/disk_reservation-itest.cc b/src/kudu/integration-tests/disk_reservation-itest.cc
index ae81d8c..5ca7810 100644
--- a/src/kudu/integration-tests/disk_reservation-itest.cc
+++ b/src/kudu/integration-tests/disk_reservation-itest.cc
@@ -104,7 +104,7 @@ TEST_F(DiskReservationITest, TestFillMultipleDisks) {
   // Wait for crash due to inability to flush or compact.
   Status s;
   for (int i = 0; i < 10; i++) {
-    s = cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(1));
+    s = cluster_->tablet_server(0)->WaitForFatal(MonoDelta::FromSeconds(1));
     if (s.ok()) break;
     LOG(INFO) << "Rows inserted: " << workload.rows_inserted();
   }
@@ -144,7 +144,7 @@ TEST_F(DiskReservationITest, TestWalWriteToFullDiskAborts) {
   ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(0),
                               "disk_reserved_bytes_free_for_testing", "10000001"));
 
-  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(10)));
+  ASSERT_OK(cluster_->tablet_server(0)->WaitForFatal(MonoDelta::FromSeconds(10)));
   workload.StopAndJoin();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/external_mini_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.cc b/src/kudu/integration-tests/external_mini_cluster.cc
index ab69957..9bf09b3 100644
--- a/src/kudu/integration-tests/external_mini_cluster.cc
+++ b/src/kudu/integration-tests/external_mini_cluster.cc
@@ -40,6 +40,7 @@
 #include "kudu/util/async_util.h"
 #include "kudu/util/curl_util.h"
 #include "kudu/util/env.h"
+#include "kudu/util/fault_injection.h"
 #include "kudu/util/jsonreader.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/net/sockaddr.h"
@@ -710,16 +711,49 @@ bool ExternalDaemon::IsProcessAlive() const {
   return s.IsTimedOut();
 }
 
-Status ExternalDaemon::WaitForCrash(const MonoDelta& timeout) const {
+Status ExternalDaemon::WaitForInjectedCrash(const MonoDelta& timeout) const {
+  return WaitForCrash(timeout, [](int status) {
+      return WIFEXITED(status) && WEXITSTATUS(status) == fault_injection::kExitStatus;
+    }, "fault injection");
+}
+
+Status ExternalDaemon::WaitForFatal(const MonoDelta& timeout) const {
+  return WaitForCrash(timeout, [](int status) {
+      return WIFSIGNALED(status) && WTERMSIG(status) == SIGABRT;
+    }, "FATAL crash");
+}
+
+
+Status ExternalDaemon::WaitForCrash(const MonoDelta& timeout,
+                                    const std::function<bool(int)>& wait_status_predicate,
+                                    const char* crash_type_str) const {
+  CHECK(process_) << "process not started";
   MonoTime deadline = MonoTime::Now() + timeout;
 
   int i = 1;
-  while (MonoTime::Now() < deadline) {
-    if (!IsProcessAlive()) return Status::OK();
+  while (IsProcessAlive() && MonoTime::Now() < deadline) {
     int sleep_ms = std::min(i++ * 10, 200);
     SleepFor(MonoDelta::FromMilliseconds(sleep_ms));
   }
-  return Status::TimedOut(Substitute("Process did not crash within $0", timeout.ToString()));
+
+  if (IsProcessAlive()) {
+    return Status::TimedOut(Substitute("Process did not crash within $0",
+                                       timeout.ToString()));
+  }
+
+  // If the process has exited, make sure it exited with the expected status.
+  int wait_status;
+  RETURN_NOT_OK_PREPEND(process_->WaitNoBlock(&wait_status),
+                        "could not get wait status");
+
+  if (!wait_status_predicate(wait_status)) {
+    string info_str;
+    RETURN_NOT_OK_PREPEND(process_->GetExitStatus(nullptr, &info_str),
+                          "could not get description of exit");
+    return Status::Aborted(
+        Substitute("process exited, but not due to a $0: $1", crash_type_str, info_str));
+  }
+  return Status::OK();
 }
 
 pid_t ExternalDaemon::pid() const {

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/external_mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.h b/src/kudu/integration-tests/external_mini_cluster.h
index f1dad8c..d7b0fe2 100644
--- a/src/kudu/integration-tests/external_mini_cluster.h
+++ b/src/kudu/integration-tests/external_mini_cluster.h
@@ -17,6 +17,7 @@
 #ifndef KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H
 #define KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H
 
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
@@ -348,9 +349,18 @@ class ExternalDaemon : public RefCountedThreadSafe<ExternalDaemon> {
   // explicitly call Shutdown().
   bool IsProcessAlive() const;
 
-  // Wait for this process to crash, or the given timeout to
-  // elapse. If the process is already crashed, returns immediately.
-  Status WaitForCrash(const MonoDelta& timeout) const;
+  // Wait for this process to crash due to a configured fault
+  // injection, or the given timeout to elapse. If the process
+  // crashes for some reason other than an injected fault, returns
+  // Status::Aborted.
+  //
+  // If the process is already crashed, returns immediately.
+  Status WaitForInjectedCrash(const MonoDelta& timeout) const;
+
+  // Same as the above, but expects the process to crash due to a
+  // LOG(FATAL) or CHECK failure. In other words, waits for it to
+  // crash from SIGABRT.
+  Status WaitForFatal(const MonoDelta& timeout) const;
 
   virtual void Shutdown();
 
@@ -381,6 +391,17 @@ class ExternalDaemon : public RefCountedThreadSafe<ExternalDaemon> {
 
   Status StartProcess(const std::vector<std::string>& user_flags);
 
+  // Wait for the process to exit, and then call 'wait_status_predicate'
+  // on the resulting exit status. NOTE: this is not the return code, but
+  // rather the value provided by waitpid(2): use WEXITSTATUS, etc.
+  //
+  // If the predicate matches, returns OK. Otherwise, returns an error.
+  // 'crash_type_str' should be a descriptive name for the type of crash,
+  // used in formatting the error message.
+  Status WaitForCrash(const MonoDelta& timeout,
+                      const std::function<bool(int)>& wait_status_predicate,
+                      const char* crash_type_str) const;
+
   // In a code-coverage build, try to flush the coverage data to disk.
   // In a non-coverage build, this does nothing.
   void FlushCoverage();

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 2bac732..995a9c4 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -2797,7 +2797,7 @@ TEST_F(RaftConsensusITest, Test_KUDU_1735) {
   // change operations due to the above fault injection.
   ASSERT_OK(RemoveServer(leader_tserver, tablet_id_, tservers[1], boost::none, kTimeout));
   for (int i = 1; i < cluster_->num_tablet_servers(); i++) {
-    ASSERT_OK(external_tservers[i]->WaitForCrash(kTimeout));
+    ASSERT_OK(external_tservers[i]->WaitForInjectedCrash(kTimeout));
   }
 
   // Delete the table, so that when we restart the crashed servers, they'll get RPCs to
@@ -2935,7 +2935,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
   }
   ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
   for (int i = 1; i <= 2; i++) {
-    ASSERT_OK(ext_tservers[i]->WaitForCrash(MonoDelta::FromSeconds(10)));
+    ASSERT_OK(ext_tservers[i]->WaitForFatal(MonoDelta::FromSeconds(10)));
   }
 
   // Now we know followers crash when they write to their log.
@@ -2954,7 +2954,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
   workload.Start();
 
   // Leader should crash as well.
-  ASSERT_OK(ext_tservers[0]->WaitForCrash(MonoDelta::FromSeconds(10)));
+  ASSERT_OK(ext_tservers[0]->WaitForFatal(MonoDelta::FromSeconds(10)));
   workload.StopAndJoin();
 
   LOG(INFO) << "Everything crashed!";
@@ -3003,7 +3003,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
     ASSERT_OK(ext_tservers[i]->Restart());
   }
   // Leader will crash.
-  ASSERT_OK(ext_tservers[0]->WaitForCrash(MonoDelta::FromSeconds(10)));
+  ASSERT_OK(ext_tservers[0]->WaitForFatal(MonoDelta::FromSeconds(10)));
 }
 
 }  // namespace tserver

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index 3480cec..ec20120 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -86,7 +86,7 @@ TEST_F(TsRecoveryITest, TestRestartWithOrphanedReplicates) {
   work.Start();
 
   // Wait for the process to crash due to the injected fault.
-  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+  ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
 
   // Stop the writers.
   work.StopAndJoin();
@@ -133,7 +133,7 @@ TEST_F(TsRecoveryITest, TestRestartWithPendingCommitFromFailedOp) {
   work.Start();
 
   // Wait for the process to crash due to the injected fault.
-  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+  ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
 
   // Stop the writers.
   work.StopAndJoin();
@@ -172,11 +172,14 @@ TEST_F(TsRecoveryITest, TestCrashDuringLogReplay) {
   cluster_->tablet_server(0)->Shutdown();
 
   // Restart might crash very quickly and actually return a bad status, so we
-  // ignore the result.
-  ignore_result(cluster_->tablet_server(0)->Restart());
+  // have to check the result.
+  Status s = cluster_->tablet_server(0)->Restart();
 
-  // Wait for the process to crash during log replay.
-  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+  // Wait for the process to crash during log replay (if it didn't already crash
+  // above while we were restarting it).
+  if (s.ok()) {
+    ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
+  }
 
   // Now remove the crash flag, so the next replay will complete, and restart
   // the server once more.
@@ -210,7 +213,7 @@ TEST_F(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
   work.Start();
 
   // Wait for the process to crash during log roll.
-  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(60)));
+  ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(60)));
   work.StopAndJoin();
 
   cluster_->tablet_server(0)->Shutdown();

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/util/fault_injection.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/fault_injection.cc b/src/kudu/util/fault_injection.cc
index db67a24..6a359a6 100644
--- a/src/kudu/util/fault_injection.cc
+++ b/src/kudu/util/fault_injection.cc
@@ -50,18 +50,13 @@ void DoMaybeFault(const char* fault_str, double fraction) {
   if (PREDICT_TRUE(g_random->NextDoubleFraction() >= fraction)) {
     return;
   }
-
-  // Disable core dumps -- it's not useful to get a core dump when we're
-  // purposefully crashing, and some tests cause lots of server crashes
-  // in a loop. This avoids filling up the disk with useless cores.
-  DisableCoreDumps();
-
-  LOG(FATAL) << "Injected fault: " << fault_str;
+  LOG(ERROR) << "Injecting fault: " << fault_str << " (process will exit)";
+  exit(kExitStatus);
 }
 
-void DoInjectRandomLatency(double max_ms) {
+void DoInjectRandomLatency(double max_latency_ms) {
   GoogleOnceInit(&g_random_once, InitRandom);
-  SleepFor(MonoDelta::FromMilliseconds(g_random->NextDoubleFraction() * max_ms));
+  SleepFor(MonoDelta::FromMilliseconds(g_random->NextDoubleFraction() * max_latency_ms));
 }
 
 Status DoMaybeReturnFailure(double fraction,

http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/util/fault_injection.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/fault_injection.h b/src/kudu/util/fault_injection.h
index 462908b..cc22bab 100644
--- a/src/kudu/util/fault_injection.h
+++ b/src/kudu/util/fault_injection.h
@@ -56,9 +56,14 @@
 namespace kudu {
 namespace fault_injection {
 
+// The exit status returned from a process exiting due to a fault.
+// The choice of value here is arbitrary: just needs to be something
+// wouldn't normally be returned by a non-fault-injection code path.
+constexpr int kExitStatus = 85;
+
 // Out-of-line implementation.
 void DoMaybeFault(const char* fault_str, double fraction);
-void DoInjectRandomLatency(double max_latency);
+void DoInjectRandomLatency(double max_latency_ms);
 Status DoMaybeReturnFailure(double fraction,
                             const Status& bad_status_to_return);