You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/12/05 01:58:27 UTC
[7/7] kudu git commit: KUDU-1782. Fault injection crashes should exit
with a specific exit code
KUDU-1782. Fault injection crashes should exit with a specific exit code
This changes the fault injection "crash" to use exit(85) instead of
LOG(FATAL). This means that we can now distinguish between a process
exiting due to fault injection compared to a process exiting due to a
crash or real FATAL-inducing issue.
The WaitForCrash method in ExternalMiniCluster::ExternalDaemon is now
split into WaitForInjectedCrash, which specifically checks that the exit
code matches the expected one, and WaitForFatal, which checks that the
exit code matches a SIGABRT. If the process exits with an unexpected
status, a bad Status is returned, typically resulting in a test failure.
I verified the new functionality by locally reverting back the fault
injection code to use LOG(FATAL) and checking that ts_recovery-itest
reported that the process had crashed unexpectedly.
Change-Id: Ic8d9ae38efa65123ae69097a76c113b9709c3484
Reviewed-on: http://gerrit.cloudera.org:8080/5353
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/80ac8bae
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/80ac8bae
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/80ac8bae
Branch: refs/heads/master
Commit: 80ac8bae335b490c7b75351e6d4c321a58183c73
Parents: f9e5993
Author: Todd Lipcon <to...@apache.org>
Authored: Sun Dec 4 00:14:37 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Mon Dec 5 01:57:10 2016 +0000
----------------------------------------------------------------------
src/kudu/integration-tests/delete_table-test.cc | 3 +-
.../integration-tests/disk_reservation-itest.cc | 4 +-
.../integration-tests/external_mini_cluster.cc | 42 ++++++++++++++++++--
.../integration-tests/external_mini_cluster.h | 27 +++++++++++--
.../integration-tests/raft_consensus-itest.cc | 8 ++--
src/kudu/integration-tests/ts_recovery-itest.cc | 17 ++++----
src/kudu/util/fault_injection.cc | 13 ++----
src/kudu/util/fault_injection.h | 7 +++-
8 files changed, 90 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index 33d2e51..7ad0a3c 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -190,7 +190,7 @@ void DeleteTableTest::WaitForTabletDeletedOnTS(int index,
void DeleteTableTest::WaitForTSToCrash(int index) {
auto ts = cluster_->tablet_server(index);
SCOPED_TRACE(ts->instance_id().permanent_uuid());
- ASSERT_OK(ts->WaitForCrash(MonoDelta::FromSeconds(60)));
+ ASSERT_OK(ts->WaitForInjectedCrash(MonoDelta::FromSeconds(60)));
}
void DeleteTableTest::WaitForAllTSToCrash() {
@@ -491,6 +491,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterCrashDuringTabletCopy) {
"--fault_crash_after_tc_files_fetched=1.0");
// Restart TS 0 and add it to the config. It will crash when tablet copy starts.
+ ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
string leader_uuid = GetLeaderUUID(cluster_->tablet_server(1)->uuid(), replicated_tablet_id);
TServerDetails* leader = DCHECK_NOTNULL(ts_map_[leader_uuid]);
TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/disk_reservation-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/disk_reservation-itest.cc b/src/kudu/integration-tests/disk_reservation-itest.cc
index ae81d8c..5ca7810 100644
--- a/src/kudu/integration-tests/disk_reservation-itest.cc
+++ b/src/kudu/integration-tests/disk_reservation-itest.cc
@@ -104,7 +104,7 @@ TEST_F(DiskReservationITest, TestFillMultipleDisks) {
// Wait for crash due to inability to flush or compact.
Status s;
for (int i = 0; i < 10; i++) {
- s = cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(1));
+ s = cluster_->tablet_server(0)->WaitForFatal(MonoDelta::FromSeconds(1));
if (s.ok()) break;
LOG(INFO) << "Rows inserted: " << workload.rows_inserted();
}
@@ -144,7 +144,7 @@ TEST_F(DiskReservationITest, TestWalWriteToFullDiskAborts) {
ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(0),
"disk_reserved_bytes_free_for_testing", "10000001"));
- ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(10)));
+ ASSERT_OK(cluster_->tablet_server(0)->WaitForFatal(MonoDelta::FromSeconds(10)));
workload.StopAndJoin();
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/external_mini_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.cc b/src/kudu/integration-tests/external_mini_cluster.cc
index ab69957..9bf09b3 100644
--- a/src/kudu/integration-tests/external_mini_cluster.cc
+++ b/src/kudu/integration-tests/external_mini_cluster.cc
@@ -40,6 +40,7 @@
#include "kudu/util/async_util.h"
#include "kudu/util/curl_util.h"
#include "kudu/util/env.h"
+#include "kudu/util/fault_injection.h"
#include "kudu/util/jsonreader.h"
#include "kudu/util/metrics.h"
#include "kudu/util/net/sockaddr.h"
@@ -710,16 +711,49 @@ bool ExternalDaemon::IsProcessAlive() const {
return s.IsTimedOut();
}
-Status ExternalDaemon::WaitForCrash(const MonoDelta& timeout) const {
+Status ExternalDaemon::WaitForInjectedCrash(const MonoDelta& timeout) const {
+ return WaitForCrash(timeout, [](int status) {
+ return WIFEXITED(status) && WEXITSTATUS(status) == fault_injection::kExitStatus;
+ }, "fault injection");
+}
+
+Status ExternalDaemon::WaitForFatal(const MonoDelta& timeout) const {
+ return WaitForCrash(timeout, [](int status) {
+ return WIFSIGNALED(status) && WTERMSIG(status) == SIGABRT;
+ }, "FATAL crash");
+}
+
+
+Status ExternalDaemon::WaitForCrash(const MonoDelta& timeout,
+ const std::function<bool(int)>& wait_status_predicate,
+ const char* crash_type_str) const {
+ CHECK(process_) << "process not started";
MonoTime deadline = MonoTime::Now() + timeout;
int i = 1;
- while (MonoTime::Now() < deadline) {
- if (!IsProcessAlive()) return Status::OK();
+ while (IsProcessAlive() && MonoTime::Now() < deadline) {
int sleep_ms = std::min(i++ * 10, 200);
SleepFor(MonoDelta::FromMilliseconds(sleep_ms));
}
- return Status::TimedOut(Substitute("Process did not crash within $0", timeout.ToString()));
+
+ if (IsProcessAlive()) {
+ return Status::TimedOut(Substitute("Process did not crash within $0",
+ timeout.ToString()));
+ }
+
+ // If the process has exited, make sure it exited with the expected status.
+ int wait_status;
+ RETURN_NOT_OK_PREPEND(process_->WaitNoBlock(&wait_status),
+ "could not get wait status");
+
+ if (!wait_status_predicate(wait_status)) {
+ string info_str;
+ RETURN_NOT_OK_PREPEND(process_->GetExitStatus(nullptr, &info_str),
+ "could not get description of exit");
+ return Status::Aborted(
+ Substitute("process exited, but not due to a $0: $1", crash_type_str, info_str));
+ }
+ return Status::OK();
}
pid_t ExternalDaemon::pid() const {
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/external_mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster.h b/src/kudu/integration-tests/external_mini_cluster.h
index f1dad8c..d7b0fe2 100644
--- a/src/kudu/integration-tests/external_mini_cluster.h
+++ b/src/kudu/integration-tests/external_mini_cluster.h
@@ -17,6 +17,7 @@
#ifndef KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H
#define KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H
+#include <functional>
#include <map>
#include <memory>
#include <string>
@@ -348,9 +349,18 @@ class ExternalDaemon : public RefCountedThreadSafe<ExternalDaemon> {
// explicitly call Shutdown().
bool IsProcessAlive() const;
- // Wait for this process to crash, or the given timeout to
- // elapse. If the process is already crashed, returns immediately.
- Status WaitForCrash(const MonoDelta& timeout) const;
+ // Wait for this process to crash due to a configured fault
+ // injection, or the given timeout to elapse. If the process
+ // crashes for some reason other than an injected fault, returns
+ // Status::Aborted.
+ //
+ // If the process is already crashed, returns immediately.
+ Status WaitForInjectedCrash(const MonoDelta& timeout) const;
+
+ // Same as the above, but expects the process to crash due to a
+ // LOG(FATAL) or CHECK failure. In other words, waits for it to
+ // crash from SIGABRT.
+ Status WaitForFatal(const MonoDelta& timeout) const;
virtual void Shutdown();
@@ -381,6 +391,17 @@ class ExternalDaemon : public RefCountedThreadSafe<ExternalDaemon> {
Status StartProcess(const std::vector<std::string>& user_flags);
+ // Wait for the process to exit, and then call 'wait_status_predicate'
+ // on the resulting exit status. NOTE: this is not the return code, but
+ // rather the value provided by waitpid(2): use WEXITSTATUS, etc.
+ //
+ // If the predicate matches, returns OK. Otherwise, returns an error.
+ // 'crash_type_str' should be a descriptive name for the type of crash,
+ // used in formatting the error message.
+ Status WaitForCrash(const MonoDelta& timeout,
+ const std::function<bool(int)>& wait_status_predicate,
+ const char* crash_type_str) const;
+
// In a code-coverage build, try to flush the coverage data to disk.
// In a non-coverage build, this does nothing.
void FlushCoverage();
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 2bac732..995a9c4 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -2797,7 +2797,7 @@ TEST_F(RaftConsensusITest, Test_KUDU_1735) {
// change operations due to the above fault injection.
ASSERT_OK(RemoveServer(leader_tserver, tablet_id_, tservers[1], boost::none, kTimeout));
for (int i = 1; i < cluster_->num_tablet_servers(); i++) {
- ASSERT_OK(external_tservers[i]->WaitForCrash(kTimeout));
+ ASSERT_OK(external_tservers[i]->WaitForInjectedCrash(kTimeout));
}
// Delete the table, so that when we restart the crashed servers, they'll get RPCs to
@@ -2935,7 +2935,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
}
ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
for (int i = 1; i <= 2; i++) {
- ASSERT_OK(ext_tservers[i]->WaitForCrash(MonoDelta::FromSeconds(10)));
+ ASSERT_OK(ext_tservers[i]->WaitForFatal(MonoDelta::FromSeconds(10)));
}
// Now we know followers crash when they write to their log.
@@ -2954,7 +2954,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
workload.Start();
// Leader should crash as well.
- ASSERT_OK(ext_tservers[0]->WaitForCrash(MonoDelta::FromSeconds(10)));
+ ASSERT_OK(ext_tservers[0]->WaitForFatal(MonoDelta::FromSeconds(10)));
workload.StopAndJoin();
LOG(INFO) << "Everything crashed!";
@@ -3003,7 +3003,7 @@ TEST_F(RaftConsensusITest, TestLogIOErrorIsFatal) {
ASSERT_OK(ext_tservers[i]->Restart());
}
// Leader will crash.
- ASSERT_OK(ext_tservers[0]->WaitForCrash(MonoDelta::FromSeconds(10)));
+ ASSERT_OK(ext_tservers[0]->WaitForFatal(MonoDelta::FromSeconds(10)));
}
} // namespace tserver
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index 3480cec..ec20120 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -86,7 +86,7 @@ TEST_F(TsRecoveryITest, TestRestartWithOrphanedReplicates) {
work.Start();
// Wait for the process to crash due to the injected fault.
- ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+ ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
// Stop the writers.
work.StopAndJoin();
@@ -133,7 +133,7 @@ TEST_F(TsRecoveryITest, TestRestartWithPendingCommitFromFailedOp) {
work.Start();
// Wait for the process to crash due to the injected fault.
- ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+ ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
// Stop the writers.
work.StopAndJoin();
@@ -172,11 +172,14 @@ TEST_F(TsRecoveryITest, TestCrashDuringLogReplay) {
cluster_->tablet_server(0)->Shutdown();
// Restart might crash very quickly and actually return a bad status, so we
- // ignore the result.
- ignore_result(cluster_->tablet_server(0)->Restart());
+ // have to check the result.
+ Status s = cluster_->tablet_server(0)->Restart();
- // Wait for the process to crash during log replay.
- ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(30)));
+ // Wait for the process to crash during log replay (if it didn't already crash
+ // above while we were restarting it).
+ if (s.ok()) {
+ ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(30)));
+ }
// Now remove the crash flag, so the next replay will complete, and restart
// the server once more.
@@ -210,7 +213,7 @@ TEST_F(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
work.Start();
// Wait for the process to crash during log roll.
- ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(60)));
+ ASSERT_OK(cluster_->tablet_server(0)->WaitForInjectedCrash(MonoDelta::FromSeconds(60)));
work.StopAndJoin();
cluster_->tablet_server(0)->Shutdown();
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/util/fault_injection.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/fault_injection.cc b/src/kudu/util/fault_injection.cc
index db67a24..6a359a6 100644
--- a/src/kudu/util/fault_injection.cc
+++ b/src/kudu/util/fault_injection.cc
@@ -50,18 +50,13 @@ void DoMaybeFault(const char* fault_str, double fraction) {
if (PREDICT_TRUE(g_random->NextDoubleFraction() >= fraction)) {
return;
}
-
- // Disable core dumps -- it's not useful to get a core dump when we're
- // purposefully crashing, and some tests cause lots of server crashes
- // in a loop. This avoids filling up the disk with useless cores.
- DisableCoreDumps();
-
- LOG(FATAL) << "Injected fault: " << fault_str;
+ LOG(ERROR) << "Injecting fault: " << fault_str << " (process will exit)";
+ exit(kExitStatus);
}
-void DoInjectRandomLatency(double max_ms) {
+void DoInjectRandomLatency(double max_latency_ms) {
GoogleOnceInit(&g_random_once, InitRandom);
- SleepFor(MonoDelta::FromMilliseconds(g_random->NextDoubleFraction() * max_ms));
+ SleepFor(MonoDelta::FromMilliseconds(g_random->NextDoubleFraction() * max_latency_ms));
}
Status DoMaybeReturnFailure(double fraction,
http://git-wip-us.apache.org/repos/asf/kudu/blob/80ac8bae/src/kudu/util/fault_injection.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/fault_injection.h b/src/kudu/util/fault_injection.h
index 462908b..cc22bab 100644
--- a/src/kudu/util/fault_injection.h
+++ b/src/kudu/util/fault_injection.h
@@ -56,9 +56,14 @@
namespace kudu {
namespace fault_injection {
+// The exit status returned from a process exiting due to a fault.
+// The choice of value here is arbitrary: just needs to be something
+// wouldn't normally be returned by a non-fault-injection code path.
+constexpr int kExitStatus = 85;
+
// Out-of-line implementation.
void DoMaybeFault(const char* fault_str, double fraction);
-void DoInjectRandomLatency(double max_latency);
+void DoInjectRandomLatency(double max_latency_ms);
Status DoMaybeReturnFailure(double fraction,
const Status& bad_status_to_return);