You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2018/07/30 18:46:58 UTC
[2/4] kudu git commit: [kudu-admin-test] improvements on error
reporting
[kudu-admin-test] improvements on error reporting
This changelist introduces improvements on logging in case
if an assertion triggers in the rebalancer-related scenarios.
I found it's hard to troubleshoot the flakiness reported lately
in the RebalancerAndSingleReplicaTablets.SingleReplicasStayOrMove
scenario. With hindsight, it turned out that improved logging
on assertions would help a lot.
Change-Id: Ibfa74d5a78e89bf7ff1a0e914384999466460145
Reviewed-on: http://gerrit.cloudera.org:8080/11046
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/660ee80b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/660ee80b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/660ee80b
Branch: refs/heads/master
Commit: 660ee80bb4c0f81cb6f0a08365a7e2abd0a3e409
Parents: 7e733a7
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Jul 24 20:20:00 2018 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Mon Jul 30 15:06:44 2018 +0000
----------------------------------------------------------------------
src/kudu/tools/kudu-admin-test.cc | 80 ++++++++++++++++++++--------------
1 file changed, 47 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/660ee80b/src/kudu/tools/kudu-admin-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc
index 21bae3b..0408e35 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -48,7 +48,6 @@
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/strings/split.h"
#include "kudu/gutil/strings/substitute.h"
-#include "kudu/gutil/strings/util.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/cluster_verifier.h"
#include "kudu/integration-tests/test_workload.h"
@@ -105,6 +104,7 @@ using std::atomic;
using std::back_inserter;
using std::copy;
using std::deque;
+using std::endl;
using std::ostringstream;
using std::string;
using std::thread;
@@ -1331,6 +1331,14 @@ TEST_F(AdminCliTest, TestListTablesDetail) {
}
}
+static string ToolRunInfo(const Status& s, const string& out, const string& err) {
+ ostringstream str;
+ str << s.ToString() << endl;
+ str << "stdout: " << out << endl;
+ str << "stderr: " << err << endl;
+ return str.str();
+}
+
// Make sure the rebalancer doesn't start if a tablet server is down.
class RebalanceStartCriteriaTest :
public AdminCliTest,
@@ -1359,13 +1367,14 @@ TEST_P(RebalanceStartCriteriaTest, TabletServerIsDown) {
ts->Shutdown();
}
+ string out;
string err;
Status s = RunKuduTool({
"cluster",
"rebalance",
cluster_->master()->bound_rpc_addr().ToString()
- }, nullptr, &err);
- ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
+ }, &out, &err);
+ ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err);
const auto err_msg_pattern = Substitute(
"Illegal state: tablet server .* \\($0\\): "
"unacceptable health status UNAVAILABLE",
@@ -1501,7 +1510,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
string out;
string err;
const Status s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
<< "stderr: " << err;
}
@@ -1512,7 +1521,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
string out;
string err;
const Status s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out,
"rebalancing is complete: cluster is balanced (moved 0 replicas)")
<< "stderr: " << err;
@@ -1532,7 +1541,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
string out;
string err;
const Status s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
<< "stderr: " << err;
// The cluster was un-balanced, so many replicas should have been moved.
@@ -1570,6 +1579,8 @@ class RebalancingTest :
}
protected:
+ static const char* const kExitOnSignalStr;
+
void Prepare(const vector<string>& extra_tserver_flags = {},
const vector<string>& extra_master_flags = {}) {
copy(extra_tserver_flags.begin(), extra_tserver_flags.end(),
@@ -1616,6 +1627,7 @@ class RebalancingTest :
vector<string> tserver_flags_;
vector<string> master_flags_;
};
+const char* const RebalancingTest::kExitOnSignalStr = "kudu: process exited on signal";
// Make sure the rebalancer is able to do its job if running concurrently
// with DDL activity on the cluster.
@@ -1781,7 +1793,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) {
string out;
string err;
const auto s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
<< "stderr: " << err;
}
@@ -1792,7 +1804,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) {
string out;
string err;
const auto s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out,
"rebalancing is complete: cluster is balanced (moved 0 replicas)")
<< "stderr: " << err;
@@ -1829,21 +1841,18 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
};
const auto runner_func = [&]() {
+ string out;
string err;
- const auto s = RunKuduTool(tool_args, nullptr, &err);
-
- ostringstream os;
- os << "rebalancer status: " << s.ToString();
- // One might expect a bad status returned: e.g., due to some race so
- // the rebalancer didn't able to make progress for more than
- // --max_staleness_interval_sec, etc.
+ const auto s = RunKuduTool(tool_args, &out, &err);
if (!s.ok()) {
- os << " : " << err;
+ // One might expect a bad status returned: e.g., due to some race so
+ // the rebalancer didn't able to make progress for more than
+ // --max_staleness_interval_sec, etc.
+ LOG(INFO) << "rebalancer run info: " << ToolRunInfo(s, out, err);
}
- LOG(INFO) << os.str();
// Should not exit on a signal: not expecting SIGSEGV, SIGABRT, etc.
- return !MatchPattern(err, "*kudu: process exited on signal*");
+ return s.ToString().find(kExitOnSignalStr) == string::npos;
};
CountDownLatch start_synchronizer(1);
@@ -1868,7 +1877,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
string out;
string err;
const auto s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
<< "stderr: " << err;
}
@@ -1881,7 +1890,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
string out;
string err;
const auto s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out,
"rebalancing is complete: cluster is balanced (moved 0 replicas)")
<< "stderr: " << err;
@@ -1921,13 +1930,14 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) {
// Pre-condition: 'kudu cluster ksck' should be happy with the cluster state
// shortly after initial setup.
ASSERT_EVENTUALLY([&]() {
+ string out;
string err;
const auto s = RunKuduTool({
"cluster",
"ksck",
cluster_->master()->bound_rpc_addr().ToString(),
- }, nullptr, &err);
- ASSERT_TRUE(s.ok()) << "stderr: " << err;
+ }, &out, &err);
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
});
Random r(SeedRandom());
@@ -1960,13 +1970,13 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) {
// server goes down.
"--max_moves_per_server=1",
}, &out, &err);
- ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
+ ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err);
+
+ // The rebalancer tool should not crash.
+ ASSERT_STR_NOT_CONTAINS(s.ToString(), kExitOnSignalStr);
ASSERT_STR_MATCHES(
err, "Illegal state: tablet server .* \\(.*\\): "
"unacceptable health status UNAVAILABLE");
-
- // The rebalancer tool should not crash.
- ASSERT_STR_NOT_CONTAINS(err, "kudu: process exited on signal");
}
run = false;
@@ -2011,9 +2021,10 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) {
atomic<bool> run(true);
thread runner([&]() {
while (run) {
+ string out;
string err;
- const auto s = RunKuduTool(tool_args, nullptr, &err);
- CHECK(s.ok()) << s.ToString() << "stderr: " << err;
+ const auto s = RunKuduTool(tool_args, &out, &err);
+ CHECK(s.ok()) << ToolRunInfo(s, out, err);
}
});
auto runner_cleanup = MakeScopedCleanup([&]() {
@@ -2037,7 +2048,7 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) {
string out;
string err;
const auto s = RunKuduTool(tool_args, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
ASSERT_STR_CONTAINS(out,
"rebalancing is complete: cluster is balanced (moved 0 replicas)")
<< "stderr: " << err;
@@ -2125,17 +2136,20 @@ TEST_P(RebalancerAndSingleReplicaTablets, SingleReplicasStayOrMove) {
cluster_->master()->bound_rpc_addr().ToString(),
Substitute("--move_single_replicas=$0", move_single_replica),
}, &out, &err);
- ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
- ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced");
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
+ ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
+ << "stderr: " << err;
if (move_single_replica == "enabled" ||
(move_single_replica == "auto" && is_343_scheme)) {
// Should move appropriate replicas of single-replica tablets.
ASSERT_STR_NOT_CONTAINS(out,
- "rebalancing is complete: cluster is balanced (moved 0 replicas)");
+ "rebalancing is complete: cluster is balanced (moved 0 replicas)")
+ << "stderr: " << err;
ASSERT_STR_NOT_CONTAINS(err, "has single replica, skipping");
} else {
ASSERT_STR_CONTAINS(out,
- "rebalancing is complete: cluster is balanced (moved 0 replicas)");
+ "rebalancing is complete: cluster is balanced (moved 0 replicas)")
+ << "stderr: " << err;
ASSERT_STR_MATCHES(err, "tablet .* of table '.*' (.*) has single replica, skipping");
}