You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2018/07/30 18:46:58 UTC

[2/4] kudu git commit: [kudu-admin-test] improvements on error reporting

[kudu-admin-test] improvements on error reporting

This changelist introduces improvements on logging in case
if an assertion triggers in the rebalancer-related scenarios.

I found it's hard to troubleshoot the flakiness reported lately
in the RebalancerAndSingleReplicaTablets.SingleReplicasStayOrMove
scenario.  With hindsight, it turned out that improved logging
on assertions would help a lot.

Change-Id: Ibfa74d5a78e89bf7ff1a0e914384999466460145
Reviewed-on: http://gerrit.cloudera.org:8080/11046
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/660ee80b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/660ee80b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/660ee80b

Branch: refs/heads/master
Commit: 660ee80bb4c0f81cb6f0a08365a7e2abd0a3e409
Parents: 7e733a7
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Jul 24 20:20:00 2018 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Mon Jul 30 15:06:44 2018 +0000

----------------------------------------------------------------------
 src/kudu/tools/kudu-admin-test.cc | 80 ++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/660ee80b/src/kudu/tools/kudu-admin-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc
index 21bae3b..0408e35 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -48,7 +48,6 @@
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/split.h"
 #include "kudu/gutil/strings/substitute.h"
-#include "kudu/gutil/strings/util.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
 #include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/integration-tests/test_workload.h"
@@ -105,6 +104,7 @@ using std::atomic;
 using std::back_inserter;
 using std::copy;
 using std::deque;
+using std::endl;
 using std::ostringstream;
 using std::string;
 using std::thread;
@@ -1331,6 +1331,14 @@ TEST_F(AdminCliTest, TestListTablesDetail) {
   }
 }
 
+static string ToolRunInfo(const Status& s, const string& out, const string& err) {
+  ostringstream str;
+  str << s.ToString() << endl;
+  str << "stdout: " << out << endl;
+  str << "stderr: " << err << endl;
+  return str.str();
+}
+
 // Make sure the rebalancer doesn't start if a tablet server is down.
 class RebalanceStartCriteriaTest :
     public AdminCliTest,
@@ -1359,13 +1367,14 @@ TEST_P(RebalanceStartCriteriaTest, TabletServerIsDown) {
     ts->Shutdown();
   }
 
+  string out;
   string err;
   Status s = RunKuduTool({
     "cluster",
     "rebalance",
     cluster_->master()->bound_rpc_addr().ToString()
-  }, nullptr, &err);
-  ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
+  }, &out, &err);
+  ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err);
   const auto err_msg_pattern = Substitute(
       "Illegal state: tablet server .* \\($0\\): "
       "unacceptable health status UNAVAILABLE",
@@ -1501,7 +1510,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
     string out;
     string err;
     const Status s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
         << "stderr: " << err;
   }
@@ -1512,7 +1521,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
     string out;
     string err;
     const Status s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out,
         "rebalancing is complete: cluster is balanced (moved 0 replicas)")
         << "stderr: " << err;
@@ -1532,7 +1541,7 @@ TEST_P(RebalanceParamTest, Rebalance) {
     string out;
     string err;
     const Status s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
         << "stderr: " << err;
     // The cluster was un-balanced, so many replicas should have been moved.
@@ -1570,6 +1579,8 @@ class RebalancingTest :
   }
 
  protected:
+  static const char* const kExitOnSignalStr;
+
   void Prepare(const vector<string>& extra_tserver_flags = {},
                const vector<string>& extra_master_flags = {}) {
     copy(extra_tserver_flags.begin(), extra_tserver_flags.end(),
@@ -1616,6 +1627,7 @@ class RebalancingTest :
   vector<string> tserver_flags_;
   vector<string> master_flags_;
 };
+const char* const RebalancingTest::kExitOnSignalStr = "kudu: process exited on signal";
 
 // Make sure the rebalancer is able to do its job if running concurrently
 // with DDL activity on the cluster.
@@ -1781,7 +1793,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) {
     string out;
     string err;
     const auto s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
         << "stderr: " << err;
   }
@@ -1792,7 +1804,7 @@ TEST_P(DDLDuringRebalancingTest, TablesCreatedAndDeletedDuringRebalancing) {
     string out;
     string err;
     const auto s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out,
         "rebalancing is complete: cluster is balanced (moved 0 replicas)")
         << "stderr: " << err;
@@ -1829,21 +1841,18 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
   };
 
   const auto runner_func = [&]() {
+    string out;
     string err;
-    const auto s = RunKuduTool(tool_args, nullptr, &err);
-
-    ostringstream os;
-    os << "rebalancer status: " << s.ToString();
-    // One might expect a bad status returned: e.g., due to some race so
-    // the rebalancer didn't able to make progress for more than
-    // --max_staleness_interval_sec, etc.
+    const auto s = RunKuduTool(tool_args, &out, &err);
     if (!s.ok()) {
-      os << " : " << err;
+      // One might expect a bad status returned: e.g., due to some race so
+      // the rebalancer didn't able to make progress for more than
+      // --max_staleness_interval_sec, etc.
+      LOG(INFO) << "rebalancer run info: " << ToolRunInfo(s, out, err);
     }
-    LOG(INFO) << os.str();
 
     // Should not exit on a signal: not expecting SIGSEGV, SIGABRT, etc.
-    return !MatchPattern(err, "*kudu: process exited on signal*");
+    return s.ToString().find(kExitOnSignalStr) == string::npos;
   };
 
   CountDownLatch start_synchronizer(1);
@@ -1868,7 +1877,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
     string out;
     string err;
     const auto s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
         << "stderr: " << err;
   }
@@ -1881,7 +1890,7 @@ TEST_P(ConcurrentRebalancersTest, TwoConcurrentRebalancers) {
     string out;
     string err;
     const auto s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString() << ": " << err;
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out,
         "rebalancing is complete: cluster is balanced (moved 0 replicas)")
         << "stderr: " << err;
@@ -1921,13 +1930,14 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) {
   // Pre-condition: 'kudu cluster ksck' should be happy with the cluster state
   // shortly after initial setup.
   ASSERT_EVENTUALLY([&]() {
+    string out;
     string err;
     const auto s = RunKuduTool({
       "cluster",
       "ksck",
       cluster_->master()->bound_rpc_addr().ToString(),
-    }, nullptr, &err);
-    ASSERT_TRUE(s.ok()) << "stderr: " << err;
+    }, &out, &err);
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
   });
 
   Random r(SeedRandom());
@@ -1960,13 +1970,13 @@ TEST_P(TserverGoesDownDuringRebalancingTest, TserverDown) {
       // server goes down.
       "--max_moves_per_server=1",
     }, &out, &err);
-    ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
+    ASSERT_TRUE(s.IsRuntimeError()) << ToolRunInfo(s, out, err);
+
+    // The rebalancer tool should not crash.
+    ASSERT_STR_NOT_CONTAINS(s.ToString(), kExitOnSignalStr);
     ASSERT_STR_MATCHES(
         err, "Illegal state: tablet server .* \\(.*\\): "
              "unacceptable health status UNAVAILABLE");
-
-    // The rebalancer tool should not crash.
-    ASSERT_STR_NOT_CONTAINS(err, "kudu: process exited on signal");
   }
 
   run = false;
@@ -2011,9 +2021,10 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) {
   atomic<bool> run(true);
   thread runner([&]() {
     while (run) {
+      string out;
       string err;
-      const auto s = RunKuduTool(tool_args, nullptr, &err);
-      CHECK(s.ok()) << s.ToString() << "stderr: " << err;
+      const auto s = RunKuduTool(tool_args, &out, &err);
+      CHECK(s.ok()) << ToolRunInfo(s, out, err);
     }
   });
   auto runner_cleanup = MakeScopedCleanup([&]() {
@@ -2037,7 +2048,7 @@ TEST_P(TserverAddedDuringRebalancingTest, TserverStarts) {
     string out;
     string err;
     const auto s = RunKuduTool(tool_args, &out, &err);
-    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
     ASSERT_STR_CONTAINS(out,
         "rebalancing is complete: cluster is balanced (moved 0 replicas)")
         << "stderr: " << err;
@@ -2125,17 +2136,20 @@ TEST_P(RebalancerAndSingleReplicaTablets, SingleReplicasStayOrMove) {
     cluster_->master()->bound_rpc_addr().ToString(),
     Substitute("--move_single_replicas=$0", move_single_replica),
   }, &out, &err);
-  ASSERT_TRUE(s.ok()) << s.ToString() << ":" << err;
-  ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced");
+  ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
+  ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
+      << "stderr: " << err;
   if (move_single_replica == "enabled" ||
       (move_single_replica == "auto" && is_343_scheme)) {
     // Should move appropriate replicas of single-replica tablets.
     ASSERT_STR_NOT_CONTAINS(out,
-        "rebalancing is complete: cluster is balanced (moved 0 replicas)");
+        "rebalancing is complete: cluster is balanced (moved 0 replicas)")
+        << "stderr: " << err;
     ASSERT_STR_NOT_CONTAINS(err, "has single replica, skipping");
   } else {
     ASSERT_STR_CONTAINS(out,
-        "rebalancing is complete: cluster is balanced (moved 0 replicas)");
+        "rebalancing is complete: cluster is balanced (moved 0 replicas)")
+        << "stderr: " << err;
     ASSERT_STR_MATCHES(err, "tablet .* of table '.*' (.*) has single replica, skipping");
   }