You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by wd...@apache.org on 2018/05/01 20:10:45 UTC
[2/2] kudu git commit: [tools] ksck improvements [6/n]: Refactor
result handling
[tools] ksck improvements [6/n]: Refactor result handling
This patch refactors ksck so the results of the checks are collected in
a new KsckResults struct. Printing results is now a matter of iterating
over the KsckResults struct and formatting the information within it.
Furthermore, the KsckResults struct also serves as a programmatic access
point to the results of ksck, which can be used for other things like
rebalancing, auto-repair, or machine-readable printing.
There are also a couple of bonus changes:
- Added Ksck::Run and Ksck::RunAndPrintResults methods, which simplify
running all ksck checks and printing the results, as is done in the
ksck CLI tool and in ClusterVerifier.
- Added the changes from https://gerrit.cloudera.org/#/c/10054/, which were
about to be committed but would have needed some rework to fit
with this refactor.
Change-Id: Id8de619996b6cd753e6a9c01b1b60810a873e609
Reviewed-on: http://gerrit.cloudera.org:8080/10151
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Kudu Jenkins
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/0355d373
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/0355d373
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/0355d373
Branch: refs/heads/master
Commit: 0355d373a744b0109f56058cefc0c6dd0f862044
Parents: 235998b
Author: Will Berkeley <wd...@apache.org>
Authored: Sat Apr 21 22:16:23 2018 -0400
Committer: Will Berkeley <wd...@gmail.com>
Committed: Tue May 1 18:04:42 2018 +0000
----------------------------------------------------------------------
src/kudu/integration-tests/cluster_verifier.cc | 12 +-
src/kudu/tools/CMakeLists.txt | 1 +
src/kudu/tools/ksck-test.cc | 348 ++++++----
src/kudu/tools/ksck.cc | 713 ++++++--------------
src/kudu/tools/ksck.h | 200 +-----
src/kudu/tools/ksck_remote-test.cc | 1 +
src/kudu/tools/ksck_results.cc | 439 ++++++++++++
src/kudu/tools/ksck_results.h | 301 +++++++++
src/kudu/tools/tool_action_cluster.cc | 49 +-
src/kudu/tools/tool_action_tablet.cc | 2 +
10 files changed, 1191 insertions(+), 875 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/integration-tests/cluster_verifier.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_verifier.cc b/src/kudu/integration-tests/cluster_verifier.cc
index 407df06..8f54554 100644
--- a/src/kudu/integration-tests/cluster_verifier.cc
+++ b/src/kudu/integration-tests/cluster_verifier.cc
@@ -109,17 +109,7 @@ Status ClusterVerifier::RunKsck() {
// we shouldn't consider fatal.
ksck->set_check_replica_count(false);
- // The CheckMaster* calls are independent of CheckClusterRunning, though
- // their results are correlated. The subsequent calls depend on
- // CheckClusterRunning().
- RETURN_NOT_OK(ksck->CheckMasterHealth());
- RETURN_NOT_OK(ksck->CheckMasterConsensus());
- RETURN_NOT_OK(ksck->CheckClusterRunning());
- RETURN_NOT_OK(ksck->FetchTableAndTabletInfo());
- RETURN_NOT_OK(ksck->FetchInfoFromTabletServers());
- RETURN_NOT_OK(ksck->CheckTablesConsistency());
- RETURN_NOT_OK(ksck->ChecksumData(checksum_options_));
- return Status::OK();
+ return ksck->RunAndPrintResults();
}
void ClusterVerifier::CheckRowCount(const std::string& table_name,
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/tools/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/tools/CMakeLists.txt b/src/kudu/tools/CMakeLists.txt
index 0135994..ad47c8d 100644
--- a/src/kudu/tools/CMakeLists.txt
+++ b/src/kudu/tools/CMakeLists.txt
@@ -68,6 +68,7 @@ target_link_libraries(kudu_tools_util
add_library(ksck
ksck.cc
ksck_remote.cc
+ ksck_results.cc
)
target_link_libraries(ksck
consensus
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index c5f851e..f4011cc 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -37,13 +37,14 @@
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet.pb.h"
#include "kudu/tools/ksck.h"
+#include "kudu/tools/ksck_results.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
+DECLARE_bool(checksum_scan);
DECLARE_string(color);
-DECLARE_bool(consensus);
namespace kudu {
namespace tools {
@@ -196,20 +197,22 @@ class KsckTest : public KuduTest {
protected:
// Returns the expected summary for a table with the given tablet states.
std::string ExpectedKsckTableSummary(const string& table_name,
+ int replication_factor,
int healthy_tablets,
int recovering_tablets,
int underreplicated_tablets,
int consensus_mismatch_tablets,
int unavailable_tablets) {
- Ksck::TableSummary table_summary;
+ KsckTableSummary table_summary;
table_summary.name = table_name;
+ table_summary.replication_factor = replication_factor;
table_summary.healthy_tablets = healthy_tablets;
table_summary.recovering_tablets = recovering_tablets;
table_summary.underreplicated_tablets = underreplicated_tablets;
table_summary.consensus_mismatch_tablets = consensus_mismatch_tablets;
table_summary.unavailable_tablets = unavailable_tablets;
std::ostringstream oss;
- Ksck::PrintTableSummaries({ table_summary }, oss);
+ PrintTableSummaries({ table_summary }, oss);
return oss.str();
}
@@ -339,13 +342,7 @@ class KsckTest : public KuduTest {
auto c = MakeScopedCleanup([this]() {
LOG(INFO) << "Ksck output:\n" << err_stream_.str();
});
- RETURN_NOT_OK(ksck_->CheckMasterHealth());
- RETURN_NOT_OK(ksck_->CheckMasterConsensus());
- RETURN_NOT_OK(ksck_->CheckClusterRunning());
- RETURN_NOT_OK(ksck_->FetchTableAndTabletInfo());
- RETURN_NOT_OK(ksck_->FetchInfoFromTabletServers());
- RETURN_NOT_OK(ksck_->CheckTablesConsistency());
- return Status::OK();
+ return ksck_->RunAndPrintResults();
}
shared_ptr<MockKsckCluster> cluster_;
@@ -387,6 +384,8 @@ TEST_F(KsckTest, TestMasterUnavailable) {
master->fetch_info_status_ = Status::NetworkError("gremlins");
master->cstate_ = boost::none;
ASSERT_TRUE(ksck_->CheckMasterHealth().IsNetworkError());
+ ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+ ASSERT_OK(ksck_->PrintResults());
ASSERT_STR_CONTAINS(err_stream_.str(),
"Master Summary\n"
" UUID | Address | Status\n"
@@ -394,12 +393,12 @@ TEST_F(KsckTest, TestMasterUnavailable) {
" master-id-0 | master-0 | HEALTHY\n"
" master-id-2 | master-2 | HEALTHY\n"
" master-id-1 | master-1 | UNAVAILABLE\n");
- ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
ASSERT_STR_CONTAINS(err_stream_.str(),
- "WARNING: masters have consensus conflicts All reported masters are:\n"
+ "All reported replicas are:\n"
" A = master-id-0\n"
" B = master-id-1\n"
" C = master-id-2\n"
+ "The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
"---------------+------------------------+--------------+--------------+------------\n"
" A | A* B C | 0 | | Yes\n"
@@ -421,6 +420,8 @@ TEST_F(KsckTest, TestWrongMasterUuid) {
config->add_peers()->set_permanent_uuid(imposter_uuid);
ASSERT_OK(ksck_->CheckMasterHealth());
+ ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+ ASSERT_OK(ksck_->PrintResults());
ASSERT_STR_CONTAINS(err_stream_.str(),
"Master Summary\n"
" UUID | Address | Status\n"
@@ -428,13 +429,13 @@ TEST_F(KsckTest, TestWrongMasterUuid) {
" master-id-0 | master-0 | HEALTHY\n"
" master-id-1 | master-1 | HEALTHY\n"
" master-id-imposter | master-2 | HEALTHY\n");
- ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
ASSERT_STR_CONTAINS(err_stream_.str(),
- "WARNING: masters have consensus conflicts All reported masters are:\n"
+ "All reported replicas are:\n"
" A = master-id-0\n"
" B = master-id-1\n"
" C = master-id-imposter\n"
" D = master-id-2\n"
+ "The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
"---------------+------------------+--------------+--------------+------------\n"
" A | A* B D | 0 | | Yes\n"
@@ -449,11 +450,13 @@ TEST_F(KsckTest, TestTwoLeaderMasters) {
ASSERT_OK(ksck_->CheckMasterHealth());
ASSERT_TRUE(ksck_->CheckMasterConsensus().IsCorruption());
+ ASSERT_OK(ksck_->PrintResults());
ASSERT_STR_CONTAINS(err_stream_.str(),
- "WARNING: masters have consensus conflicts All reported masters are:\n"
+ "All reported replicas are:\n"
" A = master-id-0\n"
" B = master-id-1\n"
" C = master-id-2\n"
+ "The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
"---------------+--------------+--------------+--------------+------------\n"
" A | A* B C | 0 | | Yes\n"
@@ -479,6 +482,7 @@ TEST_F(KsckTest, TestWrongUUIDTabletServer) {
ASSERT_OK(ksck_->CheckClusterRunning());
ASSERT_OK(ksck_->FetchTableAndTabletInfo());
ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
+ ASSERT_OK(ksck_->PrintResults());
ASSERT_STR_CONTAINS(err_stream_.str(),
"Tablet Server Summary\n"
" UUID | Address | Status\n"
@@ -503,10 +507,18 @@ TEST_F(KsckTest, TestBadTabletServer) {
s = ksck_->CheckTablesConsistency();
EXPECT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_OK(ksck_->PrintResults());
ASSERT_STR_CONTAINS(
err_stream_.str(),
- "WARNING: Unable to connect to tablet server "
- "ts-id-1 (<mock>): Network error: Network failure");
+ "Tablet Server Summary\n"
+ " UUID | Address | Status\n"
+ "---------+---------+-------------\n"
+ " ts-id-0 | <mock> | HEALTHY\n"
+ " ts-id-2 | <mock> | HEALTHY\n"
+ " ts-id-1 | <mock> | UNAVAILABLE\n");
+ ASSERT_STR_CONTAINS(
+ err_stream_.str(),
+ "Error from <mock>: Network error: Network failure\n");
ASSERT_STR_CONTAINS(
err_stream_.str(),
"Tablet tablet-id-0 of table 'test' is under-replicated: 1 replica(s) not RUNNING\n"
@@ -529,60 +541,83 @@ TEST_F(KsckTest, TestBadTabletServer) {
TEST_F(KsckTest, TestOneTableCheck) {
CreateOneTableOneTablet();
+ FLAGS_checksum_scan = true;
ASSERT_OK(RunKsck());
- ASSERT_OK(ksck_->ChecksumData(ChecksumOptions()));
ASSERT_STR_CONTAINS(err_stream_.str(),
"0/1 replicas remaining (20B from disk, 10 rows summed)");
}
TEST_F(KsckTest, TestOneSmallReplicatedTable) {
CreateOneSmallReplicatedTable();
+ FLAGS_checksum_scan = true;
ASSERT_OK(RunKsck());
- ASSERT_OK(ksck_->ChecksumData(ChecksumOptions()));
ASSERT_STR_CONTAINS(err_stream_.str(),
"0/9 replicas remaining (180B from disk, 90 rows summed)");
+}
- // Test filtering (a non-matching pattern)
- err_stream_.str("");
+// Test filtering on a non-matching table pattern.
+TEST_F(KsckTest, TestNonMatchingTableFilter) {
+ CreateOneSmallReplicatedTable();
ksck_->set_table_filters({"xyz"});
- ASSERT_OK(RunKsck());
- Status s = ksck_->ChecksumData(ChecksumOptions());
- EXPECT_EQ("Not found: No table found. Filter: table_filters=xyz", s.ToString());
+ FLAGS_checksum_scan = true;
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ EXPECT_EQ("Not found: checksum scan error: No table found. Filter: table_filters=xyz",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"The cluster doesn't have any matching tables");
+}
- // Test filtering with a matching table pattern.
- err_stream_.str("");
+// Test filtering with a matching table pattern.
+TEST_F(KsckTest, TestMatchingTableFilter) {
+ CreateOneSmallReplicatedTable();
ksck_->set_table_filters({"te*"});
+ FLAGS_checksum_scan = true;
ASSERT_OK(RunKsck());
- ASSERT_OK(ksck_->ChecksumData(ChecksumOptions()));
ASSERT_STR_CONTAINS(err_stream_.str(),
"0/9 replicas remaining (180B from disk, 90 rows summed)");
+}
- // Test filtering with a matching tablet ID pattern.
- err_stream_.str("");
- ksck_->set_table_filters({});
+// Test filtering on a non-matching tablet id pattern.
+TEST_F(KsckTest, TestNonMatchingTabletIdFilter) {
+ CreateOneSmallReplicatedTable();
+ ksck_->set_tablet_id_filters({"xyz"});
+ FLAGS_checksum_scan = true;
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ EXPECT_EQ(
+ "Not found: checksum scan error: No tablet replicas found. Filter: tablet_id_filters=xyz",
+ error_messages[0].ToString());
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ "The cluster doesn't have any matching tablets");
+}
+
+// Test filtering with a matching tablet ID pattern.
+TEST_F(KsckTest, TestMatchingTabletIdFilter) {
+ CreateOneSmallReplicatedTable();
ksck_->set_tablet_id_filters({"*-id-2"});
+ FLAGS_checksum_scan = true;
ASSERT_OK(RunKsck());
- ASSERT_OK(ksck_->ChecksumData(ChecksumOptions()));
ASSERT_STR_CONTAINS(err_stream_.str(),
"0/3 replicas remaining (60B from disk, 30 rows summed)");
}
TEST_F(KsckTest, TestOneSmallReplicatedTableWithConsensusState) {
- FLAGS_consensus = true;
CreateOneSmallReplicatedTable();
ASSERT_OK(RunKsck());
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 3,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 3,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
- FLAGS_consensus = true;
CreateOneSmallReplicatedTable();
shared_ptr<KsckTabletServer> ts = FindOrDie(cluster_->tablet_servers_, "ts-id-0");
@@ -590,8 +625,11 @@ TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
std::make_pair("ts-id-0", "tablet-id-0"));
cstate.mutable_committed_config()->add_peers()->set_permanent_uuid("ts-id-fake");
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
@@ -600,16 +638,17 @@ TEST_F(KsckTest, TestConsensusConflictExtraPeer) {
" A | A* B C D | 0 | | Yes\n"
" B | A* B C | 0 | | Yes\n"
" C | A* B C | 0 | | Yes");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 1));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 1));
}
TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
- FLAGS_consensus = true;
CreateOneSmallReplicatedTable();
shared_ptr<KsckTabletServer> ts = FindOrDie(cluster_->tablet_servers_, "ts-id-0");
@@ -617,8 +656,11 @@ TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
std::make_pair("ts-id-0", "tablet-id-0"));
cstate.mutable_committed_config()->mutable_peers()->RemoveLast();
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
@@ -627,16 +669,17 @@ TEST_F(KsckTest, TestConsensusConflictMissingPeer) {
" A | A* B | 0 | | Yes\n"
" B | A* B C | 0 | | Yes\n"
" C | A* B C | 0 | | Yes");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 1));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 1));
}
TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
- FLAGS_consensus = true;
CreateOneSmallReplicatedTable();
const shared_ptr<KsckTabletServer>& ts = FindOrDie(cluster_->tablet_servers_, "ts-id-0");
@@ -644,8 +687,11 @@ TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
std::make_pair("ts-id-0", "tablet-id-0"));
cstate.set_leader_uuid("ts-id-1");
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
@@ -654,27 +700,34 @@ TEST_F(KsckTest, TestConsensusConflictDifferentLeader) {
" A | A B* C | 0 | | Yes\n"
" B | A* B C | 0 | | Yes\n"
" C | A* B C | 0 | | Yes");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 1));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 1));
}
TEST_F(KsckTest, TestOneOneTabletBrokenTable) {
CreateOneOneTabletReplicatedBrokenTable();
- Status s = RunKsck();
- EXPECT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"Tablet tablet-id-1 of table 'test' is under-replicated: "
"configuration has 2 replicas vs desired 3");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 0,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 1,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 0,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 1,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
TEST_F(KsckTest, TestMismatchedAssignments) {
@@ -683,27 +736,35 @@ TEST_F(KsckTest, TestMismatchedAssignments) {
cluster_->tablet_servers_.at(Substitute("ts-id-$0", 0)));
ASSERT_EQ(1, ts->tablet_status_map_.erase("tablet-id-2"));
- Status s = RunKsck();
- EXPECT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"Tablet tablet-id-2 of table 'test' is under-replicated: "
"1 replica(s) not RUNNING\n"
" ts-id-0 (<mock>): missing [LEADER]\n"
" ts-id-1 (<mock>): RUNNING\n"
" ts-id-2 (<mock>): RUNNING\n");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 1,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 1,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
TEST_F(KsckTest, TestTabletNotRunning) {
CreateOneSmallReplicatedTableWithTabletNotRunning();
- Status s = RunKsck();
- EXPECT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(
err_stream_.str(),
"Tablet tablet-id-0 of table 'test' is unavailable: 3 replica(s) not RUNNING\n"
@@ -719,12 +780,14 @@ TEST_F(KsckTest, TestTabletNotRunning) {
" State: FAILED\n"
" Data state: TABLET_DATA_UNKNOWN\n"
" Last status: \n");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 1));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 1));
}
TEST_F(KsckTest, TestTabletCopying) {
@@ -736,15 +799,19 @@ TEST_F(KsckTest, TestTabletCopying) {
cluster_->tablet_servers_.at(assignment_plan_.back()));
auto& pb = FindOrDie(not_running_ts->tablet_status_map_, "tablet-id-0");
pb.set_tablet_data_state(TabletDataState::TABLET_DATA_COPYING);
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
- ASSERT_STR_CONTAINS(err_stream_.str(), "Table test has 1 recovering tablet(s)");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 2,
- /*recovering_tablets=*/ 1,
- /*underreplicated_tablets=*/ 0,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 2,
+ /*recovering_tablets=*/ 1,
+ /*underreplicated_tablets=*/ 0,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
// Test for a bug where we weren't properly handling a tserver not reported by the master.
@@ -755,15 +822,19 @@ TEST_F(KsckTest, TestMasterNotReportingTabletServer) {
// where the master is starting and doesn't list all tablet servers yet, but
// tablets from other tablet servers are listing a missing tablet server as a peer.
EraseKeyReturnValuePtr(&cluster_->tablet_servers_, "ts-id-0");
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
- ASSERT_STR_CONTAINS(err_stream_.str(), "Table test has 3 under-replicated tablet(s)");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 0,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 3,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 0,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 3,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
// KUDU-2113: Test for a bug where we weren't properly handling a tserver not
@@ -780,9 +851,11 @@ TEST_F(KsckTest, TestMasterNotReportingTabletServerWithConsensusConflict) {
std::make_pair("ts-id-1", "tablet-id-1"));
cstate.set_leader_uuid("ts-id-1");
- Status s = RunKsck();
- ASSERT_EQ("Corruption: 1 out of 1 table(s) are not healthy", s.ToString());
- ASSERT_STR_CONTAINS(err_stream_.str(), "Table test has 3 under-replicated tablet(s)");
+ ASSERT_TRUE(RunKsck().IsRuntimeError());
+ const vector<Status>& error_messages = ksck_->results().error_messages;
+ ASSERT_EQ(1, error_messages.size());
+ ASSERT_EQ("Corruption: table consistency check error: 1 out of 1 table(s) are not healthy",
+ error_messages[0].ToString());
ASSERT_STR_CONTAINS(err_stream_.str(),
"The consensus matrix is:\n"
" Config source | Replicas | Current term | Config index | Committed?\n"
@@ -791,23 +864,27 @@ TEST_F(KsckTest, TestMasterNotReportingTabletServerWithConsensusConflict) {
" A | [config not available] | | | \n"
" B | A B* C | 0 | | Yes\n"
" C | A* B C | 0 | | Yes");
- ASSERT_STR_CONTAINS(err_stream_.str(), ExpectedKsckTableSummary("test",
- /*healthy_tablets=*/ 0,
- /*recovering_tablets=*/ 0,
- /*underreplicated_tablets=*/ 3,
- /*consensus_mismatch_tablets=*/ 0,
- /*unavailable_tablets=*/ 0));
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ ExpectedKsckTableSummary("test",
+ /*replication_factor=*/ 3,
+ /*healthy_tablets=*/ 0,
+ /*recovering_tablets=*/ 0,
+ /*underreplicated_tablets=*/ 3,
+ /*consensus_mismatch_tablets=*/ 0,
+ /*unavailable_tablets=*/ 0));
}
TEST_F(KsckTest, TestTableFiltersNoMatch) {
CreateOneSmallReplicatedTable();
ksck_->set_table_filters({ "fake-table" });
- Status s = RunKsck();
- // Every table we checked was healthy ;).
- ASSERT_OK(s);
+ // Every table we check is healthy ;).
+ ASSERT_OK(RunKsck());
ASSERT_STR_CONTAINS(err_stream_.str(), "The cluster doesn't have any matching tables");
+ ASSERT_STR_NOT_CONTAINS(err_stream_.str(),
+ " | Total Count\n"
+ "----------------+-------------\n");
}
TEST_F(KsckTest, TestTableFilters) {
@@ -815,30 +892,43 @@ TEST_F(KsckTest, TestTableFilters) {
CreateOneSmallReplicatedTable("other", "other-");
ksck_->set_table_filters({ "test" });
- Status s = RunKsck();
-
- ASSERT_OK(s);
- ASSERT_STR_CONTAINS(err_stream_.str(), "The metadata for 1 table(s) is HEALTHY");
+ ASSERT_OK(RunKsck());
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ " | Total Count\n"
+ "----------------+-------------\n"
+ " Masters | 3\n"
+ " Tablet Servers | 3\n"
+ " Tables | 1\n"
+ " Tablets | 3\n"
+ " Replicas | 9\n");
}
TEST_F(KsckTest, TestTabletFiltersNoMatch) {
CreateOneSmallReplicatedTable();
ksck_->set_tablet_id_filters({ "tablet-id-fake" });
- Status s = RunKsck();
- ASSERT_OK(s);
+ // Every tablet we check is healthy ;).
+ ASSERT_OK(RunKsck());
ASSERT_STR_CONTAINS(err_stream_.str(), "The cluster doesn't have any matching tablets");
+ ASSERT_STR_NOT_CONTAINS(err_stream_.str(),
+ " | Total Count\n"
+ "----------------+-------------\n");
}
TEST_F(KsckTest, TestTabletFilters) {
CreateOneSmallReplicatedTable();
ksck_->set_tablet_id_filters({ "tablet-id-0", "tablet-id-1" });
- Status s = RunKsck();
-
- ASSERT_OK(s);
- ASSERT_STR_CONTAINS(err_stream_.str(), "The metadata for 2 tablet(s) is HEALTHY");
+ ASSERT_OK(ksck_->RunAndPrintResults());
+ ASSERT_STR_CONTAINS(err_stream_.str(),
+ " | Total Count\n"
+ "----------------+-------------\n"
+ " Masters | 3\n"
+ " Tablet Servers | 3\n"
+ " Tables | 1\n"
+ " Tablets | 2\n"
+ " Replicas | 6\n");
}
} // namespace tools
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 78d02b2..191838b 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -24,9 +24,7 @@
#include <iterator>
#include <map>
#include <mutex>
-#include <numeric>
-#include <tuple>
-#include <type_traits>
+#include <set>
#include <vector>
#include <boost/optional.hpp> // IWYU pragma: keep
@@ -36,20 +34,30 @@
#include "kudu/consensus/quorum_util.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/strings/human_readable.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/strings/util.h"
#include "kudu/tablet/tablet.pb.h"
-#include "kudu/tools/tool_action_common.h"
#include "kudu/util/atomic.h"
#include "kudu/util/blocking_queue.h"
+#include "kudu/tools/color.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/locks.h"
#include "kudu/util/monotime.h"
#include "kudu/util/threadpool.h"
+// Evaluates 's' exactly once. If the resulting status is not OK, pushes
+// '_s.CloneAndPrepend(msg)' onto 'statuses'. The macro never returns from the
+// enclosing function, so the caller keeps running after a failure.
+//
+// The definition intentionally does NOT end in ';': the caller's own ';'
+// terminates the do/while, which keeps the macro usable as a single statement
+// inside an unbraced if/else. 'statuses' is parenthesized so that any
+// expression naming a container can be passed.
+#define PUSH_PREPEND_NOT_OK(s, statuses, msg) do { \
+ ::kudu::Status _s = (s); \
+ if (PREDICT_FALSE(!_s.ok())) { \
+ (statuses).push_back(_s.CloneAndPrepend(msg)); \
+ } \
+} while (0)
+
+DEFINE_bool(checksum_scan, false,
+ "Perform a checksum scan on data in the cluster.");
DEFINE_int32(checksum_timeout_sec, 3600,
"Maximum total seconds to wait for a checksum scan to complete "
"before timing out.");
@@ -99,35 +107,6 @@ bool MatchesAnyPattern(const vector<string>& patterns, const string& str) {
return false;
}
-// Return a formatted string version of 'config', mapping UUIDs to single-character
-// labels using the mapping 'label_by_uuid'.
-string format_replicas(const map<string, char>& label_by_uuid, const KsckConsensusState& config) {
- constexpr int kPeerWidth = 4;
- ostringstream result;
- // Sort the output by label for readability.
- std::set<std::pair<char, string>> labeled_replicas;
- for (const auto& entry : label_by_uuid) {
- labeled_replicas.emplace(entry.second, entry.first);
- }
- for (const auto &entry : labeled_replicas) {
- if (!ContainsKey(config.voter_uuids, entry.second) &&
- !ContainsKey(config.non_voter_uuids, entry.second)) {
- result << setw(kPeerWidth) << left << "";
- continue;
- }
- if (config.leader_uuid && config.leader_uuid == entry.second) {
- result << setw(kPeerWidth) << left << Substitute("$0*", entry.first);
- } else {
- if (ContainsKey(config.non_voter_uuids, entry.second)) {
- result << setw(kPeerWidth) << left << Substitute("$0~", entry.first);
- } else {
- result << setw(kPeerWidth) << left << Substitute("$0", entry.first);
- }
- }
- }
- return result.str();
-}
-
void BuildKsckConsensusStateForConfigMember(const consensus::ConsensusStatePB& cstate,
KsckConsensusState* ksck_cstate) {
CHECK(ksck_cstate);
@@ -156,19 +135,6 @@ void BuildKsckConsensusStateForConfigMember(const consensus::ConsensusStatePB& c
}
}
-void AddToUuidLabelMapping(const std::set<string>& uuids,
- map<string, char>* uuid_label_mapping) {
- CHECK(uuid_label_mapping);
- // TODO(wdberkeley): use a scheme that gives > 26 unique labels.
- const string labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
- int i = uuid_label_mapping->size() % labels.size();
- for (const auto& uuid : uuids) {
- if (InsertIfNotPresent(uuid_label_mapping, uuid, labels[i])) {
- i = (i + 1) % labels.size();
- }
- }
-}
-
} // anonymous namespace
ChecksumOptions::ChecksumOptions()
@@ -217,63 +183,33 @@ Ksck::Ksck(shared_ptr<KsckCluster> cluster, ostream* out)
out_(out == nullptr ? &std::cout : out) {
}
-string Ksck::ServerHealthToString(ServerHealth sh) {
- switch (sh) {
- case ServerHealth::HEALTHY:
- return "HEALTHY";
- case ServerHealth::UNAVAILABLE:
- return "UNAVAILABLE";
- case ServerHealth::WRONG_SERVER_UUID:
- return "WRONG_SERVER_UUID";
- default:
- LOG(FATAL) << "Unknown ServerHealth";
- }
-}
-
-int Ksck::ServerHealthScore(ServerHealth sh) {
- switch (sh) {
- case ServerHealth::HEALTHY:
- return 0;
- case ServerHealth::UNAVAILABLE:
- return 1;
- case ServerHealth::WRONG_SERVER_UUID:
- return 2;
- default:
- LOG(FATAL) << "Unknown ServerHealth";
- }
-}
-
Status Ksck::CheckMasterHealth() {
int bad_masters = 0;
- vector<ServerHealthSummary> master_summaries;
+ vector<KsckServerHealthSummary> master_summaries;
// There shouldn't be more than 5 masters, so we'll keep it simple and gather
// info in sequence instead of spreading it across a threadpool.
- for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) {
- Status s = master->FetchInfo();
- ServerHealthSummary sh;
+ for (const auto& master : cluster_->masters()) {
+ KsckServerHealthSummary sh;
+ Status s = master->FetchInfo().AndThen([&]() {
+ return master->FetchConsensusState();
+ });
sh.uuid = master->uuid();
sh.address = master->address();
- sh.health = s.ok() ? ServerHealth::HEALTHY : ServerHealth::UNAVAILABLE;
- master_summaries.emplace_back(std::move(sh));
+ sh.status = s;
if (!s.ok()) {
- Warn() << Substitute("Unable to connect to master $0: $1",
- master->ToString(), s.ToString()) << endl;
bad_masters++;
- } else if (FLAGS_consensus) {
- if (!master->FetchConsensusState().ok()) {
- Warn() << Substitute("Errors gathering consensus info for master $0: $1",
- master->ToString(), s.ToString()) << endl;
- }
+ sh.health = KsckServerHealth::UNAVAILABLE;
}
+ master_summaries.push_back(std::move(sh));
}
- CHECK_OK(PrintServerHealthSummaries(ServerType::MASTER, std::move(master_summaries), Out()));
+ results_.master_summaries.swap(master_summaries);
+
int num_masters = cluster_->masters().size();
if (bad_masters > 0) {
- Warn() << Substitute("Fetched info from $0 masters; $1 weren't reachable",
- num_masters, bad_masters) << endl;
- return Status::NetworkError("failed to gather info from all masters");
+ return Status::NetworkError(
+ Substitute("failed to gather info from all masters: $0 of $1 had errors",
+ bad_masters, num_masters));
}
- Out() << Substitute("Fetched info from all $0 masters", num_masters) << endl;
return Status::OK();
}
@@ -281,74 +217,36 @@ Status Ksck::CheckMasterConsensus() {
if (!FLAGS_consensus) {
return Status::OK();
}
- // There's no "reference" cstate for masters, so pick an arbitrary master
- // cstate to compare with.
- bool missing_or_conflict = false;
- map<string, KsckConsensusState> master_cstates;
+ KsckConsensusStateMap master_cstates;
for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) {
if (master->cstate()) {
KsckConsensusState ksck_cstate;
BuildKsckConsensusStateForConfigMember(*master->cstate(), &ksck_cstate);
InsertOrDie(&master_cstates, master->uuid(), ksck_cstate);
} else {
- missing_or_conflict = true;
+ results_.master_consensus_conflict = true;
}
}
if (master_cstates.empty()) {
return Status::NotFound("no master consensus state available");
}
+ // There's no "reference" cstate for masters, so pick an arbitrary master
+ // cstate to compare with.
const KsckConsensusState& base = master_cstates.begin()->second;
for (const auto& entry : master_cstates) {
if (!base.Matches(entry.second)) {
- missing_or_conflict = true;
+ results_.master_consensus_conflict = true;
break;
}
}
- if (missing_or_conflict || FLAGS_verbose) {
- // We need to make a consensus matrix for the masters now.
- if (missing_or_conflict) {
- Warn() << "masters have consensus conflicts";
- }
- map<string, char> replica_labels;
- for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) {
- AddToUuidLabelMapping({ master->uuid() }, &replica_labels);
- }
- // Master configs have no non-voters.
- for (const auto& entry : master_cstates) {
- AddToUuidLabelMapping(entry.second.voter_uuids, &replica_labels);
- }
- Out() << " All reported masters are:" << endl;
- // Sort the output by label for readability.
- std::set<std::pair<char, string>> reported_masters;
- for (const auto& entry : replica_labels) {
- reported_masters.emplace(entry.second, entry.first);
- }
- for (const auto& entry : reported_masters) {
- Out() << " " << entry.first << " = " << entry.second << endl;
- }
- DataTable cmatrix({ "Config source", "Replicas", "Current term",
- "Config index", "Committed?"});
- for (const KsckCluster::MasterList::value_type& master : cluster_->masters()) {
- const string label(1, FindOrDie(replica_labels, master->uuid()));
- if (master->cstate()) {
- const auto& cstate = master->cstate();
- const string opid_index_str = cstate->committed_config().has_opid_index() ?
- std::to_string(cstate->committed_config().opid_index()) :
- "";
- cmatrix.AddRow({ label,
- format_replicas(replica_labels,
- FindOrDie(master_cstates,
- master->uuid())),
- std::to_string(cstate->current_term()),
- opid_index_str,
- "Yes" });
- } else {
- cmatrix.AddRow({ label, "[config not available]", "", "", "" });
- }
- }
- RETURN_NOT_OK(cmatrix.PrintTo(Out()));
- }
- if (missing_or_conflict) {
+ results_.master_consensus_state_map.swap(master_cstates);
+ vector<string> uuids;
+ std::transform(cluster_->masters().begin(), cluster_->masters().end(),
+ std::back_inserter(uuids),
+ [](const shared_ptr<KsckMaster>& master) { return master->uuid(); });
+ results_.master_uuids.swap(uuids);
+
+ if (results_.master_consensus_conflict) {
return Status::Corruption("there are master consensus conflicts");
}
return Status::OK();
@@ -356,11 +254,7 @@ Status Ksck::CheckMasterConsensus() {
Status Ksck::CheckClusterRunning() {
VLOG(1) << "Connecting to the leader master";
- Status s = cluster_->Connect();
- if (s.ok()) {
- Out() << "Connected to the leader master" << endl;
- }
- return s;
+ return cluster_->Connect();
}
Status Ksck::FetchTableAndTabletInfo() {
@@ -384,24 +278,32 @@ Status Ksck::FetchInfoFromTabletServers() {
AtomicInt<int32_t> bad_servers(0);
VLOG(1) << "Fetching info from all " << servers_count << " tablet servers";
- vector<ServerHealthSummary> tablet_server_summaries;
+ vector<KsckServerHealthSummary> tablet_server_summaries;
simple_spinlock tablet_server_summaries_lock;
- for (const KsckCluster::TSMap::value_type& entry : cluster_->tablet_servers()) {
+ for (const auto& entry : cluster_->tablet_servers()) {
+ const auto& ts = entry.second;
CHECK_OK(pool->SubmitFunc([&]() {
- Status s = ConnectToTabletServer(entry.second);
- ServerHealthSummary summary;
+ VLOG(1) << "Going to connect to tablet server: " << ts->uuid();
+ Status s = ts->FetchInfo().AndThen([&ts]() {
+ if (FLAGS_consensus) {
+ return ts->FetchConsensusState();
+ }
+ return Status::OK();
+ });
+ KsckServerHealthSummary summary;
summary.uuid = entry.second->uuid();
summary.address = entry.second->address();
+ summary.status = s;
if (!s.ok()) {
bad_servers.Increment();
if (s.IsRemoteError()) {
- summary.health = ServerHealth::WRONG_SERVER_UUID;
+ summary.health = KsckServerHealth::WRONG_SERVER_UUID;
} else {
- summary.health = ServerHealth::UNAVAILABLE;
- }
+ summary.health = KsckServerHealth::UNAVAILABLE;
+ }
} else {
- summary.health = ServerHealth::HEALTHY;
+ summary.health = KsckServerHealth::HEALTHY;
}
std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock);
@@ -410,142 +312,84 @@ Status Ksck::FetchInfoFromTabletServers() {
}
pool->Wait();
- CHECK_OK(PrintServerHealthSummaries(ServerType::TABLET_SERVER,
- std::move(tablet_server_summaries),
- Out()));
+ results_.tserver_summaries.swap(tablet_server_summaries);
if (bad_servers.Load() == 0) {
- Out() << Substitute("Fetched info from all $0 tablet servers", servers_count) << endl;
return Status::OK();
}
- Warn() << Substitute("Fetched info from $0 tablet servers, $1 weren't reachable",
- servers_count - bad_servers.Load(), bad_servers.Load()) << endl;
- return Status::NetworkError("Could not gather complete information from all tablet servers");
+ return Status::NetworkError(
+ Substitute("failed to gather info for all tablet servers: $0 of $1 had errors",
+ bad_servers.Load(), servers_count));
}
-Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
- VLOG(1) << "Going to connect to tablet server: " << ts->uuid();
- Status s = ts->FetchInfo();
- if (!s.ok()) {
- Warn() << Substitute("Unable to connect to tablet server $0: $1",
- ts->ToString(), s.ToString()) << endl;
- return s;
+const KsckResults& Ksck::results() const {
+ return results_;
+}
+
+Status Ksck::Run() {
+ PUSH_PREPEND_NOT_OK(CheckMasterHealth(), results_.error_messages,
+ "error fetching info from masters");
+ PUSH_PREPEND_NOT_OK(CheckMasterConsensus(), results_.error_messages,
+ "master consensus error");
+
+ // CheckClusterRunning and FetchTableAndTabletInfo must succeed for
+ // subsequent checks to be runnable.
+ const char* const liveness_prefix = "leader master liveness check error";
+ Status s = CheckClusterRunning();
+ PUSH_PREPEND_NOT_OK(s, results_.error_messages, liveness_prefix);
+ RETURN_NOT_OK_PREPEND(s, liveness_prefix);
+ const char* const fetch_prefix = "error fetching the cluster metadata "
+ "from the leader master";
+ s = FetchTableAndTabletInfo();
+ PUSH_PREPEND_NOT_OK(s, results_.error_messages, fetch_prefix);
+ RETURN_NOT_OK_PREPEND(s, fetch_prefix);
+
+ PUSH_PREPEND_NOT_OK(FetchInfoFromTabletServers(), results_.error_messages,
+ "error fetching info from tablet servers");
+
+ PUSH_PREPEND_NOT_OK(CheckTablesConsistency(), results_.error_messages,
+ "table consistency check error");
+
+ if (FLAGS_checksum_scan) {
+ PUSH_PREPEND_NOT_OK(ChecksumData(ChecksumOptions()),
+ results_.error_messages, "checksum scan error");
}
- VLOG(1) << "Connected to tablet server: " << ts->uuid();
- if (FLAGS_consensus) {
- s = ts->FetchConsensusState();
- if (!s.ok()) {
- Warn() << Substitute("Errors gathering consensus info for tablet server $0: $1",
- ts->ToString(), s.ToString()) << endl;
- }
+
+ if (!results_.error_messages.empty()) {
+ return Status::RuntimeError("ksck discovered errors");
}
- return s;
+ return Status::OK();
}
-Status Ksck::PrintServerHealthSummaries(ServerType type,
- vector<ServerHealthSummary> summaries,
- ostream& out) {
- // Sort by (health decreasing, uuid, address), so bad health appears
- // closest to the bottom of the output in a terminal.
- // The address is used in the sort for the unavailable master case, because
- // we do not know the uuid in that case.
- std::sort(summaries.begin(), summaries.end(),
- [](const ServerHealthSummary& left, const ServerHealthSummary& right) {
- return std::make_tuple(ServerHealthScore(left.health), left.uuid, left.address) <
- std::make_tuple(ServerHealthScore(right.health), right.uuid, right.address);
- });
- out << ServerTypeToString(type) << " Summary" << endl;
- DataTable table({ "UUID", "Address", "Status"});
- for (const auto& s : summaries) {
- table.AddRow({ s.uuid, s.address, ServerHealthToString(s.health) });
- }
- return table.PrintTo(out);
+Status Ksck::PrintResults() {
+ PrintMode mode = FLAGS_verbose ? PrintMode::VERBOSE : PrintMode::DEFAULT;
+ return results_.PrintTo(mode, *out_);
}
-Status Ksck::PrintTableSummaries(const vector<TableSummary>& table_summaries, ostream& out) {
- out << "Summary by table" << endl;
- DataTable table({ "Name", "Status", "Total Tablets",
- "Healthy", "Recovering", "Under-replicated", "Unavailable"});
- for (const TableSummary& ts : table_summaries) {
- string status;
- switch (ts.TableStatus()) {
- case CheckResult::HEALTHY:
- status = "HEALTHY";
- break;
- case CheckResult::RECOVERING:
- status = "RECOVERING";
- break;
- case CheckResult::UNDER_REPLICATED:
- status = "UNDER-REPLICATED";
- break;
- default:
- status = "UNAVAILABLE";
- break;
- }
- table.AddRow({ ts.name, status, to_string(ts.TotalTablets()),
- to_string(ts.healthy_tablets), to_string(ts.recovering_tablets),
- to_string(ts.underreplicated_tablets),
- to_string(ts.consensus_mismatch_tablets + ts.unavailable_tablets) });
- }
- return table.PrintTo(out);
+Status Ksck::RunAndPrintResults() {
+ Status s = Run();
+ RETURN_NOT_OK_PREPEND(PrintResults(), "error printing results");
+ return s;
}
Status Ksck::CheckTablesConsistency() {
int bad_tables_count = 0;
- int bad_tablets_count = 0;
- int total_tablets = 0;
- vector<TableSummary> table_summaries;
for (const shared_ptr<KsckTable> &table : cluster_->tables()) {
if (!MatchesAnyPattern(table_filters_, table->name())) {
VLOG(1) << "Skipping table " << table->name();
continue;
}
- TableSummary ts;
- ts.name = table->name();
- if (!VerifyTable(table, &ts)) {
+ if (!VerifyTable(table)) {
bad_tables_count++;
- bad_tablets_count += ts.UnhealthyTablets();
- }
- // If the summary has no tablets (because of tablet id filters), don't
- // save the summary.
- if (ts.TotalTablets() > 0) {
- total_tablets += ts.TotalTablets();
- table_summaries.emplace_back(std::move(ts));
}
}
- if (table_summaries.empty()) {
- const string tables_or_tablets = tablet_id_filters_.empty() ? "tables" : "tablets";
- Out() << "The cluster doesn't have any matching " << tables_or_tablets << endl;
- return Status::OK();
- }
-
- // Show unhealthy tables at the bottom so they're easier to see;
- // otherwise sort alphabetically.
- std::sort(table_summaries.begin(), table_summaries.end(),
- [](const TableSummary& left, const TableSummary& right) {
- return std::make_pair(left.TableStatus() != CheckResult::HEALTHY, left.name) <
- std::make_pair(right.TableStatus() != CheckResult::HEALTHY, right.name);
- });
- CHECK_OK(PrintTableSummaries(table_summaries, Out()));
-
- if (bad_tables_count == 0) {
- if (tablet_id_filters_.empty()) {
- Out() << Substitute("The metadata for $0 table(s) is HEALTHY",
- table_summaries.size()) << endl;
- } else {
- Out() << Substitute("The metadata for $0 tablet(s) is HEALTHY",
- total_tablets) << endl;
- }
- return Status::OK();
- }
- if (tablet_id_filters_.empty()) {
- return Status::Corruption(Substitute("$0 out of $1 table(s) are not healthy",
- bad_tables_count, table_summaries.size()));
- } else {
- return Status::Corruption(Substitute("$0 out of $1 tablet(s) are not healthy",
- bad_tablets_count, total_tablets));
+ if (bad_tables_count > 0) {
+ return Status::Corruption(
+ Substitute("$0 out of $1 table(s) are not healthy",
+ bad_tables_count, results_.table_summaries.size()));
}
+ return Status::OK();
}
// Class to act as a collector of scan results.
@@ -763,7 +607,7 @@ Status Ksck::ChecksumData(const ChecksumOptions& opts) {
return Status::ServiceUnavailable(
"No tablet servers were available to fetch the current timestamp");
}
- Out() << "Using snapshot timestamp: " << options.snapshot_timestamp << endl;
+ results_.checksum_results.snapshot_timestamp = options.snapshot_timestamp;
}
// Kick off checksum scans in parallel. For each tablet server, we start
@@ -788,53 +632,55 @@ Status Ksck::ChecksumData(const ChecksumOptions& opts) {
bool timed_out = !reporter->WaitFor(options.timeout, out_);
- // Even if we timed out, print the checksum results that we did get.
+ // Even if we timed out, collate the checksum results that we did get so they can be printed.
ChecksumResultReporter::TabletResultMap checksums = reporter->checksums();
int num_errors = 0;
int num_mismatches = 0;
int num_results = 0;
+ KsckTableChecksumMap checksum_tables;
for (const shared_ptr<KsckTable>& table : cluster_->tables()) {
- bool printed_table_name = false;
+ KsckTableChecksum table_checksum;
for (const shared_ptr<KsckTablet>& tablet : table->tablets()) {
if (ContainsKey(checksums, tablet->id())) {
- if (!printed_table_name) {
- printed_table_name = true;
- cout << "-----------------------" << endl;
- cout << table->name() << endl;
- cout << "-----------------------" << endl;
- }
+ KsckTabletChecksum tablet_checksum;
+ tablet_checksum.tablet_id = tablet->id();
bool seen_first_replica = false;
uint64_t first_checksum = 0;
- for (const ChecksumResultReporter::ReplicaResultMap::value_type& r :
- FindOrDie(checksums, tablet->id())) {
+ for (const auto& r : FindOrDie(checksums, tablet->id())) {
+ KsckReplicaChecksum replica_checksum;
const string& replica_uuid = r.first;
+ shared_ptr<KsckTabletServer> ts = FindOrDie(cluster_->tablet_servers(), replica_uuid);
+ replica_checksum.ts_uuid = ts->uuid();
+ replica_checksum.ts_address = ts->address();
- const auto& ts = FindOrDie(cluster_->tablet_servers(), replica_uuid);
const ChecksumResultReporter::ResultPair& result = r.second;
const Status& status = result.first;
- uint64_t checksum = result.second;
- string status_str = (status.ok()) ? Substitute("Checksum: $0", checksum)
- : Substitute("Error: $0", status.ToString());
- cout << Substitute("T $0 P $1 ($2): $3", tablet->id(), ts->uuid(), ts->address(),
- status_str) << endl;
+ replica_checksum.checksum = result.second;
+ replica_checksum.status = status;
if (!status.ok()) {
num_errors++;
} else if (!seen_first_replica) {
seen_first_replica = true;
- first_checksum = checksum;
- } else if (checksum != first_checksum) {
+ first_checksum = replica_checksum.checksum;
+ } else if (replica_checksum.checksum != first_checksum) {
num_mismatches++;
- Error() << ">> Mismatch found in table " << table->name()
- << " tablet " << tablet->id() << endl;
+ tablet_checksum.mismatch = true;
}
num_results++;
+ InsertOrDie(&tablet_checksum.replica_checksums,
+ replica_checksum.ts_uuid,
+ std::move(replica_checksum));
}
+ InsertOrDie(&table_checksum,
+ tablet_checksum.tablet_id,
+ std::move(tablet_checksum));
}
}
- if (printed_table_name) cout << endl;
+ InsertOrDie(&checksum_tables, table->name(), std::move(table_checksum));
}
+ results_.checksum_results.tables.swap(checksum_tables);
if (timed_out) {
return Status::TimedOut(Substitute("Checksum scan did not complete within the timeout of $0: "
"Received results for $1 out of $2 expected replicas",
@@ -855,7 +701,7 @@ Status Ksck::ChecksumData(const ChecksumOptions& opts) {
return Status::OK();
}
-bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table, TableSummary* ts) {
+bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table) {
const auto all_tablets = table->tablets();
vector<shared_ptr<KsckTablet>> tablets;
std::copy_if(all_tablets.begin(), all_tablets.end(), std::back_inserter(tablets),
@@ -869,88 +715,43 @@ bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table, TableSummary* ts) {
return true;
}
- int table_num_replicas = table->num_replicas();
+ KsckTableSummary ts;
+ ts.replication_factor = table->num_replicas();
VLOG(1) << Substitute("Verifying $0 tablet(s) for table $1 configured with num_replicas = $2",
- tablets.size(), table->name(), table_num_replicas);
+ tablets.size(), table->name(), table->num_replicas());
+ ts.name = table->name();
for (const auto& tablet : tablets) {
- auto tablet_result = VerifyTablet(tablet, table_num_replicas);
+ auto tablet_result = VerifyTablet(tablet, table->num_replicas());
switch (tablet_result) {
- case CheckResult::HEALTHY:
- ts->healthy_tablets++;
+ case KsckCheckResult::HEALTHY:
+ ts.healthy_tablets++;
break;
- case CheckResult::RECOVERING:
- ts->recovering_tablets++;
+ case KsckCheckResult::RECOVERING:
+ ts.recovering_tablets++;
break;
- case CheckResult::UNDER_REPLICATED:
- ts->underreplicated_tablets++;
+ case KsckCheckResult::UNDER_REPLICATED:
+ ts.underreplicated_tablets++;
break;
- case CheckResult::CONSENSUS_MISMATCH:
- ts->consensus_mismatch_tablets++;
+ case KsckCheckResult::CONSENSUS_MISMATCH:
+ ts.consensus_mismatch_tablets++;
break;
- case CheckResult::UNAVAILABLE:
- ts->unavailable_tablets++;
+ case KsckCheckResult::UNAVAILABLE:
+ ts.unavailable_tablets++;
break;
}
}
- // If running with tablet id filters, we're not checking the health of whole
- // tables, so don't output conclusions about whole tables.
- if (!tablet_id_filters_.empty()) {
- return ts->healthy_tablets == tablets.size();
- }
- if (ts->healthy_tablets == tablets.size()) {
- Out() << Substitute("Table $0 is $1 ($2 tablet(s) checked)",
- table->name(),
- Color(AnsiCode::GREEN, "HEALTHY"),
- tablets.size()) << endl << endl;
- return true;
- }
- if (ts->recovering_tablets > 0) {
- Out() << Substitute("Table $0 has $1 $2 tablet(s)",
- table->name(),
- ts->recovering_tablets,
- Color(AnsiCode::YELLOW, "recovering")) << endl;
+ bool all_healthy = ts.healthy_tablets == ts.TotalTablets();
+ if (ts.TotalTablets() > 0) {
+ results_.table_summaries.push_back(std::move(ts));
}
- if (ts->underreplicated_tablets > 0) {
- Out() << Substitute("Table $0 has $1 $2 tablet(s)",
- table->name(),
- ts->underreplicated_tablets,
- Color(AnsiCode::YELLOW, "under-replicated")) << endl;
- }
- if (ts->consensus_mismatch_tablets > 0) {
- Out() << Substitute("Table $0 has $1 tablet(s) $2",
- table->name(),
- ts->consensus_mismatch_tablets,
- Color(AnsiCode::YELLOW, "with mismatched consensus")) << endl;
- }
- if (ts->unavailable_tablets > 0) {
- Out() << Substitute("Table $0 has $1 $2 tablet(s)",
- table->name(),
- ts->unavailable_tablets,
- Color(AnsiCode::RED, "unavailable")) << endl;
- }
- // Empty line for spacing.
- Out() << endl;
- return false;
+ return all_healthy;
}
-namespace {
-
-// A struct consolidating the state of each replica, for easier analysis.
-struct ReplicaInfo {
- KsckTabletReplica* replica;
- KsckTabletServer* ts = nullptr;
- tablet::TabletStatePB state = tablet::UNKNOWN;
- boost::optional<tablet::TabletStatusPB> status_pb;
- boost::optional<KsckConsensusState> consensus_state;
-};
-
-} // anonymous namespace
-
-Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int table_num_replicas) {
+KsckCheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet,
+ int table_num_replicas) {
const string tablet_str = Substitute("Tablet $0 of table '$1'",
tablet->id(), tablet->table()->name());
- // Organize consensus info for the master.
auto leader_it = std::find_if(tablet->replicas().cbegin(), tablet->replicas().cend(),
[](const shared_ptr<KsckTabletReplica>& r) -> bool { return r->is_leader(); });
boost::optional<string> leader_uuid;
@@ -972,218 +773,114 @@ Ksck::CheckResult Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int t
leader_uuid,
voter_uuids_from_master,
non_voter_uuids_from_master);
- vector<ReplicaInfo> replica_infos;
+
+ int leaders_count = 0;
+ int running_voters_count = 0;
+ int copying_replicas_count = 0;
+ int conflicting_states = 0;
+ int num_voters = 0;
+ vector<KsckReplicaSummary> replica_infos;
for (const shared_ptr<KsckTabletReplica>& replica : tablet->replicas()) {
replica_infos.emplace_back();
auto* repl_info = &replica_infos.back();
- repl_info->replica = replica.get();
+ repl_info->ts_uuid = replica->ts_uuid();
VLOG(1) << Substitute("A replica of tablet $0 is on live tablet server $1",
tablet->id(), replica->ts_uuid());
// Check for agreement on tablet assignment and state between the master
// and the tablet server.
auto ts = FindPointeeOrNull(cluster_->tablet_servers(), replica->ts_uuid());
- repl_info->ts = ts;
+ if (ts) {
+ repl_info->ts_address = ts->address();
+ }
if (ts && ts->is_healthy()) {
+ repl_info->ts_healthy = true;
repl_info->state = ts->ReplicaState(tablet->id());
if (ContainsKey(ts->tablet_status_map(), tablet->id())) {
repl_info->status_pb = ts->tablet_status_map().at(tablet->id());
}
// Organize consensus info for each replica.
- if (FLAGS_consensus) {
- std::pair<string, string> tablet_key = std::make_pair(ts->uuid(), tablet->id());
- if (!ContainsKey(ts->tablet_consensus_state_map(), tablet_key)) {
- continue;
- }
+ std::pair<string, string> tablet_key = std::make_pair(ts->uuid(), tablet->id());
+ if (ContainsKey(ts->tablet_consensus_state_map(), tablet_key)) {
const auto& cstate = FindOrDieNoPrint(ts->tablet_consensus_state_map(), tablet_key);
KsckConsensusState ksck_cstate;
BuildKsckConsensusStateForConfigMember(cstate, &ksck_cstate);
repl_info->consensus_state = std::move(ksck_cstate);
}
}
- }
- // Summarize the states.
- int leaders_count = 0;
- int running_voters_count = 0;
- int copying_replicas_count = 0;
- for (const auto& r : replica_infos) {
- if (r.replica->is_leader()) {
+ repl_info->is_leader = replica->is_leader();
+ repl_info->is_voter = replica->is_voter();
+ num_voters += replica->is_voter() ? 1 : 0;
+ if (replica->is_leader()) {
leaders_count++;
}
- if (r.state == tablet::RUNNING && r.replica->is_voter()) {
+ if (repl_info->state == tablet::RUNNING && replica->is_voter()) {
running_voters_count++;
- } else if (r.status_pb && r.status_pb->tablet_data_state() == tablet::TABLET_DATA_COPYING) {
+ } else if (repl_info->status_pb &&
+ repl_info->status_pb->tablet_data_state() == tablet::TABLET_DATA_COPYING) {
copying_replicas_count++;
}
- }
-
- // Reconcile the master's and peers' consensus configs.
- int conflicting_states = 0;
- if (FLAGS_consensus) {
+ // Compare the master's and peers' consensus configs.
for (const auto& r : replica_infos) {
if (r.consensus_state && !r.consensus_state->Matches(master_config)) {
conflicting_states++;
}
}
}
- std::sort(replica_infos.begin(), replica_infos.end(),
- [](const ReplicaInfo& left, const ReplicaInfo& right) -> bool {
- if (!left.ts) return true;
- if (!right.ts) return false;
- return left.ts->uuid() < right.ts->uuid();
- });
// Determine the overall health state of the tablet.
- CheckResult result = CheckResult::HEALTHY;
- int num_voters = std::accumulate(replica_infos.begin(), replica_infos.end(),
- 0, [](int sum, const ReplicaInfo& info) {
- return sum + (info.replica->is_voter() ? 1 : 0);
- });
+ KsckCheckResult result = KsckCheckResult::HEALTHY;
+ string status;
int majority_size = consensus::MajoritySize(num_voters);
if (copying_replicas_count > 0) {
- Out() << Substitute("$0 is $1: $2 on-going tablet copies",
+ result = KsckCheckResult::RECOVERING;
+ status = Substitute("$0 is $1: $2 on-going tablet copies",
tablet_str,
Color(AnsiCode::YELLOW, "recovering"),
- copying_replicas_count) << endl;
- result = CheckResult::RECOVERING;
+ copying_replicas_count);
} else if (running_voters_count < majority_size) {
- Out() << Substitute("$0 is $1: $2 replica(s) not RUNNING",
+ result = KsckCheckResult::UNAVAILABLE;
+ status = Substitute("$0 is $1: $2 replica(s) not RUNNING",
tablet_str,
Color(AnsiCode::RED, "unavailable"),
- num_voters - running_voters_count) << endl;
- result = CheckResult::UNAVAILABLE;
+ num_voters - running_voters_count);
} else if (running_voters_count < num_voters) {
- Out() << Substitute("$0 is $1: $2 replica(s) not RUNNING",
+ result = KsckCheckResult::UNDER_REPLICATED;
+ status = Substitute("$0 is $1: $2 replica(s) not RUNNING",
tablet_str,
Color(AnsiCode::YELLOW, "under-replicated"),
- num_voters - running_voters_count) << endl;
- result = CheckResult::UNDER_REPLICATED;
+ num_voters - running_voters_count);
} else if (check_replica_count_ && num_voters < table_num_replicas) {
- Out() << Substitute("$0 is $1: configuration has $2 replicas vs desired $3",
+ result = KsckCheckResult::UNDER_REPLICATED;
+ status = Substitute("$0 is $1: configuration has $2 replicas vs desired $3",
tablet_str,
Color(AnsiCode::YELLOW, "under-replicated"),
num_voters,
- table_num_replicas) << endl;
- result = CheckResult::UNDER_REPLICATED;
+ table_num_replicas);
} else if (leaders_count != 1) {
- Out() << Substitute("$0 is $1: expected one LEADER replica",
- tablet_str, Color(AnsiCode::RED, "unavailable")) << endl;
- result = CheckResult::UNAVAILABLE;
+ result = KsckCheckResult::UNAVAILABLE;
+ status = Substitute("$0 is $1: expected one LEADER replica",
+ tablet_str, Color(AnsiCode::RED, "unavailable"));
} else if (conflicting_states > 0) {
- Out() << Substitute("$0 is $1: $0 replicas' active configs disagree with the master's",
+ result = KsckCheckResult::CONSENSUS_MISMATCH;
+ status = Substitute("$0 is $1: $0 replicas' active configs disagree with the master's",
tablet_str,
Color(AnsiCode::YELLOW, "conflicted"),
- conflicting_states) << endl;
- result = CheckResult::CONSENSUS_MISMATCH;
- } else if (FLAGS_verbose) {
- // The tablet is healthy. Only print if verbose mode is on.
- Out() << Substitute("$0 is $1.",
+ conflicting_states);
+ } else {
+ status = Substitute("$0 is $1.",
tablet_str,
- Color(AnsiCode::GREEN, "healthy")) << endl;
- }
-
- // In the case that we found something wrong, dump info on all the replicas
- // to make it easy to debug. Also, do that if verbose output is requested.
- if (result != CheckResult::HEALTHY || FLAGS_verbose) {
- for (const ReplicaInfo& r : replica_infos) {
- string ts_str = r.ts ? r.ts->ToString() : r.replica->ts_uuid();
- const char* spec_str = r.replica->is_leader()
- ? " [LEADER]" : (!r.replica->is_voter() ? " [NONVOTER]" : "");
-
- Out() << " " << ts_str << ": ";
- if (!r.ts || !r.ts->is_healthy()) {
- Out() << Color(AnsiCode::YELLOW, "TS unavailable") << spec_str << endl;
- continue;
- }
- if (r.state == tablet::RUNNING) {
- Out() << Color(AnsiCode::GREEN, "RUNNING") << spec_str << endl;
- continue;
- }
- if (r.status_pb == boost::none) {
- Out() << Color(AnsiCode::YELLOW, "missing") << spec_str << endl;
- continue;
- }
-
- Out() << Color(AnsiCode::YELLOW, "not running") << spec_str << endl;
- Out() << Substitute(
- " State: $0\n"
- " Data state: $1\n"
- " Last status: $2\n",
- Color(AnsiCode::BLUE, tablet::TabletStatePB_Name(r.state)),
- Color(AnsiCode::BLUE, tablet::TabletDataState_Name(r.status_pb->tablet_data_state())),
- Color(AnsiCode::BLUE, r.status_pb->last_status()));
- }
- }
-
- // If there are consensus conflicts, dump the consensus info too. Do that also
- // if verbose output is requested and consensus is on.
- if (conflicting_states > 0 || (FLAGS_verbose && FLAGS_consensus)) {
- if (result != CheckResult::CONSENSUS_MISMATCH && result != CheckResult::HEALTHY) {
- Out() << Substitute("$0 replicas' active configs differ from the master's.",
- conflicting_states)
- << endl;
- }
- if (result == CheckResult::HEALTHY) {
- Out() << Substitute("All replicas active configs agree with the master's.") << endl;
- }
- map<string, char> replica_uuid_mapping;
- AddToUuidLabelMapping(master_config.voter_uuids, &replica_uuid_mapping);
- AddToUuidLabelMapping(master_config.non_voter_uuids, &replica_uuid_mapping);
- for (const ReplicaInfo& rs : replica_infos) {
- if (!rs.consensus_state) continue;
- AddToUuidLabelMapping(rs.consensus_state->voter_uuids, &replica_uuid_mapping);
- AddToUuidLabelMapping(rs.consensus_state->non_voter_uuids, &replica_uuid_mapping);
- }
-
- Out() << " All the peers reported by the master and tablet servers are:" << endl;
- for (const auto& entry : replica_uuid_mapping) {
- Out() << " " << entry.second << " = " << entry.first << endl;
- }
- Out() << "The consensus matrix is:" << endl;
-
- // Prepare the header and columns for PrintTable.
- DataTable table({});
-
- // Seed the columns with the master info.
- vector<string> sources{"master"};
- vector<string> replicas{format_replicas(replica_uuid_mapping, master_config)};
- vector<string> terms{""};
- vector<string> indexes{""};
- vector<string> committed{"Yes"};
-
- // Fill out the columns with info from the replicas.
- for (const auto& replica_info : replica_infos) {
- char label = FindOrDie(replica_uuid_mapping, replica_info.replica->ts_uuid());
- sources.emplace_back(1, label);
- if (!replica_info.consensus_state) {
- replicas.emplace_back("[config not available]");
- terms.emplace_back("");
- indexes.emplace_back("");
- committed.emplace_back("");
- continue;
- }
- replicas.push_back(format_replicas(replica_uuid_mapping, replica_info.consensus_state.get()));
- terms.push_back(replica_info.consensus_state->term ?
- std::to_string(replica_info.consensus_state->term.get()) : "");
- indexes.push_back(replica_info.consensus_state->opid_index ?
- std::to_string(replica_info.consensus_state->opid_index.get()) : "");
- committed.emplace_back(
- replica_info.consensus_state->type == KsckConsensusConfigType::PENDING ? "No" : "Yes");
- }
- table.AddColumn("Config source", std::move(sources));
- table.AddColumn("Replicas", std::move(replicas));
- table.AddColumn("Current term", std::move(terms));
- table.AddColumn("Config index", std::move(indexes));
- table.AddColumn("Committed?", std::move(committed));
- CHECK_OK(table.PrintTo(Out()));
+ Color(AnsiCode::GREEN, "healthy"));
}
- // Information is printed only if the tablet isn't healthy or verbose mode is on.
- if (result != CheckResult::HEALTHY || FLAGS_verbose) {
- Out() << endl;
- }
+ KsckTabletSummary tablet_summary;
+ tablet_summary.result = result;
+ tablet_summary.status = status;
+ tablet_summary.master_cstate = std::move(master_config);
+ tablet_summary.replica_infos.swap(replica_infos);
+ results_.tablet_summaries.push_back(std::move(tablet_summary));
return result;
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 5bbb6c5..edae81b 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -24,8 +24,6 @@
#include <iosfwd>
#include <map>
#include <memory>
-#include <ostream>
-#include <set>
#include <string>
#include <unordered_map>
#include <utility>
@@ -41,7 +39,7 @@
#include "kudu/gutil/strings/substitute.h"
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet.pb.h" // IWYU pragma: keep
-#include "kudu/tools/color.h"
+#include "kudu/tools/ksck_results.h"
#include "kudu/util/monotime.h"
#include "kudu/util/status.h"
@@ -106,60 +104,9 @@ class KsckTabletReplica {
DISALLOW_COPY_AND_ASSIGN(KsckTabletReplica);
};
-// Possible types of consensus configs.
-enum class KsckConsensusConfigType {
- // A config reported by the master.
- MASTER,
- // A config that has been committed.
- COMMITTED,
- // A config that has not yet been committed.
- PENDING,
-};
-
-// Representation of a consensus state.
-struct KsckConsensusState {
- KsckConsensusState() = default;
- KsckConsensusState(KsckConsensusConfigType type,
- boost::optional<int64_t> term,
- boost::optional<int64_t> opid_index,
- boost::optional<std::string> leader_uuid,
- const std::vector<std::string>& voters,
- const std::vector<std::string>& non_voters)
- : type(type),
- term(std::move(term)),
- opid_index(std::move(opid_index)),
- leader_uuid(std::move(leader_uuid)),
- voter_uuids(voters.cbegin(), voters.cend()),
- non_voter_uuids(non_voters.cbegin(), non_voters.cend()) {
- }
-
- // Two KsckConsensusState structs match if they have the same
- // leader_uuid, the same set of peers, and one of the following holds:
- // - at least one of them is of type MASTER
- // - they are configs of the same type and they have the same term
- bool Matches(const KsckConsensusState &other) const {
- bool same_leader_and_peers =
- leader_uuid == other.leader_uuid &&
- voter_uuids == other.voter_uuids &&
- non_voter_uuids == other.non_voter_uuids;
- if (type == KsckConsensusConfigType::MASTER || other.type == KsckConsensusConfigType::MASTER) {
- return same_leader_and_peers;
- }
- return type == other.type && term == other.term && same_leader_and_peers;
- }
-
- KsckConsensusConfigType type;
- boost::optional<int64_t> term;
- boost::optional<int64_t> opid_index;
- boost::optional<std::string> leader_uuid;
- std::set<std::string> voter_uuids;
- std::set<std::string> non_voter_uuids;
-};
-
// Representation of a tablet belonging to a table. The tablet is composed of replicas.
class KsckTablet {
public:
- // TODO add start/end keys, stale.
KsckTablet(KsckTable* table, std::string id)
: id_(std::move(id)),
table_(table) {
@@ -483,6 +430,8 @@ class Ksck {
tablet_id_filters_ = std::move(tablet_ids);
}
+ const KsckResults& results() const;
+
// Check that all masters are healthy.
Status CheckMasterHealth();
@@ -501,10 +450,6 @@ class Ksck {
// their current status and tablet information.
Status FetchInfoFromTabletServers();
- // Establishes a connection with the specified tablet server.
- // Must first call FetchTableAndTabletInfo().
- Status ConnectToTabletServer(const std::shared_ptr<KsckTabletServer>& ts);
-
// Verifies that all the tablets in all tables matching the filters have
// enough replicas, and that each tablet's view of the tablet's consensus
// matches every other tablet's and the master's.
@@ -516,138 +461,28 @@ class Ksck {
// Must first call FetchTableAndTabletInfo().
Status ChecksumData(const ChecksumOptions& options);
- private:
- friend class KsckTest;
+ // Runs all the checks of ksck in the proper order, including checksum scans,
+ // if enabled. Returns OK if and only if all checks succeed.
+ Status Run();
- enum class CheckResult {
- // The tablet is healthy.
- HEALTHY,
-
- // The tablet has on-going tablet copies.
- RECOVERING,
-
- // The tablet has fewer replicas than its table's replication factor and
- // has no on-going tablet copies.
- UNDER_REPLICATED,
-
- // The tablet is missing a majority of its replicas and is unavailable for
- // writes. If a majority cannot be brought back online, then the tablet
- // requires manual intervention to recover.
- UNAVAILABLE,
-
- // There was a discrepancy among the tablets' consensus configs and the master's.
- CONSENSUS_MISMATCH,
- };
-
- enum class ServerHealth {
- // The server is healthy.
- HEALTHY,
-
- // The server couldn't be connected to.
- UNAVAILABLE,
-
- // The server reported an unexpected UUID.
- WRONG_SERVER_UUID,
- };
-
- static std::string ServerHealthToString(ServerHealth sh);
-
- // Returns an int signifying the "unhealthiness level" of a 'sh'.
- // Useful for sorting or comparing.
- static int ServerHealthScore(ServerHealth sh);
-
- // Summarizes the result of a server health check.
- struct ServerHealthSummary {
- std::string uuid;
- std::string address;
- ServerHealth health;
- };
-
- // Summarizes the result of VerifyTable().
- struct TableSummary {
- std::string name;
- int healthy_tablets = 0;
- int recovering_tablets = 0;
- int underreplicated_tablets = 0;
- int consensus_mismatch_tablets = 0;
- int unavailable_tablets = 0;
-
- int TotalTablets() const {
- return healthy_tablets + recovering_tablets + underreplicated_tablets +
- consensus_mismatch_tablets + unavailable_tablets;
- }
+ // Prints the results of ksck.
+ Status PrintResults();
- int UnhealthyTablets() const {
- return TotalTablets() - healthy_tablets;
- }
+ // Performs all checks and prints the results.
+ // Returns OK if and only if ksck finds the cluster completely healthy
+ // and printing succeeds.
+ Status RunAndPrintResults();
- // Summarize the table's status with a tablet CheckResult.
- // A table's status is determined by the health of the least healthy tablet.
- CheckResult TableStatus() const {
- if (unavailable_tablets > 0) {
- return CheckResult::UNAVAILABLE;
- }
- if (consensus_mismatch_tablets > 0) {
- return CheckResult::CONSENSUS_MISMATCH;
- }
- if (underreplicated_tablets > 0) {
- return CheckResult::UNDER_REPLICATED;
- }
- if (recovering_tablets > 0) {
- return CheckResult::RECOVERING;
- }
- return CheckResult::HEALTHY;
- }
- };
-
- enum class ServerType {
- MASTER,
- TABLET_SERVER,
- };
-
- static std::string ServerTypeToString(ServerType type) {
- switch (type) {
- case ServerType::MASTER:
- return "Master";
- case ServerType::TABLET_SERVER:
- return "Tablet Server";
- default:
- LOG(FATAL) << "Unkown ServerType";
- }
- }
-
- // Print a formatted health summary to 'out', given a list `summaries`
- // describing the health of servers of type 'type'.
- static Status PrintServerHealthSummaries(ServerType type,
- std::vector<ServerHealthSummary> summaries,
- std::ostream& out);
-
- // Print a formatted summary of the table in 'table_summaries' to 'out'.
- static Status PrintTableSummaries(const std::vector<TableSummary>& table_summaries,
- std::ostream& out);
+ private:
+ friend class KsckTest;
- bool VerifyTable(const std::shared_ptr<KsckTable>& table, TableSummary* ts);
+ bool VerifyTable(const std::shared_ptr<KsckTable>& table);
bool VerifyTableWithTimeout(const std::shared_ptr<KsckTable>& table,
const MonoDelta& timeout,
const MonoDelta& retry_interval);
- CheckResult VerifyTablet(const std::shared_ptr<KsckTablet>& tablet,
+ KsckCheckResult VerifyTablet(const std::shared_ptr<KsckTablet>& tablet,
int table_num_replicas);
- // Print an informational message to this instance's output stream.
- std::ostream& Out() {
- return *out_;
- }
-
- // Print an error message to this instance's output stream.
- std::ostream& Error() {
- return (*out_) << Color(AnsiCode::RED, "ERROR: ");
- }
-
- // Print a warning message to this instance's output stream.
- std::ostream& Warn() {
- return (*out_) << Color(AnsiCode::YELLOW, "WARNING: ");
- }
-
const std::shared_ptr<KsckCluster> cluster_;
bool check_replica_count_ = true;
@@ -656,8 +491,11 @@ class Ksck {
std::ostream* const out_;
+ KsckResults results_;
+
DISALLOW_COPY_AND_ASSIGN(Ksck);
};
+
} // namespace tools
} // namespace kudu
http://git-wip-us.apache.org/repos/asf/kudu/blob/0355d373/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 944b640..e969f33 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -241,6 +241,7 @@ TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) {
string new_uuid = new_tablet_server.uuid();
ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
+ ASSERT_OK(ksck_->PrintResults());
string match_string = "Remote error: ID reported by tablet server "
"($0) doesn't match the expected ID: $1";