You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/04/19 20:36:46 UTC
[kudu] branch master updated: [ksck] KUDU-3258: allow ksck and
rebalancer to work on txn status table
This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new bdb6d06 [ksck] KUDU-3258: allow ksck and rebalancer to work on txn status table
bdb6d06 is described below
commit bdb6d0643ec7b68f11cefc038dc256824ce96f5e
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Fri Apr 9 18:22:29 2021 -0700
[ksck] KUDU-3258: allow ksck and rebalancer to work on txn status table
This patch adds the transaction system table to the ksck output in its
own section for system tables. Here's a sample snippet of output that
includes the system table:
Summary by system table
Name | RF | Status | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
-------------------------------+----+------------------+---------------+---------+------------+------------------+-------------
kudu_system.kudu_transactions | 3 | UNDER_REPLICATED | 1 | 0 | 0 | 1 | 0
Summary by table
Name | RF | Status | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
-------------------------------------------------------+----+-------------+---------------+---------+------------+------------------+-------------
default.loadgen_auto_05cf5be513ea4a84a052e8044f641c1a | 1 | UNAVAILABLE | 8 | 6 | 0 | 0 | 2
default.loadgen_auto_0c7ea48d5f6948408694b176f70e69ec | 1 | UNAVAILABLE | 8 | 5 | 0 | 0 | 3
default.loadgen_auto_241be343981c46d081ab2b3d2e3b6e6a | 1 | UNAVAILABLE | 8 | 5 | 0 | 0 | 3
default.loadgen_auto_385476d5d3b6493f8cbf659c8a4cf7cc | 1 | UNAVAILABLE | 8 | 6 | 0 | 0 | 2
default.loadgen_auto_430e280e8aa7450591da67ae15ff0f37 | 1 | UNAVAILABLE | 8 | 6 | 0 | 0 | 2
The section can be included or excluded via the --sections flag of ksck.
Since ksck and the rebalancer use the same cluster-examining code, this
patch also updates the rebalancer's ClusterStatus struct
(src/kudu/rebalance/cluster_status.h) to account for system tables.
Without this change, the rebalancer would have crashed when trying to
find the replication factor of the system table.
Change-Id: I8162f6eb046d98791c6bdeb5c15a0af72487300d
Reviewed-on: http://gerrit.cloudera.org:8080/17315
Tested-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
.../integration-tests/txn_status_table-itest.cc | 52 ++++++++
src/kudu/rebalance/cluster_status.h | 1 +
src/kudu/tools/ksck-test.cc | 133 ++++++++++++---------
src/kudu/tools/ksck.cc | 20 +++-
src/kudu/tools/ksck.h | 17 +--
src/kudu/tools/ksck_remote.cc | 45 +++++--
src/kudu/tools/ksck_results.cc | 33 +++--
src/kudu/tools/ksck_results.h | 9 +-
src/kudu/tools/rebalancer_tool.cc | 28 +++--
src/kudu/tools/tool.proto | 1 +
src/kudu/tools/tool_action_cluster.cc | 2 +-
11 files changed, 247 insertions(+), 94 deletions(-)
diff --git a/src/kudu/integration-tests/txn_status_table-itest.cc b/src/kudu/integration-tests/txn_status_table-itest.cc
index d018beb..a766dda 100644
--- a/src/kudu/integration-tests/txn_status_table-itest.cc
+++ b/src/kudu/integration-tests/txn_status_table-itest.cc
@@ -40,6 +40,7 @@
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/test_workload.h"
@@ -51,6 +52,7 @@
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet_metadata.h"
#include "kudu/tablet/tablet_replica.h"
+#include "kudu/tools/tool_test_util.h"
#include "kudu/transactions/transactions.pb.h"
#include "kudu/transactions/txn_status_tablet.h"
#include "kudu/transactions/txn_system_client.h"
@@ -58,6 +60,7 @@
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/ts_tablet_manager.h"
#include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
@@ -85,6 +88,7 @@ using kudu::cluster::InternalMiniClusterOptions;
using kudu::itest::TServerDetails;
using kudu::itest::TabletServerMap;
using kudu::tablet::TabletReplica;
+using kudu::tools::RunKuduTool;
using kudu::transactions::TxnStatePB;
using kudu::transactions::TxnStatusEntryPB;
using kudu::transactions::TxnStatusTablet;
@@ -207,6 +211,54 @@ TEST_F(TxnStatusTableITest, TestTxnStatusTableNotListed) {
ASSERT_NE(nullptr, table);
}
+// Test that the transaction status table is visible in a separate section in
+// ksck, and that its health is reported as other tables are.
+TEST_F(TxnStatusTableITest, TestTxnStatusTableInKsck) {
+ vector<string> master_addrs;
+ for (const auto& hp : cluster_->master_rpc_addrs()) {
+ master_addrs.emplace_back(hp.ToString());
+ }
+ string out;
+ string err;
+ vector<string> ksck_args = { "cluster", "ksck", JoinStrings(master_addrs, ",") };
+ ASSERT_OK(RunKuduTool(ksck_args, &out, &err));
+ ASSERT_STR_CONTAINS(out, "The cluster doesn't have any matching system tables");
+
+ // Nothing should be logged on error for finding the txn status table.
+ ASSERT_STR_NOT_CONTAINS(err, TxnStatusTablet::kTxnStatusTableName) << err;
+ ASSERT_OK(txn_sys_client_->CreateTxnStatusTable(100));
+
+ ASSERT_OK(RunKuduTool(ksck_args, &out));
+ ASSERT_STR_MATCHES(out,
+ "^Summary by system table.*\n"
+ "^ Name | RF | Status | Total Tablets | Healthy .*\n"
+ "^-------------------------------+----+---------+---------------+---------.*\n"
+ "^ kudu_system.kudu_transactions | 1 | HEALTHY | 1 | 1 .*\n");
+
+ // Now bring down a tablet server and we should see the health update.
+ cluster_->mini_tablet_server(0)->Shutdown();
+ Status s = RunKuduTool(ksck_args, &out);
+ ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
+ ASSERT_STR_MATCHES(out,
+ "^Summary by system table.*\n"
+ "^ Name | RF | Status | Total Tablets | Healthy .*\n"
+ "^-------------------------------+----+-------------+---------------+---------.*\n"
+ "^ kudu_system.kudu_transactions | 1 | UNAVAILABLE | 1 | 0 .*\n");
+}
+
+// Test that despite being unable to list the transaction status table, we are
+// able to run the tool. In previous iterations of the rebalancer, the tool may
+// have crashed attempting to find the RF for a table it didn't know about.
+TEST_F(TxnStatusTableITest, TestTxnStatusTableInRebalancer) {
+ vector<string> master_addrs;
+ for (const auto& hp : cluster_->master_rpc_addrs()) {
+ master_addrs.emplace_back(hp.ToString());
+ }
+ ASSERT_OK(txn_sys_client_->CreateTxnStatusTable(100));
+ vector<string> rebalancer_args = { "cluster", "rebalance", JoinStrings(master_addrs, ",") };
+ ASSERT_OK(RunKuduTool(rebalancer_args));
+}
+
// Test that only the service- or super-user can create or alter the
// transaction status table.
TEST_F(TxnStatusTableITest, TestProtectCreateAndAlter) {
diff --git a/src/kudu/rebalance/cluster_status.h b/src/kudu/rebalance/cluster_status.h
index cc5dcbf..cd3c3b0 100644
--- a/src/kudu/rebalance/cluster_status.h
+++ b/src/kudu/rebalance/cluster_status.h
@@ -236,6 +236,7 @@ struct ClusterStatus {
// Tablet information includes consensus state.
std::vector<TabletSummary> tablet_summaries;
std::vector<TableSummary> table_summaries;
+ std::vector<TableSummary> system_table_summaries;
};
} // namespace cluster_summary
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index 643d401..388821b 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -28,7 +28,6 @@
#include <set>
#include <sstream>
#include <string>
-#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -50,6 +49,7 @@
#include "kudu/tablet/tablet.pb.h"
#include "kudu/tools/ksck_checksum.h"
#include "kudu/tools/ksck_results.h"
+#include "kudu/transactions/txn_status_tablet.h"
#include "kudu/util/jsonreader.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
@@ -74,6 +74,7 @@ using kudu::cluster_summary::TableSummary;
using kudu::cluster_summary::TabletSummary;
using kudu::server::GetFlagsResponsePB;
using kudu::tablet::TabletDataState;
+using kudu::transactions::TxnStatusTablet;
using std::make_shared;
using std::ostringstream;
@@ -256,6 +257,7 @@ class MockKsckCluster : public KsckCluster {
using KsckCluster::masters_;
using KsckCluster::tables_;
using KsckCluster::tablet_servers_;
+ using KsckCluster::txn_sys_table_;
};
class KsckTest : public KuduTest {
@@ -312,7 +314,7 @@ class KsckTest : public KuduTest {
table_summary.consensus_mismatch_tablets = consensus_mismatch_tablets;
table_summary.unavailable_tablets = unavailable_tablets;
std::ostringstream oss;
- PrintTableSummaries({ table_summary }, oss);
+ PrintTableSummaries({ table_summary }, "table", oss);
return oss.str();
}
@@ -331,13 +333,20 @@ class KsckTest : public KuduTest {
}
}
- void CreateOneTableOneTablet() {
- CreateDefaultAssignmentPlan(1);
+ void CreateOneTableOneTablet(bool create_txn_status_table = false) {
+ NO_FATALS(CreateDefaultAssignmentPlan(create_txn_status_table ? 2 : 1));
auto table = CreateAndAddTable("test", 1);
auto tablet(make_shared<KsckTablet>(table.get(), "tablet-id-1"));
- CreateAndFillTablet(tablet, 1, true, true);
+ NO_FATALS(CreateAndFillTablet(tablet, 1, true, true));
table->set_tablets({ tablet });
+
+ if (create_txn_status_table) {
+ auto sys_table = CreateAndAddTxnStatusTable(1);
+ auto sys_tablet(make_shared<KsckTablet>(sys_table.get(), "sys-tablet-id-1"));
+ NO_FATALS(CreateAndFillTablet(sys_tablet, 1, true, true));
+ sys_table->set_tablets({ sys_tablet });
+ }
}
void CreateOneSmallReplicatedTable(const string& table_name = "test",
@@ -384,6 +393,14 @@ class KsckTest : public KuduTest {
table->set_tablets({ tablet });
}
+ shared_ptr<KsckTable> CreateAndAddTxnStatusTable(int num_replicas) {
+ auto table(make_shared<KsckTable>(
+ TxnStatusTablet::kTxnStatusTableName, TxnStatusTablet::kTxnStatusTableName,
+ TxnStatusTablet::GetSchema(), num_replicas));
+ cluster_->txn_sys_table_ = table;
+ return table;
+ }
+
shared_ptr<KsckTable> CreateAndAddTable(const string& id_and_name, int num_replicas) {
auto table(make_shared<KsckTable>(
id_and_name, id_and_name, Schema(), num_replicas));
@@ -396,11 +413,11 @@ class KsckTest : public KuduTest {
{
vector<shared_ptr<KsckTabletReplica>> replicas;
if (has_leader) {
- CreateReplicaAndAdd(&replicas, tablet->id(), true, is_running);
+ NO_FATALS(CreateReplicaAndAdd(&replicas, tablet->id(), true, is_running));
num_replicas--;
}
for (int i = 0; i < num_replicas; i++) {
- CreateReplicaAndAdd(&replicas, tablet->id(), false, is_running);
+ NO_FATALS(CreateReplicaAndAdd(&replicas, tablet->id(), false, is_running));
}
tablet->set_replicas(std::move(replicas));
}
@@ -419,7 +436,7 @@ class KsckTest : public KuduTest {
for (const auto& replica : tablet->replicas()) {
shared_ptr<MockKsckTabletServer> ts =
static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_.at(replica->ts_uuid()));
- InsertOrDieNoPrint(&ts->tablet_consensus_state_map_,
+ InsertIfNotPresent(&ts->tablet_consensus_state_map_,
std::make_pair(replica->ts_uuid(), tablet->id()),
cstate);
}
@@ -856,27 +873,29 @@ void CheckPlainStringSection(const string& plain, const string& header, bool pre
}
void CheckPlainStringSections(const string& plain, int sections) {
- CheckPlainStringSection(plain,
- "Master Summary\n",
- sections & PrintSections::Values::MASTER_SUMMARIES);
- CheckPlainStringSection(plain,
- "Tablet Server Summary\n",
- sections & PrintSections::Values::TSERVER_SUMMARIES);
- CheckPlainStringSection(plain,
- "Version Summary\n",
- sections & PrintSections::Values::VERSION_SUMMARIES);
- CheckPlainStringSection(plain,
- "Tablet Summary\n",
- sections & PrintSections::Values::TABLET_SUMMARIES);
- CheckPlainStringSection(plain,
- "Summary by table\n",
- sections & PrintSections::Values::TABLE_SUMMARIES);
- CheckPlainStringSection(plain,
- "Checksum Summary\n",
- sections & PrintSections::Values::CHECKSUM_RESULTS);
- CheckPlainStringSection(plain,
- "Total Count Summary\n",
- sections & PrintSections::Values::TOTAL_COUNT);
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Master Summary\n",
+ sections & PrintSections::Values::MASTER_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Tablet Server Summary\n",
+ sections & PrintSections::Values::TSERVER_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Version Summary\n",
+ sections & PrintSections::Values::VERSION_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Tablet Summary\n",
+ sections & PrintSections::Values::TABLET_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Summary by table\n",
+ sections & PrintSections::Values::TABLE_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain, "Summary by system table\n",
+ sections & PrintSections::Values::SYSTEM_TABLE_SUMMARIES));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Checksum Summary\n",
+ sections & PrintSections::Values::CHECKSUM_RESULTS));
+ NO_FATALS(CheckPlainStringSection(plain,
+ "Total Count Summary\n",
+ sections & PrintSections::Values::TOTAL_COUNT));
}
void CheckJsonStringVsKsckResults(const string& json,
@@ -885,53 +904,59 @@ void CheckJsonStringVsKsckResults(const string& json,
JsonReader r(json);
ASSERT_OK(r.Init());
- CheckJsonVsServerHealthSummaries(
+ NO_FATALS(CheckJsonVsServerHealthSummaries(
r,
"master_summaries",
sections & PrintSections::Values::MASTER_SUMMARIES ?
boost::optional<vector<ServerHealthSummary>>
- (results.cluster_status.master_summaries) : boost::none);
- CheckJsonVsMasterConsensus(
+ (results.cluster_status.master_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsMasterConsensus(
r,
results.cluster_status.master_consensus_conflict,
sections & PrintSections::Values::MASTER_SUMMARIES ?
boost::optional<ConsensusStateMap>
- (results.cluster_status.master_consensus_state_map) : boost::none);
- CheckJsonVsServerHealthSummaries(
+ (results.cluster_status.master_consensus_state_map) : boost::none));
+ NO_FATALS(CheckJsonVsServerHealthSummaries(
r,
"tserver_summaries",
sections & PrintSections::Values::TSERVER_SUMMARIES ?
boost::optional<vector<ServerHealthSummary>>
- (results.cluster_status.tserver_summaries) : boost::none);
- CheckJsonVsVersionSummaries(
+ (results.cluster_status.tserver_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsVersionSummaries(
r,
"version_summaries",
sections & PrintSections::Values::VERSION_SUMMARIES ?
boost::optional<KsckVersionToServersMap>
- (results.version_summaries) : boost::none);
- CheckJsonVsTabletSummaries(
+ (results.version_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsTabletSummaries(
r,
"tablet_summaries",
sections & PrintSections::Values::TABLET_SUMMARIES ?
boost::optional<vector<TabletSummary>>
- (results.cluster_status.tablet_summaries) : boost::none);
- CheckJsonVsTableSummaries(
+ (results.cluster_status.tablet_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsTableSummaries(
r,
"table_summaries",
sections & PrintSections::Values::TABLE_SUMMARIES ?
boost::optional<vector<TableSummary>>
- (results.cluster_status.table_summaries) : boost::none);
- CheckJsonVsChecksumResults(
+ (results.cluster_status.table_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsTableSummaries(
+ r,
+ "system_table_summaries",
+ sections & PrintSections::Values::SYSTEM_TABLE_SUMMARIES ?
+ boost::optional<vector<TableSummary>>
+ (results.cluster_status.system_table_summaries) : boost::none));
+ NO_FATALS(CheckJsonVsChecksumResults(
r,
"checksum_results",
sections & PrintSections::Values::CHECKSUM_RESULTS ?
- boost::optional<KsckChecksumResults>(results.checksum_results) : boost::none);
- CheckJsonVsCountSummaries(
+ boost::optional<KsckChecksumResults>(results.checksum_results) : boost::none));
+ NO_FATALS(CheckJsonVsCountSummaries(
r,
"count_summaries",
sections & PrintSections::Values::TOTAL_COUNT ?
- boost::optional<KsckResults>(results) : boost::none);
- CheckJsonVsErrors(r, "errors", results.error_messages);
+ boost::optional<KsckResults>(results) : boost::none));
+ NO_FATALS(CheckJsonVsErrors(r, "errors", results.error_messages));
}
void CheckMessageNotPresent(const vector<Status>& messages, const string& msg) {
@@ -1872,25 +1897,25 @@ TEST_F(KsckTest, TestSectionFilter) {
{PrintSections::Values::VERSION_SUMMARIES, "VERSION_SUMMARIES"},
{PrintSections::Values::TABLET_SUMMARIES, "TABLET_SUMMARIES"},
{PrintSections::Values::TABLE_SUMMARIES, "TABLE_SUMMARIES"},
+ {PrintSections::Values::SYSTEM_TABLE_SUMMARIES, "SYSTEM_TABLE_SUMMARIES"},
{PrintSections::Values::CHECKSUM_RESULTS, "CHECKSUM_RESULTS"},
{PrintSections::Values::TOTAL_COUNT, "TOTAL_COUNT"}};
- CreateOneTableOneTablet();
- for (const auto& section : sections) {
- if (section.first == PrintSections::Values::CHECKSUM_RESULTS) {
+ NO_FATALS(CreateOneTableOneTablet(/*create_txn_status_table=*/true));
+ for (const auto& [s_enum, s_str] : sections) {
+ if (s_enum == PrintSections::Values::CHECKSUM_RESULTS) {
FLAGS_checksum_scan = true;
}
- int selected_sections = section.first;
- ksck_->set_print_sections({section.second});
+ ksck_->set_print_sections({s_str});
err_stream_.str("");
err_stream_.clear();
ASSERT_OK(RunKsck());
// Check plain string output.
- CheckPlainStringSections(err_stream_.str(), selected_sections);
+ NO_FATALS(CheckPlainStringSections(err_stream_.str(), s_enum));
// Check json string output.
- const string& json_output = KsckResultsToJsonString(selected_sections);
- CheckJsonStringVsKsckResults(json_output, ksck_->results(), selected_sections);
+ const string& json_output = KsckResultsToJsonString(s_enum);
+ NO_FATALS(CheckJsonStringVsKsckResults(json_output, ksck_->results(), s_enum));
}
}
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 6cd5b9f..050bc06 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -564,6 +564,9 @@ void Ksck::set_print_sections(const std::vector<std::string>& sections) {
if (section_upper == "TABLE_SUMMARIES") {
print_sections_flags_ |= PrintSections::TABLE_SUMMARIES;
}
+ if (section_upper == "SYSTEM_TABLE_SUMMARIES") {
+ print_sections_flags_ |= PrintSections::SYSTEM_TABLE_SUMMARIES;
+ }
if (section_upper == "CHECKSUM_RESULTS") {
print_sections_flags_ |= PrintSections::CHECKSUM_RESULTS;
}
@@ -799,16 +802,23 @@ Status Ksck::RunAndPrintResults() {
Status Ksck::CheckTablesConsistency() {
int bad_tables_count = 0;
+ auto& cluster_status = results_.cluster_status;
+ if (cluster_->txn_sys_table()) {
+ if (!VerifyTable(cluster_->txn_sys_table(), &cluster_status.system_table_summaries)) {
+ bad_tables_count++;
+ }
+ }
for (const shared_ptr<KsckTable> &table : cluster_->tables()) {
- if (!VerifyTable(table)) {
+ if (!VerifyTable(table, &cluster_status.table_summaries)) {
bad_tables_count++;
}
}
if (bad_tables_count > 0) {
return Status::Corruption(
- Substitute("$0 out of $1 table(s) are not healthy",
- bad_tables_count, results_.cluster_status.table_summaries.size()));
+ Substitute("$0 out of $1 table(s) are not healthy", bad_tables_count,
+ cluster_status.table_summaries.size() +
+ cluster_status.system_table_summaries.size()));
}
return Status::OK();
}
@@ -823,7 +833,7 @@ Status Ksck::ChecksumData(const KsckChecksumOptions& opts) {
out_for_progress_updates);
}
-bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table) {
+bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table, vector<TableSummary>* table_summaries) {
if (table->tablets().empty()) {
VLOG(1) << Substitute("Skipping table $0 as it has no matching tablets",
table->name());
@@ -858,7 +868,7 @@ bool Ksck::VerifyTable(const shared_ptr<KsckTable>& table) {
}
bool all_healthy = ts.healthy_tablets == ts.TotalTablets();
if (ts.TotalTablets() > 0) {
- results_.cluster_status.table_summaries.push_back(std::move(ts));
+ table_summaries->emplace_back(std::move(ts));
}
return all_healthy;
}
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index d9f1223..2466e18 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -47,8 +47,6 @@
namespace kudu {
-class MonoDelta;
-
namespace rpc {
class Messenger;
} // namespace rpc
@@ -503,6 +501,10 @@ class KsckCluster {
return ts_states_;
}
+ const std::shared_ptr<KsckTable>& txn_sys_table() const {
+ return txn_sys_table_;
+ }
+
const std::vector<std::shared_ptr<KsckTable>>& tables() const {
return tables_;
}
@@ -545,6 +547,7 @@ class KsckCluster {
MasterList masters_;
TSMap tablet_servers_;
KsckTServerStateMap ts_states_;
+ std::shared_ptr<KsckTable> txn_sys_table_;
std::vector<std::shared_ptr<KsckTable>> tables_;
std::unique_ptr<ThreadPool> pool_;
@@ -683,11 +686,11 @@ class Ksck {
KsckFlagToServersMap* flags_to_servers_map,
KsckFlagTagsMap* flag_tags_map = nullptr);
- bool VerifyTable(const std::shared_ptr<KsckTable>& table);
-
- bool VerifyTableWithTimeout(const std::shared_ptr<KsckTable>& table,
- const MonoDelta& timeout,
- const MonoDelta& retry_interval);
+ // Returns true if the table is healthy, creating a health summary for it and
+ // adding the table to 'table_summaries'. Only adds a summary if there are
+ // tablets in the table.
+ bool VerifyTable(const std::shared_ptr<KsckTable>& table,
+ std::vector<cluster_summary::TableSummary>* table_summaries);
cluster_summary::HealthCheckResult VerifyTablet(
const std::shared_ptr<KsckTablet>& tablet,
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 34792bc..09dfe21 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -56,6 +56,7 @@
#include "kudu/tools/ksck_checksum.h"
#include "kudu/tools/ksck_results.h"
#include "kudu/tools/tool_action_common.h"
+#include "kudu/transactions/txn_status_tablet.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/tserver/tserver_admin.pb.h"
@@ -89,6 +90,7 @@ using kudu::master::ListTabletServersResponsePB;
using kudu::master::TServerStatePB;
using kudu::rpc::Messenger;
using kudu::rpc::RpcController;
+using kudu::transactions::TxnStatusTablet;
using kudu::server::GenericServiceProxy;
using kudu::server::GetFlagsRequestPB;
using kudu::server::GetFlagsResponsePB;
@@ -577,10 +579,35 @@ Status RemoteKsckCluster::RetrieveTabletServers() {
}
Status RemoteKsckCluster::RetrieveTablesList() {
+ shared_ptr<KsckTable> txn_sys_table;
+ RETURN_NOT_OK(pool_->Submit([&]() {
+ // There is no public API to list the txn status table -- just open it
+ // manually.
+ client::sp::shared_ptr<KuduTable> t;
+ Status s = client_->OpenTable(TxnStatusTablet::kTxnStatusTableName, &t);
+ if (s.IsNotFound()) {
+ // If there is no table, just exit without logging anything (e.g. if
+ // we're communicating with an older version of Kudu that doesn't support
+ // transactions).
+ return;
+ }
+ if (!s.ok()) {
+ LOG(ERROR) << Substitute("unable to open txn status table $0: $1",
+ TxnStatusTablet::kTxnStatusTableName, s.ToString());
+ return;
+ }
+ auto table(make_shared<KsckTable>(
+ t->id(), TxnStatusTablet::kTxnStatusTableName,
+ KuduSchema::ToSchema(t->schema()), t->num_replicas()));
+ txn_sys_table = std::move(table);
+ }));
+
vector<string> table_names;
RETURN_NOT_OK(client_->ListTables(&table_names));
if (table_names.empty()) {
+ pool_->Wait();
+ txn_sys_table_ = std::move(txn_sys_table);
return Status::OK();
}
@@ -606,17 +633,16 @@ Status RemoteKsckCluster::RetrieveTablesList() {
table_name, s.ToString());
return;
}
- {
- auto table(make_shared<KsckTable>(
- t->id(), table_name, KuduSchema::ToSchema(t->schema()), t->num_replicas()));
- std::lock_guard<simple_spinlock> l(tables_lock);
- tables.emplace_back(std::move(table));
- }
+ auto table(make_shared<KsckTable>(
+ t->id(), table_name, KuduSchema::ToSchema(t->schema()), t->num_replicas()));
+ std::lock_guard<simple_spinlock> l(tables_lock);
+ tables.emplace_back(std::move(table));
}));
}
pool_->Wait();
- tables_.swap(tables);
+ txn_sys_table_ = std::move(txn_sys_table);
+ tables_ = std::move(tables);
if (tables_.size() < tables_count) {
return Status::NetworkError(
@@ -628,6 +654,11 @@ Status RemoteKsckCluster::RetrieveTablesList() {
}
Status RemoteKsckCluster::RetrieveAllTablets() {
+ if (txn_sys_table_) {
+ RETURN_NOT_OK(pool_->Submit(
+ [this]() { this->RetrieveTabletsList(txn_sys_table_); }));
+ pool_->Wait();
+ }
if (tables_.empty()) {
return Status::OK();
}
diff --git a/src/kudu/tools/ksck_results.cc b/src/kudu/tools/ksck_results.cc
index 9ac9a35..7c503c5 100644
--- a/src/kudu/tools/ksck_results.cc
+++ b/src/kudu/tools/ksck_results.cc
@@ -298,17 +298,27 @@ Status KsckResults::PrintTo(PrintMode mode, int sections, ostream& out) {
// Then, summarize the tablets by table.
// Sort the tables so unhealthy tables are easy to see at the bottom.
- if (sections & PrintSections::TABLE_SUMMARIES) {
- std::sort(cluster_status.table_summaries.begin(),
- cluster_status.table_summaries.end(),
+ const auto sort_and_print_tables = [&] (vector<TableSummary>* table_summaries,
+ const string& table_type) {
+ std::sort(table_summaries->begin(),
+ table_summaries->end(),
[](const TableSummary &left,
- const TableSummary &right) {
+ const TableSummary &right) {
return std::make_pair(left.TableStatus() != HealthCheckResult::HEALTHY,
left.name) <
- std::make_pair(right.TableStatus() != HealthCheckResult::HEALTHY,
+ std::make_pair(right.TableStatus() != HealthCheckResult::HEALTHY,
right.name);
});
- RETURN_NOT_OK(PrintTableSummaries(cluster_status.table_summaries, out));
+ return PrintTableSummaries(*table_summaries, table_type, out);
+ };
+ if (sections & PrintSections::SYSTEM_TABLE_SUMMARIES) {
+ sort_and_print_tables(&cluster_status.system_table_summaries, "system table");
+ if (!cluster_status.system_table_summaries.empty()) {
+ out << endl;
+ }
+ }
+ if (sections & PrintSections::TABLE_SUMMARIES) {
+ sort_and_print_tables(&cluster_status.table_summaries, "table");
if (!cluster_status.table_summaries.empty()) {
out << endl;
}
@@ -541,13 +551,14 @@ Status PrintVersionTable(const KsckVersionToServersMap& version_summaries,
}
Status PrintTableSummaries(const vector<TableSummary>& table_summaries,
+ const string& table_type,
ostream& out) {
if (table_summaries.empty()) {
- out << "The cluster doesn't have any matching tables" << endl;
+ out << Substitute("The cluster doesn't have any matching $0s", table_type) << endl;
return Status::OK();
}
- out << "Summary by table" << endl;
+ out << Substitute("Summary by $0", table_type) << endl;
DataTable table({ "Name", "RF", "Status", "Total Tablets",
"Healthy", "Recovering", "Under-replicated", "Unavailable"});
for (const TableSummary& ts : table_summaries) {
@@ -1027,6 +1038,12 @@ void KsckResults::ToPb(KsckResultsPB* pb, int sections) const {
}
}
+ if (sections & PrintSections::SYSTEM_TABLE_SUMMARIES) {
+ for (const auto &table : cluster_status.system_table_summaries) {
+ TableSummaryToPb(table, pb->add_system_table_summaries());
+ }
+ }
+
if (sections & PrintSections::TABLE_SUMMARIES) {
for (const auto &table : cluster_status.table_summaries) {
TableSummaryToPb(table, pb->add_table_summaries());
diff --git a/src/kudu/tools/ksck_results.h b/src/kudu/tools/ksck_results.h
index 4d68086..da01cfb 100644
--- a/src/kudu/tools/ksck_results.h
+++ b/src/kudu/tools/ksck_results.h
@@ -85,11 +85,12 @@ struct PrintSections {
VERSION_SUMMARIES = 1 << 3,
TABLET_SUMMARIES = 1 << 4,
TABLE_SUMMARIES = 1 << 5,
- CHECKSUM_RESULTS = 1 << 6,
- TOTAL_COUNT = 1 << 7,
+ SYSTEM_TABLE_SUMMARIES = 1 << 6,
+ CHECKSUM_RESULTS = 1 << 7,
+ TOTAL_COUNT = 1 << 8,
// Print all sections above.
- ALL_SECTIONS = 0b011111111
+ ALL_SECTIONS = 0b0111111111
};
};
@@ -189,8 +190,10 @@ Status PrintVersionTable(const KsckVersionToServersMap& version_summaries,
std::ostream& out);
// Print a formatted summary of the tables in 'table_summaries' to 'out'.
+// 'table_type' is used to print the kind of tables being printed.
Status PrintTableSummaries(
const std::vector<cluster_summary::TableSummary>& table_summaries,
+ const std::string& table_type,
std::ostream& out);
// Print a formatted summary of the tablets in 'tablet_summaries' to 'out'.
diff --git a/src/kudu/tools/rebalancer_tool.cc b/src/kudu/tools/rebalancer_tool.cc
index 824f531..6e638c9 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -272,25 +272,30 @@ Status RebalancerTool::KsckResultsToClusterRawInfo(
// Filter out entities that are not relevant to the specified location.
vector<ServerHealthSummary> tserver_summaries;
- tserver_summaries.reserve(ksck_info.cluster_status.tserver_summaries.size());
+ const auto& cluster_status = ksck_info.cluster_status;
+ tserver_summaries.reserve(cluster_status.tserver_summaries.size());
vector<TabletSummary> tablet_summaries;
- tablet_summaries.reserve(ksck_info.cluster_status.tablet_summaries.size());
+ tablet_summaries.reserve(cluster_status.tablet_summaries.size());
vector<TableSummary> table_summaries;
- table_summaries.reserve(table_summaries.size());
+ table_summaries.reserve(cluster_status.table_summaries.size() +
+ cluster_status.system_table_summaries.size());
if (!location) {
// Information on the whole cluster.
- tserver_summaries = ksck_info.cluster_status.tserver_summaries;
- tablet_summaries = ksck_info.cluster_status.tablet_summaries;
- table_summaries = ksck_info.cluster_status.table_summaries;
+ tserver_summaries = cluster_status.tserver_summaries;
+ tablet_summaries = cluster_status.tablet_summaries;
+ table_summaries = cluster_status.table_summaries;
+ for (const auto& sys_table : cluster_status.system_table_summaries) {
+ table_summaries.emplace_back(sys_table);
+ }
} else {
// Information on the specified location only: filter out non-relevant info.
const auto& location_str = *location;
unordered_set<string> ts_ids_at_location;
- for (const auto& summary : ksck_info.cluster_status.tserver_summaries) {
+ for (const auto& summary : cluster_status.tserver_summaries) {
if (summary.ts_location == location_str) {
tserver_summaries.push_back(summary);
InsertOrDie(&ts_ids_at_location, summary.uuid);
@@ -298,7 +303,7 @@ Status RebalancerTool::KsckResultsToClusterRawInfo(
}
unordered_set<string> table_ids_at_location;
- for (const auto& summary : ksck_info.cluster_status.tablet_summaries) {
+ for (const auto& summary : cluster_status.tablet_summaries) {
const auto& replicas = summary.replicas;
decltype(summary.replicas) replicas_at_location;
replicas_at_location.reserve(replicas.size());
@@ -314,7 +319,12 @@ Status RebalancerTool::KsckResultsToClusterRawInfo(
}
}
- for (const auto& summary : ksck_info.cluster_status.table_summaries) {
+ for (const auto& summary : cluster_status.table_summaries) {
+ if (ContainsKey(table_ids_at_location, summary.id)) {
+ table_summaries.push_back(summary);
+ }
+ }
+ for (const auto& summary : cluster_status.system_table_summaries) {
if (ContainsKey(table_ids_at_location, summary.id)) {
table_summaries.push_back(summary);
}
diff --git a/src/kudu/tools/tool.proto b/src/kudu/tools/tool.proto
index d5c60c7..3aea8d4 100644
--- a/src/kudu/tools/tool.proto
+++ b/src/kudu/tools/tool.proto
@@ -242,6 +242,7 @@ message KsckResultsPB {
repeated TabletSummaryPB tablet_summaries = 7;
repeated TableSummaryPB table_summaries = 8;
+ repeated TableSummaryPB system_table_summaries = 12;
optional KsckChecksumResultsPB checksum_results = 9;
repeated KsckVersionSummaryPB version_summaries = 10;
diff --git a/src/kudu/tools/tool_action_cluster.cc b/src/kudu/tools/tool_action_cluster.cc
index 0e4d629..38fb4a4 100644
--- a/src/kudu/tools/tool_action_cluster.cc
+++ b/src/kudu/tools/tool_action_cluster.cc
@@ -74,7 +74,7 @@ DEFINE_string(sections, "*",
"Sections to print (comma-separated list of sections, "
"available sections are: MASTER_SUMMARIES, TSERVER_SUMMARIES, "
"VERSION_SUMMARIES, TABLET_SUMMARIES, TABLE_SUMMARIES, "
- "CHECKSUM_RESULTS and TOTAL_COUNT.) "
+ "SYSTEM_TABLE_SUMMARIES, CHECKSUM_RESULTS and TOTAL_COUNT.) "
"If not specified, print all sections.");
DEFINE_uint32(max_moves_per_server, 5,