You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/11/10 13:15:30 UTC
[kudu] branch master updated: KUDU-2987 Intra location rebalance
crashes in special case.
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 0fea1cb KUDU-2987 Intra location rebalance crashes in special case.
0fea1cb is described below
commit 0fea1cb8ede852a87efc422b394ffe8d1e89bc6c
Author: triplesheep <tr...@gmail.com>
AuthorDate: Fri Nov 1 10:29:20 2019 +0800
KUDU-2987 Intra location rebalance crashes in special case.
The crash manifested itself in cases where a Kudu cluster
had a location that didn't host even a single replica of
a tablet.
Change-Id: Iea39472e55178fca688b390249432a0f7fefaaba
Reviewed-on: http://gerrit.cloudera.org:8080/14608
Tested-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
src/kudu/tools/rebalancer_tool-test.cc | 119 ++++++++++++++++++++++++++++++++-
src/kudu/tools/rebalancer_tool.cc | 4 +-
2 files changed, 120 insertions(+), 3 deletions(-)
diff --git a/src/kudu/tools/rebalancer_tool-test.cc b/src/kudu/tools/rebalancer_tool-test.cc
index 8a14ca0..18063ee 100644
--- a/src/kudu/tools/rebalancer_tool-test.cc
+++ b/src/kudu/tools/rebalancer_tool-test.cc
@@ -473,6 +473,7 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase {
tserver_unresponsive_ms_, created_tables_names));
} else {
ASSERT_OK(CreateTablesExcludingLocations(empty_locations,
+ kTableNamePattern,
created_tables_names));
}
}
@@ -481,8 +482,13 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase {
// tablet servers in the specified locations. This is similar to
// CreateUnbalancedTables() but the set of tablet servers to avoid is defined
// by the set of the specified locations.
+ //
+ // Note: 'table_name_pattern' is used to describe the table names created by this
+ // function. Each call to this function can create different batch of tables with
+ // this parameter set.
Status CreateTablesExcludingLocations(
const unordered_set<string>& excluded_locations,
+ const string& table_name_pattern = kTableNamePattern,
vector<string>* table_names = nullptr) {
// Shutdown all tablet servers in the specified locations so no tablet
// replicas would be hosted by those servers.
@@ -504,7 +510,7 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase {
// are available.
SleepFor(MonoDelta::FromMilliseconds(5 * tserver_unresponsive_ms_ / 4));
RETURN_NOT_OK(CreateTables(cluster_.get(), client_.get(), schema_,
- kTableNamePattern, num_tables_, rep_factor_,
+ table_name_pattern, num_tables_, rep_factor_,
table_names));
// Start tablet servers at the excluded locations.
if (!excluded_locations.empty()) {
@@ -519,6 +525,44 @@ class RebalancingTest : public tserver::TabletServerIntegrationTestBase {
return Status::OK();
}
+ // Create tables placing their tablet replicas elsewhere but not at the specified
+ // number of tservers in the specified locations.
+ Status CreateTablesExcludingTserversInLocations(
+ const unordered_map<string, int>& excluded_tserver_by_location,
+ const string& table_name_pattern = kTableNamePattern,
+ vector<string>* table_names = nullptr) {
+ unordered_map<string, int> excluded_tservers = excluded_tserver_by_location;
+ vector<string> excluded_tserver_uuids;
+ // Shutdown some tablet servers in the specified locations so no tablet
+ // replicas would be hosted by those servers.
+ if (!excluded_tservers.empty()) {
+ for (const auto& elem : tablet_servers_) {
+ auto* ts = elem.second;
+ if (ContainsKey(excluded_tservers, ts->location)
+ && excluded_tservers[ts->location] > 0) {
+ excluded_tserver_uuids.push_back(ts->uuid());
+ --excluded_tservers[ts->location];
+ cluster_->tablet_server_by_uuid(ts->uuid())->Shutdown();
+ }
+ }
+ }
+
+ for (const auto& elem : excluded_tservers) {
+ CHECK_EQ(0, elem.second);
+ }
+ // Wait for the catalog manager to understand that not all tablet servers
+ // are available.
+ SleepFor(MonoDelta::FromMilliseconds(5 * tserver_unresponsive_ms_ / 4));
+ RETURN_NOT_OK(CreateTables(cluster_.get(), client_.get(), schema_,
+ table_name_pattern, num_tables_, rep_factor_,
+ table_names));
+ // Start tablet servers at the excluded locations.
+ for (const auto& uuid : excluded_tserver_uuids) {
+ RETURN_NOT_OK(cluster_->tablet_server_by_uuid(uuid)->Restart());
+ }
+ return Status::OK();
+ }
+
// When the rebalancer starts moving replicas, ksck detects corruption
// (that's why RuntimeError), seeing affected tables as non-healthy
// with data state of corresponding tablets as TABLET_DATA_COPYING. If using
@@ -1668,5 +1712,78 @@ TEST_P(LocationAwareRebalancingParamTest, Rebalance) {
}
}
+// Basic intra location rebalance tests.
+class IntraLocationRebalancingBasicTest : public RebalancingTest {
+public:
+ IntraLocationRebalancingBasicTest()
+ : RebalancingTest(/*num_tables=*/ 1,
+ /*rep_factor=*/ 3,
+ /*num_tservers=*/ 11) {
+ }
+
+ bool is_343_scheme() const override {
+ return true;
+ }
+};
+
+TEST_F(IntraLocationRebalancingBasicTest, LocationsWithEmptyTabletServers) {
+ if (!AllowSlowTests()) {
+ LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
+ return;
+ }
+ const string first_table_name_pattern = "rebalance_test_first_table_$0";
+ const string second_table_name_pattern = "rebalance_test_second_table_$0";
+ const LocationInfo location_info = { { "/A", 3 }, { "/B", 3 }, { "/C", 3 },
+ { "/D", 2 } };
+ const vector<string>& extra_master_flags = {
+ "--master_client_location_assignment_enabled=false",
+ };
+
+ copy(extra_master_flags.begin(), extra_master_flags.end(),
+ back_inserter(master_flags_));
+
+ FLAGS_num_tablet_servers = num_tservers_;
+ FLAGS_num_replicas = rep_factor_;
+ NO_FATALS(BuildAndStart(tserver_flags_, master_flags_, location_info,
+ /*create_table=*/ false));
+
+ // Create special intra location imbalance in location "/D". Location "/D"
+ // should only contain tablet replicas of one table.
+ //
+ // Note: there are 3 tablets in each table and the replica factor is 3,
+ // so it's 9 tablet replicas per table.
+ //
+ // 1. Create table0 with tablet replicas in locations /A, /B and /C. Because
+ // the per table location load skew is not greater than 1, it won't trigger
+ // cross location rebalance, so no replica of table0 will be moved.
+ ASSERT_OK(CreateTablesExcludingLocations({"/D"}, first_table_name_pattern));
+ // 2. Create table1 in { "/A", 3 }, { "/B", 3 }, { "/C", 3 }, {"/D", 1 }.
+ // There is an imbalance in location "/D" of table1.
+ ASSERT_OK(CreateTablesExcludingTserversInLocations({ {"/D", 1 } },
+ second_table_name_pattern));
+
+ const vector<string> tool_args = {
+ "cluster",
+ "rebalance",
+ cluster_->master()->bound_rpc_addr().ToString(),
+ };
+
+ // Before the fix added in this changelist the rebalancer would simply
+ // crash while running. See KUDU-2987 for detail informations.
+ {
+ string out;
+ string err;
+ const Status s = RunKuduTool(tool_args, &out, &err);
+ ASSERT_TRUE(s.ok()) << ToolRunInfo(s, out, err);
+ ASSERT_STR_CONTAINS(out, "rebalancing is complete: cluster is balanced")
+ << "stderr: " << err;
+
+ // Intra location rebalancing in location "/D" should move one replica
+ // to another tserver, so there is at least one move.
+ ASSERT_STR_NOT_CONTAINS(out, "(moved 0 replicas)");
+ ASSERT_STR_CONTAINS(out, "Placement policy violations:\n none\n");
+ }
+}
+
} // namespace tools
} // namespace kudu
diff --git a/src/kudu/tools/rebalancer_tool.cc b/src/kudu/tools/rebalancer_tool.cc
index 7aa505f..a7ebfe5 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -286,9 +286,9 @@ Status RebalancerTool::KsckResultsToClusterRawInfo(
}
if (!replicas_at_location.empty()) {
table_ids_at_location.insert(summary.table_id);
+ tablet_summaries.push_back(summary);
+ tablet_summaries.back().replicas = std::move(replicas_at_location);
}
- tablet_summaries.push_back(summary);
- tablet_summaries.back().replicas = std::move(replicas_at_location);
}
for (const auto& summary : ksck_info.cluster_status.table_summaries) {