You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2019/10/01 06:05:26 UTC
[kudu] 02/04: KUDU-2069 p5: recheck tablet health when exiting
maintenance mode
This is an automated email from the ASF dual-hosted git repository.
adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 3cd6bd0b6b2eaba1027974eabc1490d90691ce82
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Sat Sep 28 23:15:30 2019 -0700
KUDU-2069 p5: recheck tablet health when exiting maintenance mode
Previously, when exiting maintenance mode for a given tserver, if the
replicas of that tserver were unhealthy, there was no mechanism with
which to guarantee that the proper re-replication would happen.
Specifically, the following sequence of events was possible:
1. tablet T has replicas on tservers A, B*, C
2. A enters maintenance mode
3. A is shut down
4. enough time passes for B* to consider A as failed
5. B* notices the failure of A and reports to the master that replica A
has failed
6. the master does nothing to schedule re-replication because A is in
maintenance mode
7. A exits maintenance mode, but is not brought back online
8. B* never hears back from A, and never hits a health state change to
report to the master, and so the master never "rechecks" the health
of T
9. T is left under-replicated with only B* and C
Note: The set of tservers we need to recheck is the set that hosted a
leader replica of any tablet that had a replica on A.
This patch addresses this by requesting a full tablet report from every
tserver in the cluster upon exiting maintenance mode on any tserver.
Testing:
- this adds to the existing integration test for maintenance mode to
exercise the new behavior
- amends an existing concurrency test to verify the correct locking
behavior is used
Change-Id: Ic0ab3d78cbc5b1228c01592a00118f11f76e43dd
Reviewed-on: http://gerrit.cloudera.org:8080/14223
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
src/kudu/integration-tests/maintenance_mode-itest.cc | 11 +++++++++++
src/kudu/master/master_service.cc | 14 ++++++++++++++
src/kudu/master/ts_descriptor.cc | 11 +++++++++++
src/kudu/master/ts_descriptor.h | 9 +++++++++
src/kudu/master/ts_manager.cc | 11 +++++++++++
src/kudu/master/ts_manager.h | 11 ++++++++++-
src/kudu/master/ts_state-test.cc | 9 ++++++++-
7 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/src/kudu/integration-tests/maintenance_mode-itest.cc b/src/kudu/integration-tests/maintenance_mode-itest.cc
index 6d620f4..f982586 100644
--- a/src/kudu/integration-tests/maintenance_mode-itest.cc
+++ b/src/kudu/integration-tests/maintenance_mode-itest.cc
@@ -270,6 +270,17 @@ TEST_F(MaintenanceModeRF3ITest, TestFailedTServerInMaintenanceModeDoesntRereplic
SleepFor(kDurationForSomeHeartbeats);
NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/false));
+ // Now set maintenance mode, bring the tablet server down, and then exit
+ // maintenance mode without bringing the tablet server back up. This should
+ // result in tablet copies.
+ ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::ENTER_MAINTENANCE_MODE));
+ NO_FATALS(maintenance_ts->Shutdown());
+ NO_FATALS(AssertEventuallyNumFailedReplicas(ts_map, kNumTablets));
+ ASSERT_OK(ChangeTServerState(maintenance_uuid, TServerStateChangePB::EXIT_MAINTENANCE_MODE));
+ ASSERT_EVENTUALLY([&] {
+ NO_FATALS(ExpectStartedTabletCopies(/*should_have_started*/true));
+ });
+
// All the while, our workload should not have been interrupted. Assert
// eventually to wait for the rows to converge.
NO_FATALS(create_table.StopAndJoin());
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index 6941ed7..f9e74e1 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -338,6 +338,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req,
rpc->RespondFailure(s.CloneAndPrepend("Failed to process tablet report"));
return;
}
+ // If we previously needed a full tablet report for the tserver (e.g.
+ // because we need to recheck replica states after exiting from maintenance
+ // mode) and have just received a full report, mark that we no longer need
+ // a full tablet report.
+ if (!req->tablet_report().is_incremental()) {
+ ts_desc->UpdateNeedsFullTabletReport(false);
+ }
}
// 6. Only leaders sign CSR from tablet servers (if present).
@@ -367,6 +374,13 @@ void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req,
}
}
+ // 8. Check if we need a full tablet report (e.g. the tablet server just
+ // exited maintenance mode and needs to check whether any replicas need to
+ // be moved).
+ if (is_leader_master && ts_desc->needs_full_report()) {
+ resp->set_needs_full_tablet_report(true);
+ }
+
rpc->RespondSuccess();
}
diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc
index e89b27f..90f035d 100644
--- a/src/kudu/master/ts_descriptor.cc
+++ b/src/kudu/master/ts_descriptor.cc
@@ -76,6 +76,7 @@ TSDescriptor::TSDescriptor(std::string perm_id)
: permanent_uuid_(std::move(perm_id)),
latest_seqno_(-1),
last_heartbeat_(MonoTime::Now()),
+ needs_full_report_(false),
recent_replica_creations_(0),
last_replica_creations_decay_(MonoTime::Now()),
num_live_replicas_(0) {
@@ -162,6 +163,16 @@ MonoDelta TSDescriptor::TimeSinceHeartbeat() const {
return now - last_heartbeat_;
}
+void TSDescriptor::UpdateNeedsFullTabletReport(bool needs_report) {
+ std::lock_guard<rw_spinlock> l(lock_);
+ needs_full_report_ = needs_report;
+}
+
+bool TSDescriptor::needs_full_report() const {
+ shared_lock<rw_spinlock> l(lock_);
+ return needs_full_report_;
+}
+
bool TSDescriptor::PresumedDead() const {
return TimeSinceHeartbeat().ToMilliseconds() >= FLAGS_tserver_unresponsive_timeout_ms;
}
diff --git a/src/kudu/master/ts_descriptor.h b/src/kudu/master/ts_descriptor.h
index 800710f..9c4199a 100644
--- a/src/kudu/master/ts_descriptor.h
+++ b/src/kudu/master/ts_descriptor.h
@@ -78,6 +78,12 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> {
// Set the last-heartbeat time to now.
void UpdateHeartbeatTime();
+ // Set whether a full tablet report is needed.
+ void UpdateNeedsFullTabletReport(bool needs_report);
+
+ // Whether a full tablet report is needed from this tablet server.
+ bool needs_full_report() const;
+
// Return the amount of time since the last heartbeat received
// from this TS.
MonoDelta TimeSinceHeartbeat() const;
@@ -181,6 +187,9 @@ class TSDescriptor : public enable_make_shared<TSDescriptor> {
// The last time a heartbeat was received for this node.
MonoTime last_heartbeat_;
+ // Whether the tablet server needs to send a full report.
+ bool needs_full_report_;
+
// The number of times this tablet server has recently been selected to create a
// tablet replica. This value decays back to 0 over time.
double recent_replica_creations_;
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 79d3088..a44526b 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -241,6 +241,10 @@ Status TSManager::SetTServerState(const string& ts_uuid,
RETURN_NOT_OK_PREPEND(sys_catalog->RemoveTServerState(ts_uuid),
Substitute("Failed to remove tserver state for $0", ts_uuid));
ts_state_by_uuid_.erase(ts_uuid);
+ // If exiting maintenance mode, make sure that any replica failures that
+ // may have been ignored while in maintenance mode are reprocessed. To do
+ // so, request full tablet reports across all tablet servers.
+ SetAllTServersNeedFullTabletReports();
return Status::OK();
}
SysTServerStateEntryPB pb;
@@ -269,6 +273,13 @@ Status TSManager::ReloadTServerStates(SysCatalogTable* sys_catalog) {
return sys_catalog->VisitTServerStates(&loader);
}
+void TSManager::SetAllTServersNeedFullTabletReports() {
+ lock_guard<rw_spinlock> l(lock_);
+ for (auto& id_and_desc : servers_by_id_) {
+ id_and_desc.second->UpdateNeedsFullTabletReport(true);
+ }
+}
+
int TSManager::ClusterSkew() const {
int min_count = std::numeric_limits<int>::max();
int max_count = 0;
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index d1cc925..cb0c6dc 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -100,6 +100,9 @@ class TSManager {
int GetCount() const;
// Sets the tserver state for the given tserver, persisting it to disk.
+ //
+ // If removing a tserver from maintenance mode, this also sets that all
+ // tablet servers must report back a full tablet reports.
Status SetTServerState(const std::string& ts_uuid,
TServerStatePB ts_state,
SysCatalogTable* sys_catalog);
@@ -124,10 +127,16 @@ class TSManager {
// is not dead, not in maintenance mode).
bool AvailableForPlacementUnlocked(const TSDescriptor& ts) const;
- mutable rw_spinlock lock_;
+ // Sets that all registered tablet servers need to report back with a full
+ // tablet report. This may be necessary, e.g., after exiting maintenance mode
+ // to recheck any ignored failures.
+ void SetAllTServersNeedFullTabletReports();
FunctionGaugeDetacher metric_detacher_;
+ // Protects 'servers_by_id_'.
+ mutable rw_spinlock lock_;
+
// TODO(awong): add a map from HostPort to descriptor so we aren't forced to
// know UUIDs up front, e.g. if specifying a given tablet server for
// maintenance mode, it'd be easier for users to specify the HostPort.
diff --git a/src/kudu/master/ts_state-test.cc b/src/kudu/master/ts_state-test.cc
index 387dac5..9ba575b 100644
--- a/src/kudu/master/ts_state-test.cc
+++ b/src/kudu/master/ts_state-test.cc
@@ -238,7 +238,7 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) {
}
// Spin up a bunch of threads that contend for setting the state for a
// limited number of tablet servers.
- Barrier b(kNumThreadsPerTServer * kNumTServers);
+ Barrier b(kNumThreadsPerTServer * kNumTServers + kNumTServers);
for (int i = 0; i < kNumThreadsPerTServer; i++) {
for (const auto& ts : tservers) {
threads.emplace_back([&, ts] {
@@ -247,6 +247,13 @@ TEST_F(TServerStateTest, TestConcurrentSetTServerState) {
});
}
}
+ // Concurrently, register the servers.
+ for (const auto& ts : tservers) {
+ threads.emplace_back([&, ts] {
+ b.Wait();
+ CHECK_OK(SendHeartbeat(ts));
+ });
+ }
for (auto& t : threads) {
t.join();
}