Posted to commits@kudu.apache.org by aw...@apache.org on 2020/01/16 03:28:45 UTC

[kudu] 02/02: KUDU-3011 p6: don't transfer leadership to quiescing followers

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 31ed4a11de3f5a158d90d76b306c2d96fe07a55d
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Tue Jan 14 15:39:43 2020 -0800

    KUDU-3011 p6: don't transfer leadership to quiescing followers
    
    When a tablet server is quiescing, any followers hosted on it should not
    be considered candidates to be the leader's successor.
    
    A quiescing follower would already never become leader, because it
    rejects any StartElection request immediately. This patch nips such
    requests in the bud: followers now indicate in their
    ConsensusResponses whether the server hosting them is quiescing, and
    the leader avoids transferring leadership to any peer that is.
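
    Concretely, the response a quiescing follower sends back now carries
    one extra bit, e.g. (a sketch using the generated protobuf API; the
    values are invented for illustration):

        ConsensusResponsePB resp;
        resp.set_responder_term(7);
        resp.mutable_status()->set_last_committed_idx(123);
        resp.set_server_quiescing(true);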
    
    I considered another approach to reducing the number of fruitless
    RPCs: simply throttling the interval at which a leader may send
    StartElection requests. I opted for the current approach since it
    prevents extraneous StartElection requests more completely.
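
    For comparison, the throttling alternative would have amounted to a
    rate limit along these lines (a hypothetical sketch, not part of this
    patch; the helper name and throttle parameter are invented):

        #include "kudu/util/monotime.h"

        // Hypothetical throttle: allow another transfer attempt only if
        // 'throttle_ms' has elapsed since the last one, updating the
        // timestamp when an attempt is allowed.
        bool ShouldAttemptTransfer(kudu::MonoTime* last_attempt, int throttle_ms) {
          const kudu::MonoTime now = kudu::MonoTime::Now();
          if (last_attempt->Initialized() &&
              now < *last_attempt + kudu::MonoDelta::FromMilliseconds(throttle_ms)) {
            return false;
          }
          *last_attempt = now;
          return true;
        }

    That would only slow the stream of doomed StartElection requests,
    whereas piggybacking the quiescing state stops them entirely once the
    leader has heard from the peer.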
    
    Change-Id: I74ec79d0bc4dbe42fce0cca2e001cd0b369cd066
    Reviewed-on: http://gerrit.cloudera.org:8080/15035
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/consensus/consensus.proto                   |  5 +++++
 src/kudu/consensus/consensus_queue-test.cc           | 17 +++++++++++++++++
 src/kudu/consensus/consensus_queue.cc                |  8 +++++++-
 src/kudu/consensus/consensus_queue.h                 |  4 ++++
 src/kudu/consensus/raft_consensus.cc                 |  3 +++
 .../tablet_server_quiescing-itest.cc                 | 20 ++++++++++++++++++++
 6 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/kudu/consensus/consensus.proto b/src/kudu/consensus/consensus.proto
index b6a2d9a..dc770e7 100644
--- a/src/kudu/consensus/consensus.proto
+++ b/src/kudu/consensus/consensus.proto
@@ -404,6 +404,11 @@ message ConsensusResponsePB {
   // The current consensus status of the receiver peer.
   optional ConsensusStatusPB status = 3;
 
+  // Whether the server that hosts the peer is quiescing. This doesn't
+  // necessarily have any bearing on the state of the Raft peer itself, but it
+  // does indicate that the peer should not be a candidate for leadership.
+  optional bool server_quiescing = 4;
+
   // A generic error message (such as tablet not found), per operation
   // error messages are sent along with the consensus status.
   optional tserver.TabletServerErrorPB error = 999;
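
Since the new field is optional with explicit presence, a leader can tell
"peer did not report" (e.g. an older server during a rolling upgrade) apart
from "peer explicitly reported false", and it treats both as eligible for
transfer. The leader-side read reduces to something like the following
(mirroring the ResponseFromPeer() change further below; 'resp' here stands
for a received ConsensusResponsePB):

    // Absent (old peer) and explicit false are treated the same: the
    // peer is not known to be quiescing.
    bool remote_quiescing = resp.has_server_quiescing() && resp.server_quiescing();
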
diff --git a/src/kudu/consensus/consensus_queue-test.cc b/src/kudu/consensus/consensus_queue-test.cc
index 092d475..eba226c 100644
--- a/src/kudu/consensus/consensus_queue-test.cc
+++ b/src/kudu/consensus/consensus_queue-test.cc
@@ -339,6 +339,23 @@ TEST_F(ConsensusQueueTest, TestTransferLeadershipWhenAppropriate) {
   config.mutable_peers(1)->set_member_type(RaftPeerPB::NON_VOTER);
   queue_->SetLeaderMode(10, 1, config);
   NO_FATALS(verify_elections(/*election_happened*/false));
+
+  // Now undo that.
+  config.mutable_peers(1)->set_member_type(RaftPeerPB::VOTER);
+  queue_->SetLeaderMode(10, 1, config);
+  NO_FATALS(verify_elections(/*election_happened*/true));
+
+  // If the peer reported itself as quiescing, we also wouldn't trigger an
+  // election.
+  peer_response.set_server_quiescing(true);
+  NO_FATALS(verify_elections(/*election_happened*/false));
+
+  // If the peer stops reporting its server as quiescing, elections will start
+  // up again.
+  peer_response.clear_server_quiescing();
+  NO_FATALS(verify_elections(/*election_happened*/true));
+  peer_response.set_server_quiescing(false);
+  NO_FATALS(verify_elections(/*election_happened*/true));
 }
 
 // Tests that the queue is able to track a peer when it starts tracking a peer
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 3c6373f..619bc18 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -146,6 +146,7 @@ PeerMessageQueue::TrackedPeer::TrackedPeer(RaftPeerPB peer_pb)
       last_exchange_status(PeerStatus::NEW),
       last_communication_time(MonoTime::Now()),
       wal_catchup_possible(true),
+      remote_server_quiescing(false),
       last_overall_health_status(HealthReportPB::UNKNOWN),
       status_log_throttler(std::make_shared<logging::LogThrottler>()),
       last_seen_term_(0) {
@@ -1079,8 +1080,9 @@ void PeerMessageQueue::TransferLeadershipIfNeeded(const TrackedPeer& peer,
   // Do some basic sanity checks that we can actually transfer leadership to
   // the given peer.
   if (queue_state_.mode != PeerMessageQueue::LEADER ||
+      local_peer_pb_.permanent_uuid() == peer.uuid() ||
       peer.last_exchange_status != PeerStatus::OK ||
-      local_peer_pb_.permanent_uuid() == peer.uuid()) {
+      peer.remote_server_quiescing) {
     return;
   }
 
@@ -1155,6 +1157,10 @@ bool PeerMessageQueue::ResponseFromPeer(const std::string& peer_uuid,
     // offset between the local leader and the remote peer.
     UpdateExchangeStatus(peer, prev_peer_state, response, &send_more_immediately);
 
+    // If the peer is hosted on a server that is quiescing, note that now.
+    peer->remote_server_quiescing = response.has_server_quiescing() &&
+                                    response.server_quiescing();
+
     // If the reported last-received op for the replica is in our local log,
     // then resume sending entries from that point onward. Otherwise, resume
     // after the last op they received from us. If we've never successfully
diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h
index 430a313..def0ba3 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -167,6 +167,10 @@ class PeerMessageQueue {
     // the local peer's WAL.
     bool wal_catchup_possible;
 
+    // Whether the server hosting the peer is quiescing, in which case the
+    // peer is not a candidate to be the leader's successor.
+    bool remote_server_quiescing;
+
     // The peer's latest overall health status.
     HealthReportPB::HealthStatus last_overall_health_status;
 
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 4f99b71..d9ca4f4 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -1641,6 +1641,9 @@ void RaftConsensus::FillConsensusResponseOKUnlocked(ConsensusResponsePB* respons
       last_received_cur_leader_);
   response->mutable_status()->set_last_committed_idx(
       queue_->GetCommittedIndex());
+  if (PREDICT_TRUE(server_ctx_.quiescing) && server_ctx_.quiescing->load()) {
+    response->set_server_quiescing(true);
+  }
 }
 
 void RaftConsensus::FillConsensusResponseError(ConsensusResponsePB* response,
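
The quiescing bit itself lives outside of consensus: judging from the check
above, server_ctx_.quiescing is a pointer to an atomic flag owned by the
hosting server, and it may be null (e.g. in some test setups). A minimal
standalone sketch of the added check, under that assumption:

    #include <atomic>

    // Report quiescing only when the server wired up the flag and it is
    // currently set.
    bool IsServerQuiescing(const std::atomic<bool>* quiescing) {
      return quiescing != nullptr && quiescing->load();
    }
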
diff --git a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
index 70c4eb9..dde8883 100644
--- a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
+++ b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
@@ -57,6 +57,8 @@ DECLARE_int32(scanner_default_batch_size_bytes);
 DECLARE_int32(scanner_ttl_ms);
 DECLARE_int32(raft_heartbeat_interval_ms);
 
+METRIC_DECLARE_histogram(handler_latency_kudu_consensus_ConsensusService_RunLeaderElection);
+
 using kudu::client::KuduClient;
 using kudu::client::KuduScanBatch;
 using kudu::client::KuduScanner;
@@ -514,12 +516,30 @@ TEST_P(TServerQuiescingParamITest, TestWriteWhileAllQuiescing) {
   for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
     *cluster_->mini_tablet_server(i)->server()->mutable_quiescing() = true;
   }
+  // Counts the number of times we were requested to start elections
+  // cluster-wide.
+  auto get_num_elections = [&] () {
+    int num_elections = 0;
+    for (int i = 0; i < kNumReplicas; i++) {
+      auto* ts = cluster_->mini_tablet_server(i)->server();
+      scoped_refptr<Histogram> hist(ts->metric_entity()->FindOrCreateHistogram(
+          &METRIC_handler_latency_kudu_consensus_ConsensusService_RunLeaderElection));
+      num_elections += hist->TotalCount();
+    }
+    return num_elections;
+  };
+
+  int initial_num_elections = get_num_elections();
+  ASSERT_LT(0, initial_num_elections);
 
   // We should continue to write uninterrupted.
   int start_rows = first_workload->rows_inserted();
   ASSERT_EVENTUALLY([&] {
     ASSERT_GT(first_workload->rows_inserted(), start_rows + 1000);
   });
+
+  // We also should not have triggered any elections.
+  ASSERT_EQ(initial_num_elections, get_num_elections());
 }
 
 TEST_P(TServerQuiescingParamITest, TestAbruptStepdownWhileAllQuiescing) {