Posted to commits@kudu.apache.org by al...@apache.org on 2020/01/06 21:29:59 UTC

[kudu] 02/02: [consensus] short-circuit response path for RequestVote()

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 80e1d82c6ff10d5b6e507b68c6f5149dd65bb525
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Fri Dec 20 22:41:16 2019 -0800

    [consensus] short-circuit response path for RequestVote()
    
    I saw cases of contention on a replica's RaftConsensus lock when the
    filesystem was slow to update Raft metadata files after receiving a
    Raft update message from the leader replica, while a vote request
    from another follower replica was waiting on the RaftConsensus
    object's lock.  The latter request would simply be rejected right
    after acquiring the lock because the vote withholding interval had
    just been updated.
    
    This patch moves the last-heard-from-leader check to the very
    beginning of the method, making it possible to respond NO to a vote
    request without acquiring the lock if Raft heartbeats have been
    received from the leader replica recently (see the sketch after the
    diffstat below).  This should help a bit with the overflow of the
    RaftConsensus RPC queue during election storms.
    
    Change-Id: I67efef72b74ce243ca060e89fcec6eb11e95e8e8
    Reviewed-on: http://gerrit.cloudera.org:8080/14954
    Tested-by: Alexey Serbin <as...@cloudera.com>
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
---
 src/kudu/consensus/raft_consensus.cc | 112 ++++++++++++++++++++---------------
 src/kudu/consensus/raft_consensus.h  |  16 ++---
 src/kudu/util/monotime.cc            |  10 ++--
 src/kudu/util/monotime.h             |  19 +++++-
 4 files changed, 93 insertions(+), 64 deletions(-)
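
The essence of the change is a lock-free fast path: 'withhold_votes_until_'
becomes a std::atomic<MonoTime>, so RequestVote() can answer NO without
taking the RaftConsensus lock while the replica has recently heard from the
leader. Below is a minimal sketch of that double-checked pattern, using
std::chrono and hypothetical names (Replica, ShouldGrantVote) rather than
the actual Kudu types:

    #include <atomic>
    #include <chrono>
    #include <mutex>

    using Clock = std::chrono::steady_clock;

    class Replica {
     public:
      bool ShouldGrantVote(bool ignore_live_leader) {
        // Fast path: reject without taking lock_ while votes are withheld.
        if (!ignore_live_leader &&
            Clock::now() < withhold_votes_until_.load()) {
          return false;
        }
        std::lock_guard<std::mutex> guard(lock_);
        // Recheck under the lock: a leader heartbeat may have advanced the
        // deadline between the first check and lock acquisition.
        if (!ignore_live_leader &&
            Clock::now() < withhold_votes_until_.load()) {
          return false;
        }
        // ... full vote-granting logic would go here ...
        return true;
      }

     private:
      std::mutex lock_;
      std::atomic<Clock::time_point> withhold_votes_until_{
          Clock::time_point::min()};
    };

The cheap first check sheds most disruptive requests during an election
storm; the recheck under the lock closes the small window between the
atomic load and lock acquisition, as the patch's second comment explains.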

diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index d4b7d86..dbbf2f4 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -1414,10 +1414,13 @@ Status RaftConsensus::UpdateReplica(const ConsensusRequestPB* request,
       return Status::OK();
     }
 
-    // Snooze the failure detector as soon as we decide to accept the message.
+    // As soon as we decide to accept the message:
+    //   * snooze the failure detector
+    //   * prohibit voting for anyone for the minimum election timeout
     // We are guaranteed to be acting as a FOLLOWER at this point by the above
     // sanity check.
     SnoozeFailureDetector();
+    WithholdVotes();
 
     last_leader_communication_time_micros_ = GetMonoTimeMicros();
 
@@ -1435,9 +1438,6 @@ Status RaftConsensus::UpdateReplica(const ConsensusRequestPB* request,
     // metrics get updated even when the operation is rejected.
     queue_->UpdateLastIndexAppendedToLeader(request->last_idx_appended_to_leader());
 
-    // Also prohibit voting for anyone for the minimum election timeout.
-    WithholdVotesUnlocked();
-
     // 1 - Early commit pending (and committed) transactions
 
     // What should we commit?
@@ -1601,10 +1601,7 @@ Status RaftConsensus::UpdateReplica(const ConsensusRequestPB* request,
       // waiting on our own log.
       if (s.IsTimedOut()) {
         SnoozeFailureDetector();
-        {
-          LockGuard l(lock_);
-          WithholdVotesUnlocked();
-        }
+        WithholdVotes();
       }
     } while (s.IsTimedOut());
     RETURN_NOT_OK(s);
@@ -1647,6 +1644,32 @@ Status RaftConsensus::RequestVote(const VoteRequestPB* request,
                "tablet", options_.tablet_id);
   response->set_responder_uuid(peer_uuid());
 
+  // If we've heard recently from the leader, then we should ignore the request.
+  // It might be from a "disruptive" server. This could happen in a few cases:
+  //
+  // 1) Network partitions
+  // If the leader can talk to a majority of the nodes, but is partitioned from a
+  // bad node, the bad node's failure detector will trigger. If the bad node is
+  // able to reach other nodes in the cluster, it will continuously trigger elections.
+  //
+  // 2) An abandoned node
+  // It's possible that a node has fallen behind the log GC mark of the leader. In that
+  // case, the leader will stop sending it requests. Eventually, the configuration
+  // will change to eject the abandoned node, but until that point, we don't want the
+  // abandoned follower to disturb the other nodes.
+  //
+  // 3) Other dynamic scenarios with a stale former leader
+  // This is a generalization of case 1. It's possible that a stale former
+  // leader detects it's not a leader anymore at some point, but a majority
+  // of replicas has elected a new leader already.
+  //
+  // See also https://ramcloud.stanford.edu/~ongaro/thesis.pdf
+  // section 4.2.3.
+  if (PREDICT_TRUE(!request->ignore_live_leader()) &&
+      MonoTime::Now() < withhold_votes_until_) {
+    return RequestVoteRespondLeaderIsAlive(request, response);
+  }
+
   // We must acquire the update lock in order to ensure that this vote action
   // takes place between requests.
   // Lock ordering: update_lock_ must be acquired before lock_.
@@ -1711,27 +1734,11 @@ Status RaftConsensus::RequestVote(const VoteRequestPB* request,
                                    << request->candidate_uuid();
   }
 
-  // If we've heard recently from the leader, then we should ignore the request.
-  // It might be from a "disruptive" server. This could happen in a few cases:
-  //
-  // 1) Network partitions
-  // If the leader can talk to a majority of the nodes, but is partitioned from a
-  // bad node, the bad node's failure detector will trigger. If the bad node is
-  // able to reach other nodes in the cluster, it will continuously trigger elections.
-  //
-  // 2) An abandoned node
-  // It's possible that a node has fallen behind the log GC mark of the leader. In that
-  // case, the leader will stop sending it requests. Eventually, the the configuration
-  // will change to eject the abandoned node, but until that point, we don't want the
-  // abandoned follower to disturb the other nodes.
-  //
-  // 3) Other dynamic scenarios with a stale former leader
-  // This is a generalization of the case 1. It's possible that a stale former
-  // leader detects it's not a leader anymore at some point, but a majority
-  // of replicas has elected a new leader already.
-  //
-  // See also https://ramcloud.stanford.edu/~ongaro/thesis.pdf
-  // section 4.2.3.
+  // Recheck the heard-from-the-leader condition. There is a slight chance
+  // that a heartbeat from the leader replica registers after the first check
+  // in the very beginning of this method and before lock_ is acquired. This
+  // extra check costs a little, but it helps in avoiding extra election rounds
+  // and disruptive transfers of the replica leadership.
   if (PREDICT_TRUE(!request->ignore_live_leader()) &&
       MonoTime::Now() < withhold_votes_until_) {
     return RequestVoteRespondLeaderIsAlive(request, response);
@@ -2173,15 +2180,14 @@ void RaftConsensus::Stop() {
     if (cmeta_) {
       ClearLeaderUnlocked();
     }
-
-    // If we were the leader, stop witholding votes.
-    if (withhold_votes_until_ == MonoTime::Max()) {
-      withhold_votes_until_ = MonoTime::Min();
-    }
-
     LOG_WITH_PREFIX_UNLOCKED(INFO) << "Raft consensus is shut down!";
   }
 
+  // If we were the leader, stop withholding votes.
+  auto max_time = MonoTime::Max();
+  withhold_votes_until_.compare_exchange_strong(max_time, MonoTime::Min());
+
+
   // Shut down things that might acquire locks during destruction.
   if (raft_pool_token_) raft_pool_token_->Shutdown();
   if (failure_detector_) DisableFailureDetector();
@@ -2319,14 +2325,15 @@ Status RaftConsensus::RequestVoteRespondLastOpIdTooOld(const OpId& local_last_lo
 
 Status RaftConsensus::RequestVoteRespondLeaderIsAlive(const VoteRequestPB* request,
                                                       VoteResponsePB* response) {
-  FillVoteResponseVoteDenied(ConsensusErrorPB::LEADER_IS_ALIVE, response);
-  string msg = Substitute("$0: Denying vote to candidate $1 for term $2 because "
-                          "replica is either leader or believes a valid leader to "
-                          "be alive.",
-                          GetRequestVoteLogPrefixUnlocked(*request),
-                          request->candidate_uuid(),
-                          request->candidate_term());
-  LOG(INFO) << msg;
+  FillVoteResponseVoteDenied(ConsensusErrorPB::LEADER_IS_ALIVE, response,
+                             ResponderTermPolicy::DO_NOT_SET);
+  auto msg = Substitute("$0: Denying vote to candidate $1 for term $2 because "
+                        "replica is either leader or believes a valid leader to "
+                        "be alive.",
+                        GetRequestVoteLogPrefixThreadSafe(*request),
+                        request->candidate_uuid(),
+                        request->candidate_term());
+  VLOG(1) << msg;
   StatusToPB(Status::InvalidArgument(msg),
              response->mutable_consensus_error()->mutable_status());
   return Status::OK();
@@ -2876,10 +2883,17 @@ void RaftConsensus::SnoozeFailureDetector(boost::optional<string> reason_for_log
   }
 }
 
-void RaftConsensus::WithholdVotesUnlocked() {
-  DCHECK(lock_.is_locked());
-  withhold_votes_until_ = std::max(withhold_votes_until_,
-                                   MonoTime::Now() + MinimumElectionTimeout());
+void RaftConsensus::WithholdVotes() {
+  MonoTime prev = withhold_votes_until_;
+  MonoTime next = MonoTime::Now() + MinimumElectionTimeout();
+  do {
+    if (prev == MonoTime::Max()) {
+      // Already at the maximum withholding time. This can happen if the
+      // replica has already become a leader.
+      break;
+    }
+    next = MonoTime::Now() + MinimumElectionTimeout();
+  } while (!withhold_votes_until_.compare_exchange_weak(prev, next));
 }
 
 MonoDelta RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() {
@@ -3047,7 +3061,7 @@ Status RaftConsensus::SetCurrentTermUnlocked(int64_t new_term,
   return Status::OK();
 }
 
-const int64_t RaftConsensus::CurrentTermUnlocked() const {
+int64_t RaftConsensus::CurrentTermUnlocked() const {
   DCHECK(lock_.is_locked());
   return cmeta_->current_term();
 }
@@ -3067,7 +3081,7 @@ void RaftConsensus::ClearLeaderUnlocked() {
   cmeta_->set_leader_uuid("");
 }
 
-const bool RaftConsensus::HasVotedCurrentTermUnlocked() const {
+bool RaftConsensus::HasVotedCurrentTermUnlocked() const {
   DCHECK(lock_.is_locked());
   return cmeta_->has_voted_for();
 }
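
The WithholdVotes() loop above is a standard lock-free read-modify-write:
it advances the withholding deadline while leaving the MonoTime::Max()
sentinel (installed once the replica becomes leader) untouched. Here is a
self-contained sketch of the same loop over a plain atomic integer, with
hypothetical names; for simplicity it computes the new deadline once,
whereas the patch recomputes MonoTime::Now() on each retry:

    #include <atomic>
    #include <cstdint>
    #include <limits>

    std::atomic<int64_t> withhold_until_ns(0);

    void WithholdVotes(int64_t now_ns, int64_t min_election_timeout_ns) {
      int64_t prev = withhold_until_ns.load();
      const int64_t next = now_ns + min_election_timeout_ns;
      do {
        if (prev == std::numeric_limits<int64_t>::max()) {
          // Withholding indefinitely (the sentinel a leader installs):
          // leave the deadline alone.
          return;
        }
        // On failure, compare_exchange_weak reloads the current value
        // into 'prev', so the loop retries against fresh state.
      } while (!withhold_until_ns.compare_exchange_weak(prev, next));
    }
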
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index ddf5020..23f5bef 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -667,7 +667,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // election timeout interval, i.e. 'FLAGS_raft_heartbeat_interval_ms' *
   // 'FLAGS_leader_failure_max_missed_heartbeat_periods' milliseconds.
   // This method is safe to call even if it's a leader replica.
-  void WithholdVotesUnlocked();
+  void WithholdVotes();
 
   // Calculates a snooze delta for leader election.
   //
@@ -784,7 +784,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
                                 FlushToDisk flush) WARN_UNUSED_RESULT;
 
   // Returns the term set in the last config change round.
-  const int64_t CurrentTermUnlocked() const;
+  int64_t CurrentTermUnlocked() const;
 
   // Accessors for the leader of the current term.
   std::string GetLeaderUuidUnlocked() const;
@@ -792,7 +792,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   void ClearLeaderUnlocked();
 
   // Return whether this peer has voted in the current term.
-  const bool HasVotedCurrentTermUnlocked() const;
+  bool HasVotedCurrentTermUnlocked() const;
 
   // Record replica's vote for the current term, then flush the consensus
   // metadata to disk.
@@ -891,10 +891,10 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // the failure detector during elections.
   simple_spinlock failure_detector_election_lock_;
 
-  // If any RequestVote() RPC arrives before this timestamp,
-  // the request will be ignored. This prevents abandoned or partitioned
-  // nodes from disturbing the healthy leader.
-  MonoTime withhold_votes_until_;
+  // Any RequestVote() arriving before this timestamp is ignored (i.e. responded
+  // to with NO vote). This prevents abandoned or partitioned nodes from
+  // disturbing the healthy leader.
+  std::atomic<MonoTime> withhold_votes_until_;
 
   // The last OpId received from the current leader. This is updated whenever the follower
   // accepts operations from a leader, and passed back so that the leader knows from what
@@ -918,6 +918,8 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
 
   FunctionGaugeDetacher metric_detacher_;
 
+  // Wrapped into std::atomic<> to simplify synchronization between
+  // consensus-related writers and readers of the attached metric gauge.
   std::atomic<int64_t> last_leader_communication_time_micros_;
 
   scoped_refptr<Counter> follower_memory_pressure_rejections_;
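
The std::atomic<int64_t> wrapper on last_leader_communication_time_micros_
above is a simple publication channel between the consensus write path and
the metrics thread that polls the attached gauge. A hypothetical sketch of
that pattern (relaxed ordering is likely sufficient for an advisory metric,
though the actual code may rely on the default seq_cst):

    #include <atomic>
    #include <cstdint>
    #include <functional>

    std::atomic<int64_t> last_heard_us(0);

    // Writer side (consensus code path): publish the latest contact time.
    void OnLeaderMessage(int64_t now_us) {
      last_heard_us.store(now_us, std::memory_order_relaxed);
    }

    // Reader side: a metric gauge periodically polls through a callback.
    std::function<int64_t()> MakeGaugeCallback() {
      return [] { return last_heard_us.load(std::memory_order_relaxed); };
    }
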
diff --git a/src/kudu/util/monotime.cc b/src/kudu/util/monotime.cc
index b6ffa65..b0299cc 100644
--- a/src/kudu/util/monotime.cc
+++ b/src/kudu/util/monotime.cc
@@ -192,8 +192,8 @@ const MonoTime& MonoTime::Earliest(const MonoTime& a, const MonoTime& b) {
   return a;
 }
 
-MonoTime::MonoTime()
-  : nanos_(0) {
+MonoTime::MonoTime() KUDU_MONOTIME_NOEXCEPT
+    : nanos_(0) {
 }
 
 bool MonoTime::Initialized() const {
@@ -238,7 +238,7 @@ MonoTime& MonoTime::operator-=(const MonoDelta& delta) {
   return *this;
 }
 
-MonoTime::MonoTime(const struct timespec &ts) {
+MonoTime::MonoTime(const struct timespec &ts) KUDU_MONOTIME_NOEXCEPT {
   // Monotonic time resets when the machine reboots.  The 64-bit limitation
   // means that we can't represent times larger than 292 years, which should be
   // adequate.
@@ -248,8 +248,8 @@ MonoTime::MonoTime(const struct timespec &ts) {
   nanos_ += ts.tv_nsec;
 }
 
-MonoTime::MonoTime(int64_t nanos)
-  : nanos_(nanos) {
+MonoTime::MonoTime(int64_t nanos) KUDU_MONOTIME_NOEXCEPT
+    : nanos_(nanos) {
 }
 
 double MonoTime::ToSeconds() const {
diff --git a/src/kudu/util/monotime.h b/src/kudu/util/monotime.h
index f0013a1..7943019 100644
--- a/src/kudu/util/monotime.h
+++ b/src/kudu/util/monotime.h
@@ -35,6 +35,19 @@
 
 #include "kudu/util/kudu_export.h"
 
+
+// The 'noexcept' specifier is recognized by C++11-capable compilers, but this
+// file must compile with a C++98 compiler as well. As it turns out, adding
+// 'noexcept' doesn't affect the generated symbols of the exported MonoTime
+// class, so it's safe to turn it on when compiling in C++11 mode. The
+// 'noexcept' specifier is useful when wrapping MonoTime into std::atomic<>,
+// where the standard C++ library explicitly requires it.
+#ifdef LANG_CXX11
+#define KUDU_MONOTIME_NOEXCEPT noexcept
+#else
+#define KUDU_MONOTIME_NOEXCEPT
+#endif // #ifdef LANG_CXX11 ... #else ...
+
 namespace kudu {
 
 /// @brief A representation of a time interval.
@@ -182,7 +195,7 @@ class KUDU_EXPORT MonoTime {
 
   /// Build a MonoTime object. The resulting object is not initialized
   /// and not ready to use.
-  MonoTime();
+  MonoTime() KUDU_MONOTIME_NOEXCEPT;
 
   /// @return @c true iff the object is initialized.
   bool Initialized() const;
@@ -257,8 +270,8 @@ class KUDU_EXPORT MonoTime {
   FRIEND_TEST(TestMonoTime, TestTimeSpec);
   FRIEND_TEST(TestMonoTime, TestDeltaConversions);
 
-  explicit MonoTime(const struct timespec &ts);
-  explicit MonoTime(int64_t nanos);
+  explicit MonoTime(const struct timespec &ts) KUDU_MONOTIME_NOEXCEPT;
+  explicit MonoTime(int64_t nanos) KUDU_MONOTIME_NOEXCEPT;
   double ToSeconds() const;
   int64_t nanos_;
 };
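
The monotime.h changes exist because std::atomic<MonoTime> places
requirements on MonoTime itself: the type must be trivially copyable, and
common standard-library implementations declare std::atomic's default
constructor as 'noexcept = default', which in turn requires T's default
constructor to be noexcept. A minimal illustration with a hypothetical
stand-in type (not the actual MonoTime):

    #include <atomic>
    #include <cstdint>
    #include <type_traits>

    // Stand-in for MonoTime: one int64_t payload, noexcept constructors.
    struct FakeTime {
      FakeTime() noexcept : nanos(0) {}
      explicit FakeTime(int64_t n) noexcept : nanos(n) {}
      int64_t nanos;
    };

    // A user-provided default constructor does not break trivial
    // copyability, which is what std::atomic<T> checks for.
    static_assert(std::is_trivially_copyable<FakeTime>::value,
                  "std::atomic<T> requires T to be trivially copyable");

    std::atomic<FakeTime> deadline{FakeTime()};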