You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by gr...@apache.org on 2019/01/10 23:05:17 UTC

[kudu] 01/09: Reduce election-related logging

This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 8fb302289b7fe8eefa9aa3cd9fb1ae97ac4ae5ec
Author: Will Berkeley <wd...@gmail.org>
AuthorDate: Tue Jan 8 16:03:14 2019 -0800

    Reduce election-related logging
    
    Frequent simultaneous pre-elections on lots of tablets are a common
    symptom of underlying problems on Kudu clusters. This results in a ton
    of log spam because a [pre-]election results in multiple log messages.
    
    This patch makes a few conservative changes to reduce the amount of
    election-related logging, especially on candidates.
    
    1. No log message per successful VoteRequest response. The candidate
       previously logged a message for every granted or denied vote from a
       voter, e.g.
    
         I1221 20:16:10.479441  1852 leader_election.cc:387] T cdd0e560bc5c451c978e38c0d7620e52 P dd658299c5194f4b88ae408cd3130103 [CANDIDATE]: Term 1 pre-election: Vote granted by peer 1a228ec03dbc4bb79ef66cd036a99bca (127.1.92.67:38265)
    
       This seems like overkill. Instead, these messages have been moved to
       VLOG(1) and the candidate will print a more detailed summary of the
       election once it is decided:
    
          I0109 12:39:42.203627 158707712 leader_election.cc:304] T 00000000000000000000000000000000 P 99b33389ebdc47b1ae18947dea49a5b7 [CANDIDATE]: Term 2 pre-election: Election decided. Result: candidate won. Election summary: received 2 responses out of 3 voters: 2 yes votes; 0 no votes. yes voters: 465a338e21d84ad1a4e03867eb608623, 99b33389ebdc47b1ae18947dea49a5b7; no voters:
    
       Compared to the current state:
    
         I1221 20:16:10.479472  1852 leader_election.cc:278] T cdd0e560bc5c451c978e38c0d7620e52 P dd658299c5194f4b88ae408cd3130103 [CANDIDATE]: Term 1 pre-election: Election decided. Result: candidate won.
    
    Note that it is still possible to find a record of each vote in each
    voter's INFO logs, and error condition logging has not been changed.
    
    2. No log message per vote request. The candidate previously logged a
       message for each voter it was requesting a vote from. e.g.
    
         I1221 20:16:10.474704  1993 leader_election.cc:251] T cdd0e560bc5c451c978e38c0d7620e52 P 1a228ec03dbc4bb79ef66cd036a99bca [CANDIDATE]: Term 1 pre-election: Requesting pre-vote from peer dd658299c5194f4b88ae408cd3130103 (127.1.92.66:39093)
         I1221 20:16:10.475266  1993 leader_election.cc:251] T cdd0e560bc5c451c978e38c0d7620e52 P 1a228ec03dbc4bb79ef66cd036a99bca [CANDIDATE]: Term 1 pre-election: Requesting pre-vote from peer c624ece2977e486fbab958afb000f122 (127.1.92.65:45651)
    
       Instead, the candidate will summarize all the voters it sent requests
       to:
    
         I0109 12:39:42.206408 156561408 leader_election.cc:291] T 00000000000000000000000000000000 P 99b33389ebdc47b1ae18947dea49a5b7 [CANDIDATE]: Term 2 election: Requested vote from peers 465a338e21d84ad1a4e03867eb608623 (127.0.0.1:7053), 08b8de9c7a7c4ec5b72c9eb275d29cfd (127.0.0.1:7052)
    
    3. No logging of failure detector sleeps. It seems like it might be
       useful to log when the FD sleeps, and why, e.g.
    
         I1221 20:16:12.399258  2803 raft_consensus.cc:2834] T cdd0e560bc5c451c978e38c0d7620e52 P c624ece2977e486fbab958afb000f122: Snoozing failure detection for 0.224s (vote granted)
    
       But frankly in practice it's never been relevant or useful. Since a
       message like this is produced multiple times on the candidate and
       possibly on the voters during an election, I moved it to VLOG(1)
       level to reduce the total amount of logging from elections.
    
    Change-Id: Ie0ab1a37c2b2e9a3f37baf74772b694b907c6adf
    Reviewed-on: http://gerrit.cloudera.org:8080/12200
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/consensus/leader_election.cc | 50 ++++++++++++++++++++++++++++-------
 src/kudu/consensus/leader_election.h  |  3 +++
 src/kudu/consensus/raft_consensus.cc  |  8 +++---
 3 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/kudu/consensus/leader_election.cc b/src/kudu/consensus/leader_election.cc
index bd2c8a9..931e5ef 100644
--- a/src/kudu/consensus/leader_election.cc
+++ b/src/kudu/consensus/leader_election.cc
@@ -21,6 +21,7 @@
 #include <mutex>
 #include <ostream>
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 #include <boost/bind.hpp> // IWYU pragma: keep
@@ -132,6 +133,29 @@ bool VoteCounter::AreAllVotesIn() const {
   return GetTotalVotesCounted() == num_voters_;
 }
 
+string VoteCounter::GetElectionSummary() const {
+  vector<string> yes_voter_uuids;
+  vector<string> no_voter_uuids;
+  for (const auto& entry : votes_) {
+    switch (entry.second) {
+      case VOTE_GRANTED:
+        yes_voter_uuids.push_back(entry.first);
+        break;
+      case VOTE_DENIED:
+        no_voter_uuids.push_back(entry.first);
+        break;
+    }
+  }
+  return Substitute("received $0 responses out of $1 voters: $2 yes votes; "
+                    "$3 no votes. yes voters: $4; no voters: $5",
+                    yes_votes_ + no_votes_,
+                    num_voters_,
+                    yes_votes_,
+                    no_votes_,
+                    JoinStrings(yes_voter_uuids, ", "),
+                    JoinStrings(no_voter_uuids, ", "));
+}
+
 ///////////////////////////////////////////////////
 // ElectionResult
 ///////////////////////////////////////////////////
@@ -224,6 +248,8 @@ void LeaderElection::Run() {
   CheckForDecision();
 
   // The rest of the code below is for a typical multi-node configuration.
+  vector<string> other_voter_info;
+  other_voter_info.reserve(other_voter_uuids.size());
   for (const auto& voter_uuid : other_voter_uuids) {
     VoterState* state = nullptr;
     {
@@ -232,6 +258,7 @@ void LeaderElection::Run() {
       // Safe to drop the lock because voter_state_ is not mutated outside of
       // the constructor / destructor. We do this to avoid deadlocks below.
     }
+    other_voter_info.push_back(state->PeerInfo());
 
     // If we failed to construct the proxy, just record a 'NO' vote with the status
     // that indicates why it failed.
@@ -248,9 +275,6 @@ void LeaderElection::Run() {
     }
 
     // Send the RPC request.
-    LOG_WITH_PREFIX(INFO) << "Requesting "
-                          << (request_.is_pre_election() ? "pre-" : "")
-                          << "vote from peer " << state->PeerInfo();
     state->rpc.set_timeout(timeout_);
 
     state->request = request_;
@@ -265,6 +289,9 @@ void LeaderElection::Run() {
         boost::bind(&Closure::Run,
                     Bind(&LeaderElection::VoteResponseRpcCallback, this, voter_uuid)));
   }
+  LOG_WITH_PREFIX(INFO) << Substitute("Requested $0vote from peers $1",
+                                      request_.is_pre_election() ? "pre-" : "",
+                                      JoinStrings(other_voter_info, ", "));
 }
 
 void LeaderElection::CheckForDecision() {
@@ -275,9 +302,12 @@ void LeaderElection::CheckForDecision() {
     if (!result_ && vote_counter_->IsDecided()) {
       ElectionVote decision;
       CHECK_OK(vote_counter_->GetDecision(&decision));
-      LOG_WITH_PREFIX(INFO) << "Election decided. Result: candidate "
-                << ((decision == VOTE_GRANTED) ? "won." : "lost.");
-      string msg = (decision == VOTE_GRANTED) ?
+      const auto election_won = decision == VOTE_GRANTED;
+      LOG_WITH_PREFIX(INFO) << Substitute("Election decided. Result: candidate $0. "
+                                          "Election summary: $1",
+                                          election_won ? "won" : "lost",
+                                          vote_counter_->GetElectionSummary());
+      string msg = election_won ?
           "achieved majority votes" : "could not achieve majority";
       result_.reset(new ElectionResult(request_, decision, highest_voter_term_, msg));
     }
@@ -384,7 +414,7 @@ void LeaderElection::HandleVoteGrantedUnlocked(const VoterState& state) {
   }
   DCHECK(state.response.vote_granted());
 
-  LOG_WITH_PREFIX(INFO) << "Vote granted by peer " << state.PeerInfo();
+  VLOG_WITH_PREFIX(1) << "Vote granted by peer " << state.PeerInfo();
   RecordVoteUnlocked(state, VOTE_GRANTED);
 }
 
@@ -398,8 +428,10 @@ void LeaderElection::HandleVoteDeniedUnlocked(const VoterState& state) {
     return HandleHigherTermUnlocked(state);
   }
 
-  LOG_WITH_PREFIX(INFO) << "Vote denied by peer " << state.PeerInfo() << ". Message: "
-            << StatusFromPB(state.response.consensus_error().status()).ToString();
+  VLOG_WITH_PREFIX(1) << Substitute(
+      "Vote denied by peer $0. Message: $1",
+      state.PeerInfo(),
+      StatusFromPB(state.response.consensus_error().status()).ToString());
   RecordVoteUnlocked(state, VOTE_DENIED);
 }
 
diff --git a/src/kudu/consensus/leader_election.h b/src/kudu/consensus/leader_election.h
index 5e145e2..8770906 100644
--- a/src/kudu/consensus/leader_election.h
+++ b/src/kudu/consensus/leader_election.h
@@ -78,6 +78,9 @@ class VoteCounter {
   // Return true iff GetTotalVotesCounted() == num_voters_;
   bool AreAllVotesIn() const;
 
+  // Return a summary of the election so far, suitable for logging.
+  std::string GetElectionSummary() const;
+
  private:
   friend class VoteCounterTest;
 
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index d71f8e9..7428fe0 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -2831,10 +2831,10 @@ void RaftConsensus::SnoozeFailureDetector(boost::optional<string> reason_for_log
                                           boost::optional<MonoDelta> delta) {
   if (PREDICT_TRUE(failure_detector_ && FLAGS_enable_leader_failure_detection)) {
     if (reason_for_log) {
-      LOG(INFO) << LogPrefixThreadSafe()
-                << Substitute("Snoozing failure detection for $0 ($1)",
-                              delta ? delta->ToString() : "election timeout",
-                              *reason_for_log);
+      VLOG(1) << LogPrefixThreadSafe()
+              << Substitute("Snoozing failure detection for $0 ($1)",
+                            delta ? delta->ToString() : "election timeout",
+                            *reason_for_log);
     }
 
     if (!delta) {