You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/28 21:11:03 UTC

kudu git commit: [consensus_queue] update replica health reporting

Repository: kudu
Updated Branches:
  refs/heads/master 68fa8010d -> 32aacb4e7


[consensus_queue] update replica health reporting

Report the health status of a replica as HEALTHY only if the
most recent exchange status was Status::OK.  Except for a few
other definitive cases where it's clear that the replica has failed,
the health of the replica is reported as UNKNOWN.

The rationale behind this is simple:

  * The HEALTHY replicas are the ones which can replicate Raft
    transactions from the leader replica with no issues.

  * The FAILED replicas are the ones which will definitely fail
    to replicate Raft transactions.

  * The rest goes into the UNKNOWN bucket, which covers the cases
    when there hasn't been a single attempt to contact the replica yet,
    there was a transient error during the last communication with the
    replica, etc.

Change-Id: I99da4352da0b0d1846c92eb2abac7197a58bfd62
Reviewed-on: http://gerrit.cloudera.org:8080/8663
Tested-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/32aacb4e
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/32aacb4e
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/32aacb4e

Branch: refs/heads/master
Commit: 32aacb4e7c9a13afa5c80237ac181d940e7fa3d2
Parents: 68fa801
Author: Alexey Serbin <as...@cloudera.com>
Authored: Mon Nov 27 21:25:02 2017 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Tue Nov 28 21:10:49 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_queue.cc | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/32aacb4e/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index e1c329a..5fc968d 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -502,30 +502,32 @@ void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
 }
 
 HealthReportPB::HealthStatus PeerMessageQueue::PeerHealthStatus(const TrackedPeer& peer) {
-  // Unreachable peers are considered failed.
+  // Replicas which have been unreachable for too long are considered failed.
   auto max_unreachable = MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
   if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
     return HealthReportPB::FAILED;
   }
 
-  // Replicas returning TABLET_FAILED status are considered to have FAILED health.
-  if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
+  // Replicas that have fallen behind the leader's retained WAL are considered failed.
+  if (!peer.wal_catchup_possible) {
     return HealthReportPB::FAILED;
   }
 
-  // If we have never connected to this peer before, and we have not exceeded
-  // the unreachable timeout, its health is unknown.
-  if (peer.last_exchange_status == PeerStatus::NEW) {
-    return HealthReportPB::UNKNOWN;
+  // Replicas returning TABLET_FAILED status are considered failed.
+  if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
+    return HealthReportPB::FAILED;
   }
 
-  // Tablets that have fallen behind the leader's retained WAL are considered failed.
-  if (!peer.wal_catchup_possible) {
-    return HealthReportPB::FAILED;
+  // The happy case: replicas returned OK during the recent exchange are considered healthy.
+  if (peer.last_exchange_status == PeerStatus::OK) {
+    return HealthReportPB::HEALTHY;
   }
 
-  // All other cases are considered healthy.
-  return HealthReportPB::HEALTHY;
+  // Other cases are for various situations when there hasn't been a contact
+  // with the replica yet or it's impossible to definitely tell the health
+  // status of the replica based on the last exchange status (transient error,
+  // etc.). For such cases, the replica health status is reported as UNKNOWN.
+  return HealthReportPB::UNKNOWN;
 }
 
 Status PeerMessageQueue::RequestForPeer(const string& uuid,