Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/27 06:16:30 UTC

[6/6] kudu git commit: KUDU-1097 (patch 1): Make leader report config member health to master

KUDU-1097 (patch 1): Make leader report config member health to master

Passes existing tests (when the feature is disabled).

Tests will be added in a follow-up patch.
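
For context, the flow this patch introduces is: the leader's PeerMessageQueue
tracks a health status for each peer, RaftConsensus merges those statuses into
the committed config when building a ConsensusStatePB, and TSTabletManager
attaches that state to the tablet report sent to the master. A minimal sketch
of the consumer side, using only names from the hunks below (the surrounding
setup is assumed, not part of this patch):

    // Sketch: 'consensus' is a running RaftConsensus on the leader replica.
    ConsensusStatePB cstate =
        consensus->ConsensusState(RaftConsensus::INCLUDE_HEALTH_REPORT);
    for (const RaftPeerPB& peer : cstate.committed_config().peers()) {
      // Peers the leader does not track carry no health report.
      if (!peer.has_health_report()) continue;
      LOG(INFO) << peer.permanent_uuid() << " health: "
                << HealthReportPB::HealthStatus_Name(
                       peer.health_report().overall_health());
    }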

Change-Id: Ia5081cbe0c0d81733a781d4729211dd0c530cdfa
Reviewed-on: http://gerrit.cloudera.org:8080/8630
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/88e39bad
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/88e39bad
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/88e39bad

Branch: refs/heads/master
Commit: 88e39bad14cfab17882d72b69d3382c219b93c23
Parents: b436845
Author: Mike Percy <mp...@apache.org>
Authored: Tue Nov 21 20:44:18 2017 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Mon Nov 27 06:13:55 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_queue.cc           | 175 +++++++++++++++----
 src/kudu/consensus/consensus_queue.h            |  28 ++-
 src/kudu/consensus/raft_consensus.cc            |  34 +++-
 src/kudu/consensus/raft_consensus.h             |  18 +-
 .../integration-tests/raft_consensus-itest.cc   |   3 +-
 src/kudu/tserver/ts_tablet_manager.cc           |   8 +-
 6 files changed, 224 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 0bf75a7..b3897f7 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -48,6 +48,7 @@
 #include "kudu/util/logging.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/pb_util.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/threadpool.h"
 #include "kudu/util/url-coding.h"
 
@@ -70,12 +71,14 @@ TAG_FLAG(consensus_inject_latency_ms_in_notifications, unsafe);
 
 DECLARE_int32(consensus_rpc_timeout_ms);
 DECLARE_bool(safe_time_advancement_without_writes);
+DECLARE_bool(raft_prepare_replacement_before_eviction);
 
 using kudu::log::Log;
 using kudu::pb_util::SecureDebugString;
 using kudu::pb_util::SecureShortDebugString;
 using std::string;
 using std::unique_ptr;
+using std::unordered_map;
 using std::vector;
 using strings::Substitute;
 
@@ -244,6 +247,25 @@ void PeerMessageQueue::UntrackPeer(const string& uuid) {
   }
 }
 
+unordered_map<string, HealthReportPB> PeerMessageQueue::ReportHealthOfPeers() const {
+  unordered_map<string, HealthReportPB> reports;
+  std::lock_guard<simple_spinlock> lock(queue_lock_);
+  for (const auto& entry : peers_map_) {
+    const string& peer_uuid = entry.first;
+    const TrackedPeer* peer = entry.second;
+    HealthReportPB report;
+    auto overall_health = peer->last_overall_health_status;
+    // We always consider the local peer (ourselves) to be healthy.
+    // TODO(mpercy): Is this always a safe assumption?
+    if (peer_uuid == local_peer_pb_.permanent_uuid()) {
+      overall_health = HealthReportPB::HEALTHY;
+    }
+    report.set_overall_health(overall_health);
+    reports.emplace(peer_uuid, std::move(report));
+  }
+  return reports;
+}
+
 void PeerMessageQueue::CheckPeersInActiveConfigIfLeaderUnlocked() const {
   DCHECK(queue_lock_.is_locked());
   if (queue_state_.mode != LEADER) return;
@@ -369,20 +391,20 @@ OpId PeerMessageQueue::GetNextOpId() const {
                   queue_state_.last_appended.index() + 1);
 }
 
-bool PeerMessageQueue::SafeToEvict(const string& evict_uuid) {
+bool PeerMessageQueue::SafeToEvictUnlocked(const string& evict_uuid) const {
+  DCHECK(queue_lock_.is_locked());
   auto now = MonoTime::Now();
 
-  std::lock_guard<simple_spinlock> lock(queue_lock_);
   int remaining_voters = 0;
   int remaining_viable_voters = 0;
 
   for (const auto& e : peers_map_) {
     const auto& uuid = e.first;
     const auto& peer = e.second;
-    if (!IsRaftConfigVoter(uuid, *queue_state_.active_config)) {
+    if (uuid == evict_uuid) {
       continue;
     }
-    if (uuid == evict_uuid) {
+    if (!IsRaftConfigVoter(uuid, *queue_state_.active_config)) {
       continue;
     }
     remaining_voters++;
@@ -429,6 +451,76 @@ bool PeerMessageQueue::SafeToEvict(const string& evict_uuid) {
   return true;
 }
 
+void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
+  DCHECK(queue_lock_.is_locked());
+
+  auto overall_health_status = PeerHealthStatus(*peer);
+
+  // Prepare error messages for different conditions.
+  string error_msg;
+  if (overall_health_status == HealthReportPB::FAILED) {
+    if (peer->last_exchange_status == PeerStatus::TABLET_FAILED) {
+      error_msg = Substitute("The tablet replica hosted on peer $0 has failed", peer->uuid);
+    } else if (!peer->wal_catchup_possible) {
+      error_msg = Substitute("The logs necessary to catch up peer $0 have been "
+                             "garbage collected. The replica will never be able "
+                             "to catch up", peer->uuid);
+    } else {
+      error_msg = Substitute("Leader has been unable to successfully communicate "
+                             "with peer $0 for more than $1 seconds ($2)",
+                             peer->uuid,
+                             FLAGS_follower_unavailable_considered_failed_sec,
+                             (MonoTime::Now() - peer->last_communication_time).ToString());
+    }
+  }
+
+  bool changed = overall_health_status != peer->last_overall_health_status;
+  peer->last_overall_health_status = overall_health_status;
+
+  if (FLAGS_raft_prepare_replacement_before_eviction) {
+    if (changed) {
+      if (overall_health_status == HealthReportPB::FAILED) {
+        // Only log when the status changes to FAILED.
+        LOG_WITH_PREFIX_UNLOCKED(INFO) << error_msg;
+      }
+      // Only notify when there is a change.
+      NotifyObserversOfPeerHealthChange();
+    }
+  } else {
+    if (overall_health_status == HealthReportPB::FAILED &&
+        SafeToEvictUnlocked(peer->uuid)) {
+      NotifyObserversOfFailedFollower(peer->uuid, queue_state_.current_term, error_msg);
+    }
+  }
+}
+
+HealthReportPB::HealthStatus PeerMessageQueue::PeerHealthStatus(const TrackedPeer& peer) {
+  // Unreachable peers are considered failed.
+  auto max_unreachable = MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
+  if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
+    return HealthReportPB::FAILED;
+  }
+
+  // Replicas returning TABLET_FAILED status are considered to have FAILED health.
+  if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
+    return HealthReportPB::FAILED;
+  }
+
+  // If we have never connected to this peer before, and we have not exceeded
+  // the unreachable timeout, its health is unknown.
+  if (peer.last_exchange_status == PeerStatus::NEW) {
+    return HealthReportPB::UNKNOWN;
+  }
+
+  // Peers that can no longer be caught up from the leader's retained WAL are
+  // considered failed.
+  if (!peer.wal_catchup_possible) {
+    return HealthReportPB::FAILED;
+  }
+
+  // All other cases are considered healthy.
+  return HealthReportPB::HEALTHY;
+}
+
 Status PeerMessageQueue::RequestForPeer(const string& uuid,
                                         ConsensusRequestPB* request,
                                         vector<ReplicateRefPtr>* msg_refs,
@@ -436,18 +528,18 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
   // Maintain a thread-safe copy of necessary members.
   OpId preceding_id;
   int64_t current_term;
-  TrackedPeer peer;
+  TrackedPeer peer_copy;
   MonoDelta unreachable_time;
   {
     std::lock_guard<simple_spinlock> lock(queue_lock_);
     DCHECK_EQ(queue_state_.state, kQueueOpen);
     DCHECK_NE(uuid, local_peer_pb_.permanent_uuid());
 
-    TrackedPeer* peer_ptr = FindPtrOrNull(peers_map_, uuid);
-    if (PREDICT_FALSE(peer_ptr == nullptr || queue_state_.mode == NON_LEADER)) {
+    TrackedPeer* peer = FindPtrOrNull(peers_map_, uuid);
+    if (PREDICT_FALSE(peer == nullptr || queue_state_.mode == NON_LEADER)) {
       return Status::NotFound("Peer not tracked or queue not in leader mode.");
     }
-    peer = *peer_ptr;
+    peer_copy = *peer;
 
     // Clear the requests without deleting the entries, as they may be in use by other peers.
     request->mutable_ops()->ExtractSubrange(0, request->ops_size(), nullptr);
@@ -461,19 +553,22 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
     request->set_all_replicated_index(queue_state_.all_replicated_index);
     request->set_last_idx_appended_to_leader(queue_state_.last_appended.index());
     request->set_caller_term(current_term);
-    unreachable_time = MonoTime::Now() - peer.last_communication_time;
-  }
-  if (unreachable_time.ToSeconds() > FLAGS_follower_unavailable_considered_failed_sec) {
-    if (SafeToEvict(uuid)) {
-      string msg = Substitute("Leader has been unable to successfully communicate "
-                              "with Peer $0 for more than $1 seconds ($2)",
-                              uuid,
-                              FLAGS_follower_unavailable_considered_failed_sec,
-                              unreachable_time.ToString());
-      NotifyObserversOfFailedFollower(uuid, current_term, msg);
-    }
+    unreachable_time = MonoTime::Now() - peer_copy.last_communication_time;
   }
-  if (peer.last_exchange_status == PeerStatus::TABLET_NOT_FOUND) {
+
+  // Always trigger a health status update check at the end of this function.
+  bool wal_catchup_progress = false;
+  bool wal_catchup_failure = false;
+  SCOPED_CLEANUP({
+      std::lock_guard<simple_spinlock> lock(queue_lock_);
+      TrackedPeer* peer = FindPtrOrNull(peers_map_, uuid);
+      if (!peer) return;
+      if (wal_catchup_progress) peer->wal_catchup_possible = true;
+      if (wal_catchup_failure) peer->wal_catchup_possible = false;
+      UpdatePeerHealthUnlocked(peer);
+    });
+
+  if (peer_copy.last_exchange_status == PeerStatus::TABLET_NOT_FOUND) {
     VLOG(3) << LogPrefixUnlocked() << "Peer " << uuid << " needs tablet copy" << THROTTLE_MSG;
     *needs_tablet_copy = true;
     return Status::OK();
@@ -483,14 +578,14 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
   // If we've never communicated with the peer, we don't know what messages to
   // send, so we'll send a status-only request. Otherwise, we grab requests
   // from the log starting at the last_received point.
-  if (peer.last_exchange_status != PeerStatus::NEW) {
+  if (peer_copy.last_exchange_status != PeerStatus::NEW) {
 
     // The batch of messages to send to the peer.
     vector<ReplicateRefPtr> messages;
     int max_batch_size = FLAGS_consensus_max_batch_size_bytes - request->ByteSize();
 
     // We try to get the follower's next_index from our log.
-    Status s = log_cache_.ReadOps(peer.next_index - 1,
+    Status s = log_cache_.ReadOps(peer_copy.next_index - 1,
                                   max_batch_size,
                                   &messages,
                                   &preceding_id);
@@ -501,7 +596,7 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
         string msg = Substitute("The logs necessary to catch up peer $0 have been "
                                 "garbage collected. The follower will never be able "
                                 "to catch up ($1)", uuid, s.ToString());
-        NotifyObserversOfFailedFollower(uuid, current_term, msg);
+        wal_catchup_failure = true;
         return s;
       // IsIncomplete() means that we tried to read beyond the head of the log
       // (in the future). See KUDU-1078.
@@ -510,14 +605,18 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
         LOG_WITH_PREFIX_UNLOCKED(ERROR) << "Error trying to read ahead of the log "
                                         << "while preparing peer request: "
                                         << s.ToString() << ". Destination peer: "
-                                        << peer.ToString();
+                                        << peer_copy.ToString();
         return s;
       }
       LOG_WITH_PREFIX_UNLOCKED(FATAL) << "Error reading the log while preparing peer request: "
                                       << s.ToString() << ". Destination peer: "
-                                      << peer.ToString();
+                                      << peer_copy.ToString();
     }
 
+    // Since we were able to read ops through the log cache, we know that
+    // catchup is possible.
+    wal_catchup_progress = true;
+
     // We use AddAllocated rather than copy, because we pin the log cache at the
     // "all replicated" point. At some point we may want to allow partially loading
     // (and not pinning) earlier messages. At that point we'll need to do something
@@ -537,7 +636,7 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
   if (request->ops_size() > 0) {
     int64_t last_op_sent = request->ops(request->ops_size() - 1).id().index();
     if (last_op_sent < request->committed_index()) {
-      KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, peer.status_log_throttler, "lagging")
+      KLOG_EVERY_N_SECS_THROTTLER(INFO, 3, peer_copy.status_log_throttler, "lagging")
           << LogPrefixUnlocked() << "Peer " << uuid << " is lagging by at least "
           << (request->committed_index() - last_op_sent)
           << " ops behind the committed index " << THROTTLE_MSG;
@@ -719,11 +818,7 @@ void PeerMessageQueue::UpdatePeerStatus(const string& peer_uuid,
       break;
 
     case PeerStatus::TABLET_FAILED: {
-      // Use the current term to ensure the peer will be evicted, otherwise this
-      // notification may be ignored.
-      int64_t current_term = queue_state_.current_term;
-      l.unlock();
-      NotifyObserversOfFailedFollower(peer_uuid, current_term, status.ToString());
+      UpdatePeerHealthUnlocked(peer);
       return;
     }
 
@@ -1150,6 +1245,24 @@ void PeerMessageQueue::NotifyObserversOfFailedFollowerTask(const string& uuid,
   }
 }
 
+void PeerMessageQueue::NotifyObserversOfPeerHealthChange() {
+  WARN_NOT_OK(raft_pool_observers_token_->SubmitClosure(
+      Bind(&PeerMessageQueue::NotifyObserversOfPeerHealthChangeTask, Unretained(this))),
+              LogPrefixUnlocked() + "Unable to notify RaftConsensus of peer health change.");
+}
+
+void PeerMessageQueue::NotifyObserversOfPeerHealthChangeTask() {
+  MAYBE_INJECT_RANDOM_LATENCY(FLAGS_consensus_inject_latency_ms_in_notifications);
+  std::vector<PeerMessageQueueObserver*> observers_copy;
+  {
+    std::lock_guard<simple_spinlock> lock(queue_lock_);
+    observers_copy = observers_;
+  }
+  for (PeerMessageQueueObserver* observer : observers_copy) {
+    observer->NotifyPeerHealthChange();
+  }
+}
+
 PeerMessageQueue::~PeerMessageQueue() {
   Close();
 }
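
A note on PeerHealthStatus() above: the checks are ordered, so the
unreachable-timeout check takes precedence over the NEW check. A peer the
leader has never heard from is reported UNKNOWN only until
--follower_unavailable_considered_failed_sec elapses; after that it is
reported FAILED even though its last_exchange_status is still PeerStatus::NEW.
A test-style sketch of that behavior (hypothetical: it glosses over
TrackedPeer construction details and the fact that PeerHealthStatus() is a
private member):

    // Hypothetical illustration, not part of the patch.
    PeerMessageQueue::TrackedPeer peer;
    peer.last_exchange_status = PeerStatus::NEW;
    peer.last_communication_time = MonoTime::Now();
    // Never connected, not yet timed out: health is UNKNOWN.
    ASSERT_EQ(HealthReportPB::UNKNOWN, PeerMessageQueue::PeerHealthStatus(peer));
    // Age the peer past the unavailability threshold...
    peer.last_communication_time = MonoTime::Now() - MonoDelta::FromSeconds(
        FLAGS_follower_unavailable_considered_failed_sec + 1);
    // ...and the unreachable check now fires before the NEW check.
    ASSERT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));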

http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/consensus/consensus_queue.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h
index c3cf620..871089b 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -120,6 +120,8 @@ class PeerMessageQueue {
           last_known_committed_index(MinimumOpId().index()),
           last_exchange_status(PeerStatus::NEW),
           last_communication_time(MonoTime::Now()),
+          wal_catchup_possible(true),
+          last_overall_health_status(HealthReportPB::UNKNOWN),
           last_seen_term_(0) {}
 
     TrackedPeer() = default;
@@ -165,6 +167,13 @@ class PeerMessageQueue {
     // successful communication ever took place.
     MonoTime last_communication_time;
 
+    // Set to false if it is determined that the remote peer has fallen behind
+    // the local peer's WAL.
+    bool wal_catchup_possible;
+
+    // The peer's latest overall health status.
+    HealthReportPB::HealthStatus last_overall_health_status;
+
     // Throttler for how often we will log status messages pertaining to this
     // peer (eg when it is lagging, etc).
     logging::LogThrottler status_log_throttler;
@@ -211,6 +220,10 @@ class PeerMessageQueue {
   // Makes the queue untrack this peer.
   void UntrackPeer(const std::string& uuid);
 
+  // Returns a health report for each peer tracked by the queue, keyed by peer
+  // UUID. The local peer is always reported as HEALTHY.
+  std::unordered_map<std::string, HealthReportPB> ReportHealthOfPeers() const;
+
   // Appends a single message to be replicated to the peers.
   // Returns OK unless the message could not be added to the queue for some
   // reason (e.g. the queue reached max size).
@@ -419,7 +432,14 @@ class PeerMessageQueue {
 
   // Return true if it would be safe to evict the peer 'evict_uuid' at this
   // point in time.
-  bool SafeToEvict(const std::string& evict_uuid);
+  bool SafeToEvictUnlocked(const std::string& evict_uuid) const;
+
+  // Update a peer's last_overall_health_status field and trigger the
+  // appropriate notifications.
+  void UpdatePeerHealthUnlocked(TrackedPeer* peer);
+
+  // Calculate a peer's up-to-date health status based on internal fields.
+  static HealthReportPB::HealthStatus PeerHealthStatus(const TrackedPeer& peer);
 
   void NotifyObserversOfCommitIndexChange(int64_t new_commit_index);
   void NotifyObserversOfCommitIndexChangeTask(int64_t new_commit_index);
@@ -434,6 +454,9 @@ class PeerMessageQueue {
                                            int64_t term,
                                            const std::string& reason);
 
+  void NotifyObserversOfPeerHealthChange();
+  void NotifyObserversOfPeerHealthChangeTask();
+
   typedef std::unordered_map<std::string, TrackedPeer*> PeersMap;
 
   std::string ToStringUnlocked() const;
@@ -520,6 +543,9 @@ class PeerMessageQueueObserver {
                                     int64_t term,
                                     const std::string& reason) = 0;
 
+  // Notify the observer that the health of one of the peers has changed.
+  virtual void NotifyPeerHealthChange() = 0;
+
   virtual ~PeerMessageQueueObserver() {}
 };
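
The new notification follows the queue's existing observer pattern:
NotifyObserversOfPeerHealthChange() submits a task to the Raft observers'
thread pool, and the task invokes NotifyPeerHealthChange() on each registered
observer outside the queue lock. RaftConsensus is the real observer here (see
the next file); a minimal sketch of a custom observer, with the other method
signatures inferred from the RaftConsensus overrides in this patch:

    // Hypothetical observer, for illustration only.
    class LoggingObserver : public PeerMessageQueueObserver {
     public:
      void NotifyCommitIndex(int64_t commit_index) override {}
      void NotifyTermChange(int64_t term) override {}
      void NotifyFailedFollower(const std::string& uuid, int64_t term,
                                const std::string& reason) override {}
      void NotifyPeerHealthChange() override {
        LOG(INFO) << "a tracked peer's health status changed";
      }
    };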
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 58cc087..45066bb 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -24,8 +24,9 @@
 #include <memory>
 #include <mutex>
 #include <ostream>
-#include <unordered_set>
 #include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
@@ -778,6 +779,10 @@ void RaftConsensus::NotifyFailedFollower(const string& uuid,
               LogPrefixThreadSafe() + "Unable to start RemoteFollowerTask");
 }
 
+void RaftConsensus::NotifyPeerHealthChange() {
+  MarkDirty("Peer health change");
+}
+
 void RaftConsensus::TryRemoveFollowerTask(const string& uuid,
                                           const RaftConfigPB& committed_config,
                                           const std::string& reason) {
@@ -2166,10 +2171,31 @@ const string& RaftConsensus::tablet_id() const {
   return options_.tablet_id;
 }
 
-ConsensusStatePB RaftConsensus::ConsensusState() const {
+ConsensusStatePB RaftConsensus::ConsensusState(IncludeHealthReport report_health) const {
   ThreadRestrictions::AssertWaitAllowed();
-  LockGuard l(lock_);
-  return cmeta_->ToConsensusStatePB();
+  UniqueLock l(lock_);
+  ConsensusStatePB cstate = cmeta_->ToConsensusStatePB();
+
+  // If we need to include the health report, merge it into the committed
+  // config iff we believe we are the current leader of the config.
+  if (report_health == INCLUDE_HEALTH_REPORT &&
+      cmeta_->active_role() == RaftPeerPB::LEADER) {
+    auto reports = queue_->ReportHealthOfPeers();
+
+    // We don't need to access the queue anymore, so drop the consensus lock.
+    l.unlock();
+
+    // Iterate through each peer in the committed config and attach the health
+    // report to it.
+    RaftConfigPB* committed_raft_config = cstate.mutable_committed_config();
+    for (int i = 0; i < committed_raft_config->peers_size(); i++) {
+      RaftPeerPB* peer = committed_raft_config->mutable_peers(i);
+      const HealthReportPB* report = FindOrNull(reports, peer->permanent_uuid());
+      if (!report) continue; // Only attach details if we know about the peer.
+      *peer->mutable_health_report() = *report;
+    }
+  }
+  return cstate;
 }
 
 RaftConfigPB RaftConsensus::CommittedConfig() const {

http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 7229bb7..fb43e10 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -277,8 +277,16 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
 
   scoped_refptr<TimeManager> time_manager() const { return time_manager_; }
 
+  enum IncludeHealthReport {
+    EXCLUDE_HEALTH_REPORT,
+    INCLUDE_HEALTH_REPORT
+  };
+
   // Returns a copy of the state of the consensus system.
-  ConsensusStatePB ConsensusState() const;
+  // If 'report_health' is set to 'INCLUDE_HEALTH_REPORT', and if the
+  // local replica believes it is the leader of the config, it will include a
+  // health report about each active peer in the committed config.
+  ConsensusStatePB ConsensusState(IncludeHealthReport report_health = EXCLUDE_HEALTH_REPORT) const;
 
   // Returns a copy of the current committed Raft configuration.
   RaftConfigPB CommittedConfig() const;
@@ -309,13 +317,15 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // Updates the committed_index and triggers the Apply()s for whatever
   // transactions were pending.
   // This is idempotent.
-  void NotifyCommitIndex(int64_t commit_index);
+  void NotifyCommitIndex(int64_t commit_index) override;
 
-  void NotifyTermChange(int64_t term);
+  void NotifyTermChange(int64_t term) override;
 
   void NotifyFailedFollower(const std::string& uuid,
                             int64_t term,
-                            const std::string& reason);
+                            const std::string& reason) override;
+
+  void NotifyPeerHealthChange() override;
 
   // Return the log indexes which the consensus implementation would like to retain.
   //

http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index e41a331..c31d7eb 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -2424,7 +2424,8 @@ TEST_F(RaftConsensusITest, TestUpdateConsensusErrorNonePrepared) {
 TEST_F(RaftConsensusITest, TestCorruptReplicaMetadata) {
   // Start cluster and wait until we have a stable leader.
   // Switch off tombstoning of evicted replicas to observe the failed tablet state.
-  NO_FATALS(BuildAndStart({}, { "--master_tombstone_evicted_tablet_replicas=false" }));
+  NO_FATALS(BuildAndStart({ "--consensus_rpc_timeout_ms=10000" }, // Ensure we are safe to evict.
+                          { "--master_tombstone_evicted_tablet_replicas=false" }));
   ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_,
                                   tablet_id_, 1));
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/88e39bad/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 9e234ec..bef2d9d 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -29,6 +29,7 @@
 #include <boost/bind.hpp> // IWYU pragma: keep
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
+#include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
 #include "kudu/clock/clock.h"
@@ -118,6 +119,8 @@ DEFINE_int32(tablet_state_walk_min_period_ms, 1000,
              "tablet map to update tablet state counts.");
 TAG_FLAG(tablet_state_walk_min_period_ms, advanced);
 
+DECLARE_bool(raft_prepare_replacement_before_eviction);
+
 METRIC_DEFINE_gauge_int32(server, tablets_num_not_initialized,
                           "Number of Not Initialized Tablets",
                           kudu::MetricUnit::kTablets,
@@ -1153,7 +1156,10 @@ void TSTabletManager::CreateReportedTabletPB(const scoped_refptr<TabletReplica>&
   // We cannot get consensus state information unless the TabletReplica is running.
   shared_ptr<consensus::RaftConsensus> consensus = replica->shared_consensus();
   if (consensus) {
-    *reported_tablet->mutable_consensus_state() = consensus->ConsensusState();
+    auto include_health = FLAGS_raft_prepare_replacement_before_eviction ?
+                          RaftConsensus::INCLUDE_HEALTH_REPORT :
+                          RaftConsensus::EXCLUDE_HEALTH_REPORT;
+    *reported_tablet->mutable_consensus_state() = consensus->ConsensusState(include_health);
   }
 }
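
End to end, with --raft_prepare_replacement_before_eviction enabled, the
consensus state a leader reports to the master would now look roughly like the
following (illustrative text-format protobuf; field names come from the hunks
above, but the exact shape of the report message is assumed):

    consensus_state {
      current_term: 4
      leader_uuid: "peer-a"
      committed_config {
        peers { permanent_uuid: "peer-a" member_type: VOTER
                health_report { overall_health: HEALTHY } }
        peers { permanent_uuid: "peer-b" member_type: VOTER
                health_report { overall_health: HEALTHY } }
        peers { permanent_uuid: "peer-c" member_type: VOTER
                health_report { overall_health: FAILED } }
      }
    }

With the flag off (the feature disabled), ConsensusState() is called with
EXCLUDE_HEALTH_REPORT and the reported state is unchanged, which is why the
existing tests pass.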