You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/08/23 21:30:06 UTC

kudu git commit: KUDU-1407: reassign failed tablets

Repository: kudu
Updated Branches:
  refs/heads/master 65b7bf970 -> a9d17c00a


KUDU-1407: reassign failed tablets

Tablets put into the state tablet::FAILED are left until they are
manually deleted; they are not evicted and reassigned. If a tablet
fails to bootstrap, it will sit, responding to heartbeats, doing nothing
else. This patch ensures failed tablets will be reassigned.

As the tablets are not used, rather than directly setting replicas to
FAILED, an error is first recorded and the TabletReplica::Shutdown(), leaving
the final state as FAILED. A replica can no longer leave the FAILED
state (calls to Shutdown() leave it FAILED). The tserver response
generated by FAILED tablets is now TABLET_FAILED. Upon receiving this, a
leader will immediately evict the peer.

Prior to this patch, a tablet was marked FAILED if its WAL or metadata
failed to delete (after already shutting down). If this occurs, there is
no guarantee that the tablet's metadata reflects the deleted state. This
has been made fatal.

Testing is done in a few places:
- raft_consensus-itest is updated to ensure that tablets that fail to
  bootstrap are evicted and replaced.
- tablet_server-test is also updated to ensure that, instead of
  TABLET_NOT_RUNNING, TABLET_FAILED is returned by failed tablets.
- a test is added to ts_tablet_manager-itest to test that a tablet that
  is manually failed while running is evicted and replaced.

This patch is a part of a series of patches to handle disk failure. See
section 2.5 in this doc:
https://docs.google.com/document/d/1zZk-vb_ETKUuePcZ9ZqoSK2oPvAAaEV1sjDXes8Pxgk/edit

Change-Id: I5f61585b02fbe270d215bf7f49c0d390ceee3345
Reviewed-on: http://gerrit.cloudera.org:8080/7440
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a9d17c00
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a9d17c00
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a9d17c00

Branch: refs/heads/master
Commit: a9d17c00af8ebd32e8a93eb8a3e98f989d0feda7
Parents: 65b7bf9
Author: Andrew Wong <aw...@cloudera.com>
Authored: Thu Jul 13 15:36:27 2017 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Wed Aug 23 21:29:45 2017 +0000

----------------------------------------------------------------------
 src/kudu/client/scanner-internal.cc             |  1 +
 src/kudu/consensus/consensus_peers.cc           |  9 +++
 src/kudu/consensus/consensus_queue.cc           | 14 +++-
 src/kudu/consensus/consensus_queue.h            |  4 ++
 .../integration-tests/raft_consensus-itest.cc   | 17 +++--
 src/kudu/integration-tests/ts_recovery-itest.cc | 13 ++--
 .../ts_tablet_manager-itest.cc                  | 70 ++++++++++++++++----
 src/kudu/tablet/tablet_replica.cc               | 17 +++--
 src/kudu/tablet/tablet_replica.h                | 20 +++---
 src/kudu/tserver/tablet_server-test.cc          |  2 +-
 src/kudu/tserver/tablet_service.cc              |  5 +-
 src/kudu/tserver/ts_tablet_manager.cc           | 37 ++++++++---
 src/kudu/tserver/tserver.proto                  |  3 +
 13 files changed, 154 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index aec1cfc..f61a5a3 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -246,6 +246,7 @@ ScanRpcStatus KuduScanner::Data::AnalyzeResponse(const Status& rpc_status,
       return ScanRpcStatus{ScanRpcStatus::SCANNER_EXPIRED, server_status};
     case tserver::TabletServerErrorPB::TABLET_NOT_RUNNING:
       return ScanRpcStatus{ScanRpcStatus::TABLET_NOT_RUNNING, server_status};
+    case tserver::TabletServerErrorPB::TABLET_FAILED: // fall-through
     case tserver::TabletServerErrorPB::TABLET_NOT_FOUND:
       return ScanRpcStatus{ScanRpcStatus::TABLET_NOT_FOUND, server_status};
     default:

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/consensus/consensus_peers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc
index 66d155c..17fcc18 100644
--- a/src/kudu/consensus/consensus_peers.cc
+++ b/src/kudu/consensus/consensus_peers.cc
@@ -297,6 +297,15 @@ void Peer::ProcessResponse() {
     return;
   }
 
+  // Notify consensus that the peer has failed.
+  if (response_.has_error() && response_.error().code() == TabletServerErrorPB::TABLET_FAILED) {
+    Status response_status = StatusFromPB(response_.error().status());
+    queue_->NotifyPeerHasFailed(peer_pb_.permanent_uuid(),
+                                response_status.ToString());
+    ProcessResponseError(response_status);
+    return;
+  }
+
   // Pass through errors we can respond to, like not found, since in that case
   // we will need to start a Tablet Copy. TODO: Handle DELETED response once implemented.
   if ((response_.has_error() &&

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 4445137..6e29ab2 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -617,13 +617,25 @@ void PeerMessageQueue::UpdateLastIndexAppendedToLeader(int64_t last_idx_appended
   UpdateLagMetricsUnlocked();
 }
 
-void PeerMessageQueue::NotifyPeerIsResponsive(const std::string& peer_uuid) {
+void PeerMessageQueue::NotifyPeerIsResponsive(const string& peer_uuid) {
   std::lock_guard<simple_spinlock> l(queue_lock_);
   TrackedPeer* peer = FindPtrOrNull(peers_map_, peer_uuid);
   if (!peer) return;
   peer->last_successful_communication_time = MonoTime::Now();
 }
 
+void PeerMessageQueue::NotifyPeerHasFailed(const string& peer_uuid, const string& reason) {
+  std::unique_lock<simple_spinlock> l(queue_lock_);
+  TrackedPeer* peer = FindPtrOrNull(peers_map_, peer_uuid);
+  if (peer) {
+    // Use the current term to ensure the peer will be evicted, otherwise this
+    // notification may be ignored.
+    int64_t current_term = queue_state_.current_term;
+    l.unlock();
+    NotifyObserversOfFailedFollower(peer_uuid, current_term, reason);
+  }
+}
+
 void PeerMessageQueue::ResponseFromPeer(const std::string& peer_uuid,
                                         const ConsensusResponsePB& response,
                                         bool* more_pending) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/consensus/consensus_queue.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h
index 976dd1d..d977eb3 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -239,6 +239,10 @@ class PeerMessageQueue {
   // it is alive and making progress.
   void NotifyPeerIsResponsive(const std::string& peer_uuid);
 
+  // Notify consensus that the given peer has failed.
+  void NotifyPeerHasFailed(const std::string& peer_uuid,
+                           const std::string& reason);
+
   // Updates the request queue with the latest response of a peer, returns
   // whether this peer has more requests pending.
   void ResponseFromPeer(const std::string& peer_uuid,

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 5c0918f..43bb4ab 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -3016,14 +3016,14 @@ TEST_F(RaftConsensusITest, TestUpdateConsensusErrorNonePrepared) {
 }
 
 // Test that, if the raft metadata on a replica is corrupt, then the server
-// doesn't crash, but instead just marks the tablet as corrupt.
+// doesn't crash, but instead marks the tablet as failed.
 TEST_F(RaftConsensusITest, TestCorruptReplicaMetadata) {
   // Start cluster and wait until we have a stable leader.
-  NO_FATALS(BuildAndStart());
+  // Switch off tombstoning of evicted replicas to observe the failed tablet state.
+  NO_FATALS(BuildAndStart({}, { "--master_tombstone_evicted_tablet_replicas=false" }));
   ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_,
                                   tablet_id_, 1));
 
-
   // Shut down one of the tablet servers, and then muck
   // with its consensus metadata to corrupt it.
   auto* ts = cluster_->tablet_server(0);
@@ -3042,13 +3042,12 @@ TEST_F(RaftConsensusITest, TestCorruptReplicaMetadata) {
                                    tablet::FAILED,
                                    MonoDelta::FromSeconds(30)));
 
-  // Currently, the tablet server does not automatically delete FAILED replicas.
-  // So, manually delete the bad replica in order to recover.
-  ASSERT_OK(itest::DeleteTablet(tablet_servers_[ts->uuid()], tablet_id_,
-                                tablet::TABLET_DATA_TOMBSTONED, boost::none,
-                                MonoDelta::FromSeconds(30)));
+  // Switch on deletion of evicted replicas (this is default behavior) to ensure
+  // the tablet is reassigned.
+  ASSERT_OK(cluster_->SetFlag(cluster_->master(0),
+      "master_tombstone_evicted_tablet_replicas", "true"));
 
-  // A new good copy should get created.
+  // A new good copy should get created automatically.
   ASSERT_OK(WaitUntilTabletInState(tablet_servers_[ts->uuid()],
                                    tablet_id_,
                                    tablet::RUNNING,

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index bc00958..4edfbf8 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -106,13 +106,15 @@ class TsRecoveryITest : public ExternalMiniClusterITestBase,
   TsRecoveryITest() { extra_tserver_flags_.emplace_back(GetParam()); }
 
  protected:
-  void StartClusterOneTs(const vector<string>& extra_tserver_flags = {});
+  void StartClusterOneTs(const vector<string>& extra_tserver_flags,
+                         const vector<string>& extra_master_flags = {});
 
   vector<string> extra_tserver_flags_ = {};
 };
 
-void TsRecoveryITest::StartClusterOneTs(const vector<string>& extra_tserver_flags) {
-  StartCluster(extra_tserver_flags, {}, 1 /* replicas */);
+void TsRecoveryITest::StartClusterOneTs(const vector<string>& extra_tserver_flags,
+                                        const vector<string>& extra_master_flags) {
+  StartCluster(extra_tserver_flags, extra_master_flags, 1 /* replicas */);
 }
 
 #if defined(__APPLE__)
@@ -304,7 +306,10 @@ TEST_P(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
 // - the bootstrap should fail (but not crash) because it cannot apply the cell size
 // - if we manually increase the cell size limit again, it should replay correctly.
 TEST_P(TsRecoveryITest, TestChangeMaxCellSize) {
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_));
+  // Prevent the master from tombstoning the evicted tablet so we can observe
+  // its FAILED state.
+  NO_FATALS(StartClusterOneTs(extra_tserver_flags_,
+      { "--master_tombstone_evicted_tablet_replicas=false" }));
   TestWorkload work(cluster_.get());
   work.set_num_replicas(1);
   work.set_payload_bytes(10000);

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/integration-tests/ts_tablet_manager-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_tablet_manager-itest.cc b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
index 58d2342..97ccf77 100644
--- a/src/kudu/integration-tests/ts_tablet_manager-itest.cc
+++ b/src/kudu/integration-tests/ts_tablet_manager-itest.cc
@@ -40,6 +40,7 @@
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
 #include "kudu/integration-tests/internal_mini_cluster.h"
+#include "kudu/integration-tests/test_workload.h"
 #include "kudu/master/master.pb.h"
 #include "kudu/master/master.proxy.h"
 #include "kudu/master/mini_master.h"
@@ -81,45 +82,88 @@ using rpc::MessengerBuilder;
 using std::shared_ptr;
 using std::string;
 using std::vector;
+using std::unique_ptr;
 using strings::Substitute;
 using tablet::TabletReplica;
 using tserver::MiniTabletServer;
-using tserver::TSTabletManager;
 
 static const char* const kTableName = "test-table";
-static const int kNumReplicas = 2;
 
 class TsTabletManagerITest : public KuduTest {
  public:
   TsTabletManagerITest()
       : schema_(SimpleIntKeyKuduSchema()) {
   }
-  virtual void SetUp() OVERRIDE;
+  virtual void SetUp() override {
+    KuduTest::SetUp();
+
+    MessengerBuilder bld("client");
+    ASSERT_OK(bld.Build(&client_messenger_));
+  }
+
+  void StartCluster(int num_replicas) {
+    InternalMiniClusterOptions opts;
+    opts.num_tablet_servers = num_replicas;
+    cluster_.reset(new InternalMiniCluster(env_, opts));
+    ASSERT_OK(cluster_->Start());
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client_));
+  }
 
  protected:
   const KuduSchema schema_;
 
-  gscoped_ptr<InternalMiniCluster> cluster_;
+  unique_ptr<InternalMiniCluster> cluster_;
   client::sp::shared_ptr<KuduClient> client_;
   std::shared_ptr<Messenger> client_messenger_;
 };
 
-void TsTabletManagerITest::SetUp() {
-  KuduTest::SetUp();
-
-  MessengerBuilder bld("client");
-  ASSERT_OK(bld.Build(&client_messenger_));
+// Test that when a tablet is marked as failed, it will eventually be evicted
+// and replaced.
+TEST_F(TsTabletManagerITest, TestFailedTabletsAreReplaced) {
+  const int kNumReplicas = 3;
+  NO_FATALS(StartCluster(kNumReplicas));
 
   InternalMiniClusterOptions opts;
   opts.num_tablet_servers = kNumReplicas;
-  cluster_.reset(new InternalMiniCluster(env_, opts));
-  ASSERT_OK(cluster_->Start());
-  ASSERT_OK(cluster_->CreateClient(nullptr, &client_));
+  unique_ptr<InternalMiniCluster> cluster(new InternalMiniCluster(env_, opts));
+  ASSERT_OK(cluster->Start());
+  TestWorkload work(cluster.get());
+  work.set_num_replicas(3);
+  work.Setup();
+  work.Start();
+
+  // Insert data until until the tablet becomes visible to the server.
+  MiniTabletServer* ts = cluster->mini_tablet_server(0);
+  string tablet_id;
+  ASSERT_EVENTUALLY([&] {
+    vector<string> tablet_ids = ts->ListTablets();
+    ASSERT_EQ(1, tablet_ids.size());
+    tablet_id = tablet_ids[0];
+  });
+  {
+    // Fail one of the replicas (the first one, chosen arbitrarily).
+    scoped_refptr<TabletReplica> replica;
+    ASSERT_OK(ts->server()->tablet_manager()->GetTabletReplica(tablet_id, &replica));
+    replica->SetError(Status::IOError("INJECTED ERROR: tablet failed"));
+    replica->Shutdown();
+    ASSERT_EQ(tablet::FAILED, replica->state());
+  }
+
+  // Ensure the tablet eventually is replicated.
+  ASSERT_EVENTUALLY([&]{
+    scoped_refptr<TabletReplica> replica;
+    ASSERT_OK(ts->server()->tablet_manager()->GetTabletReplica(tablet_id, &replica));
+    ASSERT_EQ(replica->state(), tablet::RUNNING);
+  });
+  work.StopAndJoin();
 }
 
 // Test that when the leader changes, the tablet manager gets notified and
 // includes that information in the next tablet report.
 TEST_F(TsTabletManagerITest, TestReportNewLeaderOnLeaderChange) {
+  const int kNumReplicas = 2;
+  NO_FATALS(StartCluster(kNumReplicas));
+
   // We need to control elections precisely for this test since we're using
   // EmulateElection() with a distributed consensus configuration.
   FLAGS_enable_leader_failure_detection = false;
@@ -133,7 +177,7 @@ TEST_F(TsTabletManagerITest, TestReportNewLeaderOnLeaderChange) {
 
   // Create the table.
   client::sp::shared_ptr<KuduTable> table;
-  gscoped_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+  unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
   ASSERT_OK(table_creator->table_name(kTableName)
             .schema(&schema_)
             .set_range_partition_columns({ "key" })

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tablet/tablet_replica.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.cc b/src/kudu/tablet/tablet_replica.cc
index c16b465..c1a0477 100644
--- a/src/kudu/tablet/tablet_replica.cc
+++ b/src/kudu/tablet/tablet_replica.cc
@@ -226,13 +226,17 @@ void TabletReplica::Shutdown() {
 
   LOG(INFO) << "Initiating TabletReplica shutdown for tablet: " << tablet_id_;
 
+  TabletStatePB shutdown_type = SHUTDOWN;
   {
     std::unique_lock<simple_spinlock> lock(lock_);
-    if (state_ == QUIESCING || state_ == SHUTDOWN) {
+    if (state_ == QUIESCING || state_ == SHUTDOWN || state_ == FAILED) {
       lock.unlock();
       WaitUntilShutdown();
       return;
     }
+    if (!error_.ok()) {
+      shutdown_type = FAILED;
+    }
     state_ = QUIESCING;
   }
 
@@ -246,7 +250,7 @@ void TabletReplica::Shutdown() {
 
   if (consensus_) consensus_->Shutdown();
 
-  // TODO: KUDU-183: Keep track of the pending tasks and send an "abort" message.
+  // TODO(KUDU-183): Keep track of the pending tasks and send an "abort" message.
   LOG_SLOW_EXECUTION(WARNING, 1000,
       Substitute("TabletReplica: tablet $0: Waiting for Transactions to complete", tablet_id())) {
     txn_tracker_.WaitForAllToFinish();
@@ -268,13 +272,13 @@ void TabletReplica::Shutdown() {
     tablet_->Shutdown();
   }
 
-  // Only mark the peer as SHUTDOWN when all other components have shut down.
+  // Only update the replica state when all other components have shut down.
   {
     std::lock_guard<simple_spinlock> lock(lock_);
     // Release mem tracker resources.
     consensus_.reset();
     tablet_.reset();
-    state_ = SHUTDOWN;
+    state_ = shutdown_type;
   }
 }
 
@@ -282,7 +286,7 @@ void TabletReplica::WaitUntilShutdown() {
   while (true) {
     {
       std::lock_guard<simple_spinlock> lock(lock_);
-      if (state_ == SHUTDOWN) {
+      if (state_ == SHUTDOWN || state_ == FAILED) {
         return;
       }
     }
@@ -396,10 +400,9 @@ string TabletReplica::last_status() const {
   return last_status_;
 }
 
-void TabletReplica::SetFailed(const Status& error) {
+void TabletReplica::SetError(const Status& error) {
   std::lock_guard<simple_spinlock> lock(lock_);
   CHECK(!error.ok());
-  state_ = FAILED;
   error_ = error;
   last_status_ = error.ToString();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tablet/tablet_replica.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.h b/src/kudu/tablet/tablet_replica.h
index b1d8bf9..fd272ad 100644
--- a/src/kudu/tablet/tablet_replica.h
+++ b/src/kudu/tablet/tablet_replica.h
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef KUDU_TABLET_TABLET_REPLICA_H_
-#define KUDU_TABLET_TABLET_REPLICA_H_
+#pragma once
 
 #include <cstdint>
 #include <map>
@@ -106,8 +105,11 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
                ThreadPool* raft_pool,
                ThreadPool* prepare_pool);
 
-  // Shutdown this tablet replica.
-  // If a shutdown is already in progress, blocks until that shutdown is complete.
+  // Shutdown this tablet replica. If a shutdown is already in progress,
+  // blocks until that shutdown is complete.
+  //
+  // If 'error_' has been set to a non-OK status, the final state will be
+  // FAILED, and SHUTDOWN otherwise.
   void Shutdown();
 
   // Check that the tablet is in a RUNNING state.
@@ -193,9 +195,8 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
   // Retrieve the last human-readable status of this tablet replica.
   std::string last_status() const;
 
-  // Sets the tablet state to FAILED additionally setting the error to the provided
-  // one.
-  void SetFailed(const Status& error);
+  // Sets the error to the provided one.
+  void SetError(const Status& error);
 
   // Returns the error that occurred, when state is FAILED.
   Status error() const {
@@ -261,7 +262,7 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
   // Tablet::RegisterMaintenanceOps().
   void RegisterMaintenanceOps(MaintenanceManager* maintenance_manager);
 
-  // Unregister the maintenance ops associated with this peer's tablet.
+  // Unregister the maintenance ops associated with this replica's tablet.
   // This method is not thread safe.
   void UnregisterMaintenanceOps();
 
@@ -286,7 +287,7 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
 
   ~TabletReplica();
 
-  // Wait until the TabletReplica is fully in SHUTDOWN state.
+  // Wait until the TabletReplica is fully in a FAILED or SHUTDOWN state.
   void WaitUntilShutdown();
 
   // After bootstrap is complete and consensus is setup this initiates the transactions
@@ -378,4 +379,3 @@ class FlushInflightsToLogCallback : public RefCountedThreadSafe<FlushInflightsTo
 }  // namespace tablet
 }  // namespace kudu
 
-#endif /* KUDU_TABLET_TABLET_REPLICA_H_ */

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tserver/tablet_server-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc
index da58e59..d254e40 100644
--- a/src/kudu/tserver/tablet_server-test.cc
+++ b/src/kudu/tserver/tablet_server-test.cc
@@ -1140,7 +1140,7 @@ TEST_F(TabletServerTest, TestClientGetsErrorBackWhenRecoveryFailed) {
 
   // We're expecting the write to fail.
   ASSERT_OK(DCHECK_NOTNULL(proxy_.get())->Write(req, &resp, &controller));
-  ASSERT_EQ(TabletServerErrorPB::TABLET_NOT_RUNNING, resp.error().code());
+  ASSERT_EQ(TabletServerErrorPB::TABLET_FAILED, resp.error().code());
   ASSERT_STR_CONTAINS(resp.error().status().message(), "Tablet not RUNNING: FAILED");
 
   // Check that the TabletReplica's status message is updated with the failure.

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tserver/tablet_service.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index 8cc0262..aad6045 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -207,11 +207,12 @@ bool LookupTabletReplicaOrRespond(TabletReplicaLookupIf* tablet_manager,
   if (PREDICT_FALSE(state != tablet::RUNNING)) {
     Status s = Status::IllegalState("Tablet not RUNNING",
                                     tablet::TabletStatePB_Name(state));
+    TabletServerErrorPB::Code code = TabletServerErrorPB::TABLET_NOT_RUNNING;
     if (state == tablet::FAILED) {
       s = s.CloneAndAppend((*replica)->error().ToString());
+      code = TabletServerErrorPB::TABLET_FAILED;
     }
-    SetupErrorAndRespond(resp->mutable_error(), s,
-                         TabletServerErrorPB::TABLET_NOT_RUNNING, context);
+    SetupErrorAndRespond(resp->mutable_error(), s, code, context);
     return false;
   }
   return true;

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 47b8102..3b9b20e 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -66,6 +66,7 @@
 #include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/net/sockaddr.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/stopwatch.h"
 #include "kudu/util/threadpool.h"
 #include "kudu/util/trace.h"
@@ -639,13 +640,15 @@ Status TSTabletManager::DeleteTablet(
   // If the tablet is already deleted, the CAS check isn't possible because
   // consensus and therefore the log is not available.
   TabletDataState data_state = replica->tablet_metadata()->tablet_data_state();
-  bool tablet_deleted = (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED);
+  bool tablet_deleted = (data_state == TABLET_DATA_DELETED ||
+                         data_state == TABLET_DATA_TOMBSTONED ||
+                         replica->state() == tablet::FAILED);
 
   // They specified an "atomic" delete. Check the committed config's opid_index.
-  // TODO: There's actually a race here between the check and shutdown, but
-  // it's tricky to fix. We could try checking again after the shutdown and
-  // restarting the tablet if the local replica committed a higher config
-  // change op during that time, or potentially something else more invasive.
+  // TODO(mpercy): There's actually a race here between the check and shutdown,
+  // but it's tricky to fix. We could try checking again after the shutdown and
+  // restarting the tablet if the local replica committed a higher config change
+  // op during that time, or potentially something else more invasive.
   shared_ptr<RaftConsensus> consensus = replica->shared_consensus();
   if (cas_config_opid_index_less_or_equal && !tablet_deleted) {
     if (!consensus) {
@@ -675,9 +678,15 @@ Status TSTabletManager::DeleteTablet(
   if (PREDICT_FALSE(!s.ok())) {
     s = s.CloneAndPrepend(Substitute("Unable to delete on-disk data from tablet $0",
                                      tablet_id));
-    LOG(WARNING) << s.ToString();
-    replica->SetFailed(s);
-    return s;
+    // TODO(awong): A failure status here indicates a failure to update the
+    // tablet metadata, consensus metadta, or WAL (failures to remove blocks
+    // only log warnings). Once the above are no longer points of failure,
+    // handle errors here accordingly.
+    //
+    // If this fails, there is no guarantee that the on-disk metadata reflects
+    // that the tablet is deleted. To be safe, crash here.
+    LOG(FATAL) << Substitute("Failed to delete tablet data for $0: ",
+        tablet_id) << s.ToString();
   }
 
   replica->SetStatusMessage("Deleted tablet blocks from disk");
@@ -768,9 +777,14 @@ void TSTabletManager::OpenTablet(const scoped_refptr<TabletReplica>& replica,
 
   scoped_refptr<ConsensusMetadata> cmeta;
   Status s = cmeta_manager_->Load(replica->tablet_id(), &cmeta);
+  auto fail_tablet = MakeScopedCleanup([&]() {
+    // If something goes wrong, clean up the replica's internal members and mark
+    // it FAILED.
+    replica->SetError(s);
+    replica->Shutdown();
+  });
   if (PREDICT_FALSE(!s.ok())) {
     LOG(ERROR) << LogPrefix(tablet_id) << "Failed to load consensus metadata: " << s.ToString();
-    replica->SetFailed(s);
     return;
   }
 
@@ -798,7 +812,6 @@ void TSTabletManager::OpenTablet(const scoped_refptr<TabletReplica>& replica,
     if (!s.ok()) {
       LOG(ERROR) << LogPrefix(tablet_id) << "Tablet failed to bootstrap: "
                  << s.ToString();
-      replica->SetFailed(s);
       return;
     }
   }
@@ -817,13 +830,15 @@ void TSTabletManager::OpenTablet(const scoped_refptr<TabletReplica>& replica,
     if (!s.ok()) {
       LOG(ERROR) << LogPrefix(tablet_id) << "Tablet failed to start: "
                  << s.ToString();
-      replica->SetFailed(s);
       return;
     }
 
     replica->RegisterMaintenanceOps(server_->maintenance_manager());
   }
 
+  // Now that the tablet has successfully opened, cancel the cleanup.
+  fail_tablet.cancel();
+
   int elapsed_ms = (MonoTime::Now() - start).ToMilliseconds();
   if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) {
     LOG(WARNING) << LogPrefix(tablet_id) << "Tablet startup took " << elapsed_ms << "ms";

http://git-wip-us.apache.org/repos/asf/kudu/blob/a9d17c00/src/kudu/tserver/tserver.proto
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index 0914cf2..500a600 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -93,6 +93,9 @@ message TabletServerErrorPB {
 
     // The request is throttled.
     THROTTLED = 19;
+
+    // The tablet needs to be evicted and reassigned.
+    TABLET_FAILED = 20;
   }
 
   // The error code.