You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/27 06:16:29 UTC

[5/6] kudu git commit: shutdown tablets on disk failure at runtime

shutdown tablets on disk failure at runtime

Before, various code paths pass along disk failure Statuses until they
eventually hit a CHECK failure and crash the server. Such fatal errors
were "safe" by design, as they would ensure no additional changes were
made durable to each tablet. This patch aims to achieve similar behavior
for failed replicas while keeping the server alive.

These failures are permitted provided the following have occurred for
each tablet in the affected directory:
- The failed directory is immediately marked as failed, preventing
  further tablets from being striped across a failed disk.
- The tablet's MvccManager is shut down to prevent further writes from
  being made durable and preventing I/O to the tablet.
- A request is submitted to a threadpool to eventually completely shut
  down the replica, leaving it for eviction.

NOTE: failures of metadata file and the WAL directory are fatal. Code
paths that update these explicitly crash the server.

This is a part of a series of patches to handle disk failure. To see how
this patch fits in, see section 3 of:
https://docs.google.com/document/d/1yGVzDzV14mKReZ7EzlZZV_KfDBRnHJkRtlDox_RPXAA/edit

Change-Id: I109635a54268b9db741b2ae9ea3e9f1fe072d0a8
Reviewed-on: http://gerrit.cloudera.org:8080/7442
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b4368459
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b4368459
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b4368459

Branch: refs/heads/master
Commit: b436845989d7519fe473ca3579d3385c4a2bdaf3
Parents: 5523b0a
Author: Andrew Wong <aw...@cloudera.com>
Authored: Wed Nov 22 16:02:06 2017 -0800
Committer: Andrew Wong <aw...@cloudera.com>
Committed: Thu Nov 23 04:41:55 2017 +0000

----------------------------------------------------------------------
 src/kudu/tablet/tablet.cc                   |   6 +-
 src/kudu/tablet/tablet.h                    |   6 +
 src/kudu/tablet/tablet_replica.cc           |  22 +++-
 src/kudu/tablet/tablet_replica.h            |   3 +
 src/kudu/tserver/tablet_copy_client-test.cc |   5 +-
 src/kudu/tserver/ts_tablet_manager.cc       | 139 +++++++++++++++++------
 src/kudu/tserver/ts_tablet_manager.h        |  13 +++
 7 files changed, 152 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tablet/tablet.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet.cc b/src/kudu/tablet/tablet.cc
index 887bb7e..5e0c805 100644
--- a/src/kudu/tablet/tablet.cc
+++ b/src/kudu/tablet/tablet.cc
@@ -1065,10 +1065,7 @@ Status Tablet::Flush() {
 
 Status Tablet::FlushUnlocked() {
   TRACE_EVENT0("tablet", "Tablet::FlushUnlocked");
-  {
-    std::lock_guard<simple_spinlock> l(state_lock_);
-    RETURN_NOT_OK(CheckHasNotBeenStoppedUnlocked());
-  }
+  RETURN_NOT_OK(CheckHasNotBeenStopped());
   RowSetsInCompaction input;
   shared_ptr<MemRowSet> old_mrs;
   {
@@ -2256,6 +2253,7 @@ Tablet::Iterator::Iterator(const Tablet* tablet, const Schema& projection,
 Tablet::Iterator::~Iterator() {}
 
 Status Tablet::Iterator::Init(ScanSpec *spec) {
+  RETURN_NOT_OK(tablet_->CheckHasNotBeenStopped());
   DCHECK(iter_.get() == nullptr);
 
   RETURN_NOT_OK(tablet_->GetMappedReadProjection(projection_, &projection_));

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tablet/tablet.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet.h b/src/kudu/tablet/tablet.h
index 15bca73..38638c0 100644
--- a/src/kudu/tablet/tablet.h
+++ b/src/kudu/tablet/tablet.h
@@ -482,6 +482,12 @@ class Tablet {
   // Must be called while 'state_lock_' is held.
   Status CheckHasNotBeenStoppedUnlocked() const;
 
+  // Returns an error if the tablet is in the 'kStopped' or 'kShutdown' state.
+  Status CheckHasNotBeenStopped() const {
+    std::lock_guard<simple_spinlock> l(state_lock_);
+    return CheckHasNotBeenStoppedUnlocked();
+  }
+
   Status FlushUnlocked();
 
   // Validate the contents of 'op' and return a bad Status if it is invalid.

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tablet/tablet_replica.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.cc b/src/kudu/tablet/tablet_replica.cc
index d44e6b4..537f728 100644
--- a/src/kudu/tablet/tablet_replica.cc
+++ b/src/kudu/tablet/tablet_replica.cc
@@ -224,8 +224,12 @@ Status TabletReplica::Start(const ConsensusBootstrapInfo& bootstrap_info,
         metric_entity,
         mark_dirty_clbk_));
 
-    // Re-acquire 'lock_' to update our state variable.
     std::lock_guard<simple_spinlock> l(lock_);
+
+    // If an error has been set (e.g. due to a disk failure from a separate
+    // thread), error out.
+    RETURN_NOT_OK(error_);
+
     CHECK_EQ(BOOTSTRAPPING, state_); // We are still protected by 'state_change_lock_'.
     set_state(RUNNING);
   }
@@ -737,6 +741,22 @@ size_t TabletReplica::OnDiskSize() const {
   return ret;
 }
 
+void TabletReplica::MakeUnavailable(const Status& error) {
+  std::shared_ptr<Tablet> tablet;
+  {
+    std::lock_guard<simple_spinlock> lock(lock_);
+    tablet = tablet_;
+    for (MaintenanceOp* op : maintenance_ops_) {
+      op->CancelAndDisable();
+    }
+  }
+  // Stop the Tablet from doing further I/O.
+  if (tablet) tablet->Stop();
+
+  // Set the error; when the replica is shut down, it will end up FAILED.
+  SetError(error);
+}
+
 Status FlushInflightsToLogCallback::WaitForInflightsAndFlushLog() {
   // This callback is triggered prior to any TabletMetadata flush.
   // The guarantee that we are trying to enforce is this:

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tablet/tablet_replica.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.h b/src/kudu/tablet/tablet_replica.h
index 27829ba..bf3476d 100644
--- a/src/kudu/tablet/tablet_replica.h
+++ b/src/kudu/tablet/tablet_replica.h
@@ -274,6 +274,9 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
   // Only to be used in tests.
   void CancelMaintenanceOpsForTests();
 
+  // Stops further I/O on the replica.
+  void MakeUnavailable(const Status& error);
+
   // Return pointer to the transaction tracker for this peer.
   const TransactionTracker* transaction_tracker() const { return &txn_tracker_; }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tserver/tablet_copy_client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client-test.cc b/src/kudu/tserver/tablet_copy_client-test.cc
index 277d8ff..7ac12cd 100644
--- a/src/kudu/tserver/tablet_copy_client-test.cc
+++ b/src/kudu/tserver/tablet_copy_client-test.cc
@@ -293,10 +293,11 @@ TEST_F(TabletCopyClientTest, TestFailedDiskStopsClient) {
     }
   });
 
-  // In a separate thread, mark one of the directories as failed.
+  // In a separate thread, mark one of the directories as failed (not the
+  // metadata directory).
   while (true) {
     if (rand() % 10 == 0) {
-      dd_manager->MarkDataDirFailed(0, "injected failure in non-client thread");
+      dd_manager->MarkDataDirFailed(1, "injected failure in non-client thread");
       LOG(INFO) << "INJECTING FAILURE";
       break;
     }

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 47eae78..9e234ec 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -21,6 +21,7 @@
 #include <memory>
 #include <mutex>
 #include <ostream>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -52,6 +53,7 @@
 #include "kudu/master/master.pb.h"
 #include "kudu/rpc/result_tracker.h"
 #include "kudu/tablet/metadata.pb.h"
+#include "kudu/tablet/tablet.h"
 #include "kudu/tablet/tablet_bootstrap.h"
 #include "kudu/tablet/tablet_metadata.h"
 #include "kudu/tablet/tablet_replica.h"
@@ -156,6 +158,7 @@ METRIC_DEFINE_gauge_int32(server, tablets_num_shutdown,
                           kudu::MetricUnit::kTablets,
                           "Number of tablets currently shut down");
 
+using std::set;
 using std::shared_ptr;
 using std::string;
 using std::vector;
@@ -163,10 +166,6 @@ using strings::Substitute;
 
 namespace kudu {
 
-namespace tablet {
-class Tablet;
-}
-
 using consensus::ConsensusMetadata;
 using consensus::ConsensusMetadataCreateMode;
 using consensus::ConsensusMetadataManager;
@@ -737,6 +736,35 @@ Status TSTabletManager::CreateAndRegisterTabletReplica(
   return Status::OK();
 }
 
+Status TSTabletManager::BeginReplicaStateTransition(
+    const string& tablet_id,
+    const string& reason,
+    scoped_refptr<TabletReplica>* replica,
+    scoped_refptr<TransitionInProgressDeleter>* deleter,
+    TabletServerErrorPB::Code* error_code) {
+  // Acquire the lock in exclusive mode as we'll add a entry to the
+  // transition_in_progress_ map.
+  std::lock_guard<RWMutex> lock(lock_);
+  TRACE("Acquired tablet manager lock");
+  RETURN_NOT_OK(CheckRunningUnlocked(error_code));
+
+  if (!LookupTabletUnlocked(tablet_id, replica)) {
+    if (error_code) {
+      *error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
+    }
+    return Status::NotFound("Tablet not found", tablet_id);
+  }
+  // Sanity check that the tablet's transition isn't already in progress
+  Status s = StartTabletStateTransitionUnlocked(tablet_id, reason, deleter);
+  if (PREDICT_FALSE(!s.ok())) {
+    if (error_code) {
+      *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
+    }
+    return s;
+  }
+  return Status::OK();
+}
+
 Status TSTabletManager::DeleteTablet(
     const string& tablet_id,
     TabletDataState delete_type,
@@ -754,31 +782,11 @@ Status TSTabletManager::DeleteTablet(
 
   scoped_refptr<TabletReplica> replica;
   scoped_refptr<TransitionInProgressDeleter> deleter;
-  {
-    // Acquire the lock in exclusive mode as we'll add a entry to the
-    // transition_in_progress_ map.
-    std::lock_guard<RWMutex> lock(lock_);
-    TRACE("Acquired tablet manager lock");
-    RETURN_NOT_OK(CheckRunningUnlocked(error_code));
+  RETURN_NOT_OK(BeginReplicaStateTransition(tablet_id, "deleting tablet", &replica,
+                                            &deleter, error_code));
 
-    if (!LookupTabletUnlocked(tablet_id, &replica)) {
-      if (error_code) {
-        *error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
-      }
-      return Status::NotFound("Tablet not found", tablet_id);
-    }
-    // Sanity check that the tablet's deletion isn't already in progress
-    Status s = StartTabletStateTransitionUnlocked(tablet_id, "deleting tablet", &deleter);
-    if (PREDICT_FALSE(!s.ok())) {
-      if (error_code) {
-        *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
-      }
-      return s;
-    }
-  }
-
-  // If the tablet is already deleted, the CAS check isn't possible because
-  // consensus and therefore the log is not available.
+  // If the tablet has been deleted or forcefully shut down, the CAS check
+  // isn't possible because consensus and therefore the log is not available.
   TabletDataState data_state = replica->tablet_metadata()->tablet_data_state();
   bool tablet_already_deleted = (data_state == TABLET_DATA_DELETED ||
                                  data_state == TABLET_DATA_TOMBSTONED);
@@ -881,7 +889,7 @@ Status TSTabletManager::StartTabletStateTransitionUnlocked(
   }
 
   if (!InsertIfNotPresent(&transition_in_progress_, tablet_id, reason)) {
-    return Status::IllegalState(
+    return Status::AlreadyPresent(
         Substitute("State transition of tablet $0 already in progress: $1",
                     tablet_id, transition_in_progress_[tablet_id]));
   }
@@ -1259,7 +1267,7 @@ Status TSTabletManager::DeleteTabletData(
   CHECK(delete_type == TABLET_DATA_DELETED ||
         delete_type == TABLET_DATA_TOMBSTONED ||
         delete_type == TABLET_DATA_COPYING)
-      << "Unexpected delete_type to delete tablet " << meta->tablet_id() << ": "
+      << "Unexpected delete_type to delete tablet " << tablet_id << ": "
       << TabletDataState_Name(delete_type) << " (" << delete_type << ")";
 
   // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
@@ -1271,7 +1279,7 @@ Status TSTabletManager::DeleteTabletData(
             << (last_logged_opid ? OpIdToString(*last_logged_opid) : "(unknown)");
   MAYBE_FAULT(FLAGS_fault_crash_after_blocks_deleted);
 
-  RETURN_NOT_OK(Log::DeleteOnDiskData(meta->fs_manager(), meta->tablet_id()));
+  CHECK_OK(Log::DeleteOnDiskData(meta->fs_manager(), tablet_id));
   MAYBE_FAULT(FLAGS_fault_crash_after_wal_deleted);
 
   // We do not delete the superblock or the consensus metadata when tombstoning
@@ -1285,13 +1293,25 @@ Status TSTabletManager::DeleteTabletData(
   DCHECK_EQ(TABLET_DATA_DELETED, delete_type);
 
   LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager()) << "Deleting consensus metadata";
-  Status s = cmeta_manager->Delete(meta->tablet_id());
+  Status s = cmeta_manager->Delete(tablet_id);
   // NotFound means we already deleted the cmeta in a previous attempt.
   if (PREDICT_FALSE(!s.ok() && !s.IsNotFound())) {
+    if (s.IsDiskFailure()) {
+      LOG(FATAL) << LogPrefix(tablet_id, meta->fs_manager())
+                 << "consensus metadata is on a failed disk";
+    }
     return s;
   }
   MAYBE_FAULT(FLAGS_fault_crash_after_cmeta_deleted);
-  return meta->DeleteSuperBlock();
+  s = meta->DeleteSuperBlock();
+  if (PREDICT_FALSE(!s.ok())) {
+    if (s.IsDiskFailure()) {
+      LOG(FATAL) << LogPrefix(tablet_id, meta->fs_manager())
+                 << "tablet metadata is on a failed disk";
+    }
+    return s;
+  }
+  return Status::OK();
 }
 
 void TSTabletManager::FailTabletsInDataDir(const string& uuid) {
@@ -1299,8 +1319,57 @@ void TSTabletManager::FailTabletsInDataDir(const string& uuid) {
   int uuid_idx;
   CHECK(dd_manager->FindUuidIndexByUuid(uuid, &uuid_idx))
       << Substitute("No data directory found with UUID $0", uuid);
-  LOG(FATAL) << Substitute("Data directory $0 failed. Disk failure handling not implemented",
-                           dd_manager->FindDataDirByUuidIndex(uuid_idx)->dir());
+  if (fs_manager_->dd_manager()->IsDataDirFailed(uuid_idx)) {
+    LOG(WARNING) << "Data directory is already marked failed.";
+    return;
+  }
+  // Fail the directory to prevent other tablets from being placed in it.
+  dd_manager->MarkDataDirFailed(uuid_idx);
+  set<string> tablets = dd_manager->FindTabletsByDataDirUuidIdx(uuid_idx);
+  LOG(INFO) << Substitute("Data dir $0 has $1 tablets", uuid, tablets.size());
+  for (const string& tablet_id : dd_manager->FindTabletsByDataDirUuidIdx(uuid_idx)) {
+    FailTabletAndScheduleShutdown(tablet_id);
+  }
+}
+
+void TSTabletManager::FailTabletAndScheduleShutdown(const string& tablet_id) {
+  LOG(INFO) << LogPrefix(tablet_id, fs_manager_) << "failing tablet";
+  scoped_refptr<TabletReplica> replica;
+  if (LookupTablet(tablet_id, &replica)) {
+    // Stop further IO to the replica and set an error in the replica.
+    // When the replica is shutdown, this will leave it in a FAILED state.
+    replica->MakeUnavailable(Status::IOError("failing tablet"));
+
+    // Submit a request to actually shut down the tablet asynchronously.
+    CHECK_OK(open_tablet_pool_->SubmitFunc([tablet_id, this]() {
+      scoped_refptr<TabletReplica> replica;
+      scoped_refptr<TransitionInProgressDeleter> deleter;
+      TabletServerErrorPB::Code error;
+      Status s;
+      // Transition tablet state to ensure nothing else (e.g. tablet copies,
+      // deletions, etc) happens concurrently.
+      while (true) {
+        s = BeginReplicaStateTransition(tablet_id, "failing tablet",
+                                        &replica, &deleter, &error);
+        if (!s.IsAlreadyPresent()) {
+          break;
+        }
+        SleepFor(MonoDelta::FromMilliseconds(10));
+      }
+      // Success: we started the transition.
+      //
+      // Only proceed if there is no Tablet (e.g. a bootstrap terminated early
+      // due to error before creating the Tablet) or if the tablet has been
+      // stopped (e.g. due to the above call to MakeUnavailable).
+      std::shared_ptr<Tablet> tablet = replica->shared_tablet();
+      if (s.ok() && (!tablet || tablet->HasBeenStopped())) {
+        replica->Shutdown();
+      }
+      // Else: the tablet is healthy, or is already either not running or
+      // deleted (e.g. because another thread was able to successfully create a
+      // new replica).
+    }));
+  }
 }
 
 int TSTabletManager::RefreshTabletStateCacheAndReturnCount(tablet::TabletStatePB st) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4368459/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 317ab9f..a02a13c 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -199,6 +199,10 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
       tablet::TabletDataState delete_type,
       boost::optional<consensus::OpId> last_logged_opid);
 
+  // Synchronously makes the specified tablet unavailable for further I/O and
+  // schedules its asynchronous shutdown.
+  void FailTabletAndScheduleShutdown(const std::string& tablet_id);
+
   // Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
   void FailTabletsInDataDir(const std::string& uuid);
 
@@ -236,6 +240,15 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
                                             const std::string& reason,
                                             scoped_refptr<TransitionInProgressDeleter>* deleter);
 
+  // Marks the replica indicated by 'tablet_id' as being in a transitional
+  // state. Returns an error status if the replica is already in a transitional
+  // state.
+  Status BeginReplicaStateTransition(const std::string& tablet_id,
+                                     const std::string& reason,
+                                     scoped_refptr<tablet::TabletReplica>* replica,
+                                     scoped_refptr<TransitionInProgressDeleter>* deleter,
+                                     TabletServerErrorPB::Code* error_code);
+
   // Open a tablet meta from the local file system by loading its superblock.
   Status OpenTabletMeta(const std::string& tablet_id,
                         scoped_refptr<tablet::TabletMetadata>* metadata);