You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/08/24 08:21:19 UTC

[2/2] kudu git commit: disk failure: don't open tablets on failed disks

disk failure: don't open tablets on failed disks

Currently Kudu servers open the FS layout with failed disks. However,
the moment tablets attempt to bootstrap (i.e. open blocks, etc.), they
will attempt to read from the failed disk and fail. This can be avoided
by checking whether a tablet's disk group contains a failed disk before
attempting to read data from the tablet. If so, the tablet should be
marked as having an error so it can be reassigned.

The default behavior of the 'fs_target_data_dirs_per_tablet' flag is
updated to take into account disk state when assigning new directory
groups. This allows the tablet to be reassigned to a server without
being spread across a failed directory.

Testing is done by loading data into a cluster configured to use
multiple directories for data blocks, failing a single directory on one
of the tablet servers, and ensuring that the tablets with blocks on the
failed directory get re-replicated at startup time. The test uses a
cluster verifier to verify the healthy end-state of the cluster.
Necessary changes have been made to do this on a cluster comprising of
multiple data directories.

Change-Id: Id3fae98355657f6aa4b134c542f92fc07f5c0aa1
Reviewed-on: http://gerrit.cloudera.org:8080/7766
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/11dfe78e
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/11dfe78e
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/11dfe78e

Branch: refs/heads/master
Commit: 11dfe78e6181351a77ccb5efd0a74ff37b94b844
Parents: 22f4a0e
Author: Andrew Wong <aw...@cloudera.com>
Authored: Mon Aug 21 18:21:23 2017 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Thu Aug 24 08:20:41 2017 +0000

----------------------------------------------------------------------
 src/kudu/fs/data_dirs.cc                        |  32 ++++--
 src/kudu/fs/data_dirs.h                         |   5 +-
 src/kudu/fs/file_block_manager.cc               |  25 +++++
 src/kudu/fs/log_block_manager.cc                |  15 ++-
 src/kudu/integration-tests/CMakeLists.txt       |   1 +
 .../integration-tests/disk_failure-itest.cc     | 104 +++++++++++++++++++
 .../external_mini_cluster_fs_inspector.cc       |   8 +-
 src/kudu/tserver/ts_tablet_manager.cc           |   8 ++
 8 files changed, 185 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/fs/data_dirs.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/data_dirs.cc b/src/kudu/fs/data_dirs.cc
index 8c55431..19ae6c4 100644
--- a/src/kudu/fs/data_dirs.cc
+++ b/src/kudu/fs/data_dirs.cc
@@ -63,7 +63,7 @@ DEFINE_int32(fs_target_data_dirs_per_tablet, 0,
               "tablet's data across. If greater than the number of data dirs "
               "available, data will be striped across those available. The "
               "default value 0 indicates striping should occur across all "
-              "data directories.");
+              "healthy data directories.");
 DEFINE_validator(fs_target_data_dirs_per_tablet,
     [](const char* /*n*/, int32_t v) { return v >= 0; });
 TAG_FLAG(fs_target_data_dirs_per_tablet, advanced);
@@ -636,10 +636,15 @@ Status DataDirManager::CreateDataDirGroup(const string& tablet_id,
                                   "registered", tablet_id);
   }
   // Adjust the disk group size to fit within the total number of data dirs.
-  int group_target_size = std::min(FLAGS_fs_target_data_dirs_per_tablet,
-                                   static_cast<int>(data_dirs_.size()));
+  int group_target_size;
+  if (FLAGS_fs_target_data_dirs_per_tablet == 0) {
+    group_target_size = data_dirs_.size();
+  } else {
+    group_target_size = std::min(FLAGS_fs_target_data_dirs_per_tablet,
+                                 static_cast<int>(data_dirs_.size()));
+  }
   vector<uint16_t> group_indices;
-  if (group_target_size == 0 || mode == DirDistributionMode::ACROSS_ALL_DIRS) {
+  if (mode == DirDistributionMode::ACROSS_ALL_DIRS) {
     // If using all dirs, add all regardless of directory state.
     AppendKeysFromMap(data_dir_by_uuid_idx_, &group_indices);
   } else {
@@ -756,11 +761,14 @@ Status DataDirManager::GetDirsForGroupUnlocked(int target_size,
   DCHECK(dir_group_lock_.is_locked());
   vector<uint16_t> candidate_indices;
   for (auto& e : data_dir_by_uuid_idx_) {
+    if (ContainsKey(failed_data_dirs_, e.first)) {
+      continue;
+    }
     RETURN_NOT_OK(e.second->RefreshIsFull(DataDir::RefreshMode::ALWAYS));
     // TODO(awong): If a disk is unhealthy at the time of group creation, the
     // resulting group may be below targeted size. Add functionality to resize
     // groups. See KUDU-2040 for more details.
-    if (!e.second->is_full() && !ContainsKey(failed_data_dirs_, e.first)) {
+    if (!e.second->is_full()) {
       candidate_indices.push_back(e.first);
     }
   }
@@ -804,10 +812,10 @@ bool DataDirManager::FindUuidByRoot(const string& root, string* uuid) const {
   return FindCopy(uuid_by_root_, root, uuid);
 }
 
-set<string> DataDirManager::FindTabletsByDataDirUuidIdx(uint16_t uuid_idx) {
+set<string> DataDirManager::FindTabletsByDataDirUuidIdx(uint16_t uuid_idx) const {
   DCHECK_LT(uuid_idx, data_dirs_.size());
   shared_lock<rw_spinlock> lock(dir_group_lock_.get_lock());
-  set<string>* tablet_set_ptr = FindOrNull(tablets_by_uuid_idx_map_, uuid_idx);
+  const set<string>* tablet_set_ptr = FindOrNull(tablets_by_uuid_idx_map_, uuid_idx);
   if (tablet_set_ptr) {
     return *tablet_set_ptr;
   }
@@ -849,6 +857,16 @@ bool DataDirManager::IsDataDirFailed(uint16_t uuid_idx) const {
   return ContainsKey(failed_data_dirs_, uuid_idx);
 }
 
+bool DataDirManager::IsTabletInFailedDir(const string& tablet_id) const {
+  const set<uint16_t> failed_dirs = GetFailedDataDirs();
+  for (uint16_t failed_dir : failed_dirs) {
+    if (ContainsKey(FindTabletsByDataDirUuidIdx(failed_dir), tablet_id)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 void DataDirManager::RemoveUnhealthyDataDirsUnlocked(const vector<uint16_t>& uuid_indices,
                                                      vector<uint16_t>* healthy_indices) const {
   if (PREDICT_TRUE(failed_data_dirs_.empty())) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/fs/data_dirs.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/data_dirs.h b/src/kudu/fs/data_dirs.h
index 9cf2771..a110a5d 100644
--- a/src/kudu/fs/data_dirs.h
+++ b/src/kudu/fs/data_dirs.h
@@ -308,7 +308,7 @@ class DataDirManager {
 
   // Finds the set of tablet_ids in the data dir specified by 'uuid_idx' and
   // returns a copy, returning an empty set if none are found.
-  std::set<std::string> FindTabletsByDataDirUuidIdx(uint16_t uuid_idx);
+  std::set<std::string> FindTabletsByDataDirUuidIdx(uint16_t uuid_idx) const;
 
   // ==========================================================================
   // Directory Health
@@ -328,6 +328,9 @@ class DataDirManager {
   // Returns whether or not the 'uuid_idx' refers to a failed directory.
   bool IsDataDirFailed(uint16_t uuid_idx) const;
 
+  // Returns whether the tablet's data is spread across a failed directory.
+  bool IsTabletInFailedDir(const std::string& tablet_id) const;
+
   const std::set<uint16_t> GetFailedDataDirs() const {
     shared_lock<rw_spinlock> group_lock(dir_group_lock_.get_lock());
     return failed_data_dirs_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/fs/file_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/file_block_manager.cc b/src/kudu/fs/file_block_manager.cc
index 10633e6..26aa10a 100644
--- a/src/kudu/fs/file_block_manager.cc
+++ b/src/kudu/fs/file_block_manager.cc
@@ -22,6 +22,7 @@
 #include <mutex>
 #include <numeric>
 #include <ostream>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -54,6 +55,7 @@
 #include "kudu/util/status.h"
 
 using std::accumulate;
+using std::set;
 using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
@@ -589,7 +591,19 @@ Status FileBlockManager::Open(FsReport* report) {
 
   // Prepare the filesystem report and either return or log it.
   FsReport local_report;
+  set<uint16_t> failed_dirs = dd_manager_->GetFailedDataDirs();
   for (const auto& dd : dd_manager_->data_dirs()) {
+    // Don't report failed directories.
+    // TODO(KUDU-2111): currently the FsReport only reports on containers for
+    // the log block manager. Implement some sort of reporting for failed
+    // directories as well.
+    if (PREDICT_FALSE(!failed_dirs.empty())) {
+      uint16_t uuid_idx;
+      CHECK(dd_manager_->FindUuidIndexByDataDir(dd.get(), &uuid_idx));
+      if (ContainsKey(failed_dirs, uuid_idx)) {
+        continue;
+      }
+    }
     // TODO(adar): probably too expensive to fill out the stats/checks.
     local_report.data_dirs.push_back(dd->dir());
   }
@@ -691,6 +705,17 @@ Status FileBlockManager::OpenBlock(const BlockId& block_id,
 Status FileBlockManager::DeleteBlock(const BlockId& block_id) {
   CHECK(!read_only_);
 
+  // Return early if deleting a block in a failed directory.
+  set<uint16_t> failed_dirs = dd_manager_->GetFailedDataDirs();
+  if (PREDICT_FALSE(!failed_dirs.empty())) {
+    uint16_t uuid_idx = internal::FileBlockLocation::GetDataDirIdx(block_id);
+    if (ContainsKey(failed_dirs, uuid_idx)) {
+      LOG_EVERY_N(INFO, 10) << Substitute("Block $0 is in a failed directory; not deleting",
+                                          block_id.ToString());
+      return Status::IOError("Block is in a failed directory");
+    }
+  }
+
   string path;
   if (!FindBlockPath(block_id, &path)) {
     return Status::NotFound(

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 97295f4..6f0648f 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -142,6 +142,7 @@ using internal::LogBlockContainer;
 using pb_util::ReadablePBContainerFile;
 using pb_util::WritablePBContainerFile;
 using std::map;
+using std::set;
 using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
@@ -482,7 +483,7 @@ class LogBlockContainer {
         (max_num_blocks_ && (total_blocks_ >= max_num_blocks_));
   }
   const LogBlockManagerMetrics* metrics() const { return metrics_; }
-  const DataDir* data_dir() const { return data_dir_; }
+  DataDir* data_dir() const { return data_dir_; }
   const PathInstanceMetadataPB* instance() const { return data_dir_->instance()->metadata(); }
   bool read_only() const { return read_only_.Load(); }
 
@@ -1702,6 +1703,18 @@ Status LogBlockManager::DeleteBlock(const BlockId& block_id) {
       return Status::IllegalState("container $0 is read-only",
                                   lb->container()->ToString());
     }
+
+    // Return early if deleting a block in a failed directory.
+    set<uint16_t> failed_dirs = dd_manager_->GetFailedDataDirs();
+    if (PREDICT_FALSE(!failed_dirs.empty())) {
+      uint16_t uuid_idx;
+      CHECK(dd_manager_->FindUuidIndexByDataDir(lb->container()->data_dir(), &uuid_idx));
+      if (ContainsKey(failed_dirs, uuid_idx)) {
+        LOG_EVERY_N(INFO, 10) << Substitute("Block $0 is in a failed directory; not deleting",
+                                            block_id.ToString());
+        return Status::IOError("Block is in a failed directory");
+      }
+    }
     RemoveLogBlockUnlocked(it);
   }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/integration-tests/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt
index 4920e88..57c3f6c 100644
--- a/src/kudu/integration-tests/CMakeLists.txt
+++ b/src/kudu/integration-tests/CMakeLists.txt
@@ -68,6 +68,7 @@ ADD_KUDU_TEST(create-table-itest)
 ADD_KUDU_TEST(create-table-stress-test)
 ADD_KUDU_TEST(delete_table-itest)
 ADD_KUDU_TEST(delete_tablet-itest)
+ADD_KUDU_TEST(disk_failure-itest)
 ADD_KUDU_TEST(disk_reservation-itest)
 ADD_KUDU_TEST(exactly_once_writes-itest)
 ADD_KUDU_TEST(external_mini_cluster-test RESOURCE_LOCK "master-rpc-ports")

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/integration-tests/disk_failure-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/disk_failure-itest.cc b/src/kudu/integration-tests/disk_failure-itest.cc
new file mode 100644
index 0000000..c36ccd1
--- /dev/null
+++ b/src/kudu/integration-tests/disk_failure-itest.cc
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/integration-tests/cluster_itest_util.h"
+#include "kudu/integration-tests/cluster_verifier.h"
+#include "kudu/integration-tests/external_mini_cluster-itest-base.h"
+#include "kudu/integration-tests/test_workload.h"
+#include "kudu/util/metrics.h"
+#include "kudu/util/path_util.h"
+#include "kudu/util/test_macros.h"
+
+METRIC_DECLARE_gauge_uint64(data_dirs_failed);
+
+namespace kudu {
+
+using std::string;
+using std::vector;
+using strings::Substitute;
+
+const MonoDelta kAgreementTimeout = MonoDelta::FromSeconds(30);
+
+class DiskFailureITest : public ExternalMiniClusterITestBase {
+ public:
+
+  // Waits for 'ext_tserver' to experience 'target_failed_disks' disk failures.
+  void WaitForDiskFailures(const ExternalTabletServer* ext_tserver,
+                           int64_t target_failed_disks = 1) const {
+    ASSERT_EVENTUALLY([&] {
+      int64_t failed_on_ts;
+      ASSERT_OK(ext_tserver->GetInt64Metric(
+          &METRIC_ENTITY_server, nullptr, &METRIC_data_dirs_failed, "value", &failed_on_ts));
+      ASSERT_EQ(target_failed_disks, failed_on_ts);
+    });
+  }
+};
+
+// Test ensuring that tablet server can be started with failed directories. A
+// cluster is started and loaded with some tablets. The tablet server is then
+// shut down and restarted. Errors are injected to one of the directories while
+// it is shut down.
+TEST_F(DiskFailureITest, TestFailDuringServerStartup) {
+  // Set up a cluster with three servers with five disks each.
+  NO_FATALS(StartCluster({}, {}, 3, 5));
+  const int kNumTablets = 5;
+  const int kNumRows = 100;
+
+  // Write some data to a tablet. This will spread blocks across all
+  // directories.
+  TestWorkload write_workload(cluster_.get());
+  write_workload.set_num_tablets(kNumTablets);
+  write_workload.Setup();
+  write_workload.Start();
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GT(kNumRows, write_workload.rows_inserted());
+  });
+  write_workload.StopAndJoin();
+
+  // Ensure the tablets get to a running state.
+  ExternalTabletServer* ts = cluster_->tablet_server(0);
+  ASSERT_OK(cluster_->WaitForTabletsRunning(ts, kNumTablets, kAgreementTimeout));
+
+  // Introduce flags to fail one of the directories, avoiding the metadata
+  // directory, the next time the tablet server starts.
+  string failed_dir = ts->data_dirs()[1];
+  vector<string> extra_flags = {
+      Substitute("--env_inject_eio_globs=$0", JoinPathSegments(failed_dir, "**")),
+      "--env_inject_eio=1.0",
+      "--suicide_on_eio=false",
+  };
+  ts->mutable_flags()->insert(ts->mutable_flags()->begin(), extra_flags.begin(), extra_flags.end());
+  ts->Shutdown();
+
+  // Restart the tablet server with disk failures and ensure it can startup.
+  ASSERT_OK(ts->Restart());
+  NO_FATALS(WaitForDiskFailures(cluster_->tablet_server(0)));
+
+  // Ensure that the tablets are successfully evicted and copied.
+  ClusterVerifier v(cluster_.get());
+  NO_FATALS(v.CheckCluster());
+  NO_FATALS(v.CheckRowCount(write_workload.table_name(), ClusterVerifier::AT_LEAST,
+                            write_workload.batches_completed()));
+}
+
+}  // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
index 9d12343..3873d43 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
@@ -100,7 +100,7 @@ vector<string> ExternalMiniClusterFsInspector::ListTablets() {
 }
 
 vector<string> ExternalMiniClusterFsInspector::ListTabletsOnTS(int index) {
-  string data_dir = cluster_->tablet_server(index)->data_dir();
+  string data_dir = cluster_->tablet_server(index)->data_dirs()[0];
   string meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName);
   vector<string> tablets;
   CHECK_OK(ListFilesInDir(meta_dir, &tablets));
@@ -142,7 +142,7 @@ int ExternalMiniClusterFsInspector::CountReplicasInMetadataDirs() {
   // tablet servers isn't easy.
   int count = 0;
   for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
-    string data_dir = cluster_->tablet_server(i)->data_dir();
+    string data_dir = cluster_->tablet_server(i)->data_dirs()[0];
     count += CountFilesInDir(JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName));
   }
   return count;
@@ -171,7 +171,7 @@ Status ExternalMiniClusterFsInspector::CheckNoData() {
 
 string ExternalMiniClusterFsInspector::GetTabletSuperBlockPathOnTS(int ts_index,
                                                                    const string& tablet_id) const {
-  string data_dir = cluster_->tablet_server(ts_index)->data_dir();
+  string data_dir = cluster_->tablet_server(ts_index)->data_dirs()[0];
   string meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName);
   return JoinPathSegments(meta_dir, tablet_id);
 }
@@ -192,7 +192,7 @@ int64_t ExternalMiniClusterFsInspector::GetTabletSuperBlockMTimeOrDie(int ts_ind
 
 string ExternalMiniClusterFsInspector::GetConsensusMetadataPathOnTS(int index,
                                                                     const string& tablet_id) const {
-  string data_dir = cluster_->tablet_server(index)->data_dir();
+  string data_dir = cluster_->tablet_server(index)->data_dirs()[0];
   string cmeta_dir = JoinPathSegments(data_dir, FsManager::kConsensusMetadataDirName);
   return JoinPathSegments(cmeta_dir, tablet_id);
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/11dfe78e/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 0a60fd8..3bba569 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -804,6 +804,14 @@ void TSTabletManager::OpenTablet(const scoped_refptr<TabletReplica>& replica,
     return;
   }
 
+  // If the tablet is in a failed directory, don't bother bootstrapping.
+  if (fs_manager_->dd_manager()->IsTabletInFailedDir(tablet_id)) {
+    LOG(ERROR) << LogPrefix(tablet_id) << "aborting tablet bootstrap: tablet "
+                                          "has data in a failed directory";
+    s = Status::IOError("Tablet data is in a failed directory");
+    return;
+  }
+
   consensus::ConsensusBootstrapInfo bootstrap_info;
   LOG_TIMING_PREFIX(INFO, LogPrefix(tablet_id), "bootstrapping tablet") {
     // Disable tracing for the bootstrap, since this would result in