You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2017/11/07 19:15:14 UTC

[2/6] kudu git commit: KUDU-2202 avoid block ID reuse for missing dirs

KUDU-2202 avoid block ID reuse for missing dirs

The block manager is initially only aware of the blocks it reads when
first loading the FS layout. If the opening of a directory fails (e.g.
due to a disk failure), the server can still start up, but will fail any
tablet configured to use that directory. By not scanning the failed
directory, the server will "forget" about some blocks.

Since LBM block IDs are sequentially allocated just past the end of the
greatest known ID, it's possible for a new tablet to create blocks with
IDs already assigned to a failed tablet. If that failed tablet is
deleted, the live data belonging to the new tablet will be deleted.

To prevent this, a new API has been added to the block managers to grant
external components the ability to notify the block manager of block
IDs. In loading a tablet's metadata, once all rowsets and orphaned
blocks have been seen, the block manager is notified of the highest
block ID in the superblock.

This patch only targets the LBM, which assigns blocks sequentially,
and therefore only needs to know about the largest block ID referenced
by tablets.

A unit test is added to log_block_manager-test and an integration test
is added to ts_recovery-itest to ensure block ID reuse does not happen.
Also includes some C++11/stylistic cleanup.

Change-Id: I06f10e95278dafdedcab432a7d4d1dc5c59bf4cc
Reviewed-on: http://gerrit.cloudera.org:8080/8465
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/47b81c45
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/47b81c45
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/47b81c45

Branch: refs/heads/master
Commit: 47b81c452194e75da7fd966f07766de4bdcdeab0
Parents: b7afa9c
Author: Andrew Wong <aw...@cloudera.com>
Authored: Thu Nov 2 20:09:22 2017 -0700
Committer: Andrew Wong <aw...@cloudera.com>
Committed: Mon Nov 6 23:17:22 2017 +0000

----------------------------------------------------------------------
 src/kudu/fs/block_id.h                          |   8 +-
 src/kudu/fs/block_manager.h                     |   6 +
 src/kudu/fs/file_block_manager.cc               |   6 +
 src/kudu/fs/file_block_manager.h                |   2 +
 src/kudu/fs/log_block_manager-test.cc           |  32 ++++
 src/kudu/fs/log_block_manager.cc                |   4 +
 src/kudu/fs/log_block_manager.h                 |   3 +
 src/kudu/integration-tests/ts_recovery-itest.cc | 188 +++++++++++++++----
 src/kudu/tablet/diskrowset.cc                   |   2 +-
 src/kudu/tablet/metadata-test.cc                |   5 +-
 src/kudu/tablet/rowset_metadata.cc              |   9 +-
 src/kudu/tablet/rowset_metadata.h               |  11 +-
 src/kudu/tablet/tablet-test-util.h              |   3 +-
 src/kudu/tablet/tablet_metadata.cc              |  24 ++-
 src/kudu/tablet/tablet_metadata.h               |   2 +-
 15 files changed, 247 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/block_id.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_id.h b/src/kudu/fs/block_id.h
index bbfba6c..b1ebe03 100644
--- a/src/kudu/fs/block_id.h
+++ b/src/kudu/fs/block_id.h
@@ -59,6 +59,10 @@ class BlockId {
     return id_ != other.id_;
   }
 
+  bool operator<(const BlockId& other) const {
+    return id_ < other.id_;
+  }
+
   // Returns the raw ID. Use with care; in most cases the BlockId should be
   // treated as a completely opaque value.
   uint64_t id() const { return id_; }
@@ -86,13 +90,13 @@ struct BlockIdHash {
 
 struct BlockIdCompare {
   bool operator()(const BlockId& first, const BlockId& second) const {
-    return first.id() < second.id();
+    return first < second;
   }
 };
 
 struct BlockIdEqual {
   bool operator()(const BlockId& first, const BlockId& second) const {
-    return first.id() == second.id();
+    return first == second;
   }
 };
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager.h b/src/kudu/fs/block_manager.h
index 62b9ef7..a80ab43 100644
--- a/src/kudu/fs/block_manager.h
+++ b/src/kudu/fs/block_manager.h
@@ -260,6 +260,12 @@ class BlockManager {
   // even exist after the call.
   virtual Status GetAllBlockIds(std::vector<BlockId>* block_ids) = 0;
 
+  // Notifies the block manager of the presence of a block id. This allows
+  // block managers that use sequential block ids to avoid reusing
+  // externally-referenced ids that they may not have previously found (e.g.
+  // because those ids' blocks were on a data directory that failed).
+  virtual void NotifyBlockId(BlockId block_id) = 0;
+
   // Exposes the FsErrorManager used to handle fs errors.
   virtual FsErrorManager* error_manager() = 0;
 };

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/file_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/file_block_manager.cc b/src/kudu/fs/file_block_manager.cc
index 2e87008..dd69c5c 100644
--- a/src/kudu/fs/file_block_manager.cc
+++ b/src/kudu/fs/file_block_manager.cc
@@ -939,5 +939,11 @@ Status FileBlockManager::GetAllBlockIds(vector<BlockId>* block_ids) {
   return Status::OK();
 }
 
+void FileBlockManager::NotifyBlockId(BlockId /* block_id */) {
+  // Since the FileBlockManager doesn't keep a record of blocks, this does
+  // nothing. This opens it up for block ID reuse if, say, a directory were
+  // removed.
+}
+
 } // namespace fs
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/file_block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/file_block_manager.h b/src/kudu/fs/file_block_manager.h
index 27d7e67..877b14a 100644
--- a/src/kudu/fs/file_block_manager.h
+++ b/src/kudu/fs/file_block_manager.h
@@ -94,6 +94,8 @@ class FileBlockManager : public BlockManager {
 
   Status GetAllBlockIds(std::vector<BlockId>* block_ids) override;
 
+  void NotifyBlockId(BlockId block_id) override;
+
   FsErrorManager* error_manager() override { return error_manager_; }
 
  private:

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index 5394099..1353345 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -485,6 +485,38 @@ TEST_F(LogBlockManagerTest, ContainerPreallocationTest) {
   ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size);
 }
 
+// Test for KUDU-2202 to ensure that once the block manager has been notified
+// of a block ID, it will not reuse it.
+TEST_F(LogBlockManagerTest, TestBumpBlockIds) {
+  const int kNumBlocks = 10;
+  vector<BlockId> block_ids;
+  unique_ptr<WritableBlock> writer;
+  for (int i = 0; i < kNumBlocks; i++) {
+    ASSERT_OK(bm_->CreateBlock(test_block_opts_, &writer));
+    block_ids.push_back(writer->id());
+  }
+  BlockId max_so_far = *std::max_element(block_ids.begin(), block_ids.end());
+
+  // Simulate a complete reset of the block manager's block ID record, e.g.
+  // from restarting but with all the blocks gone.
+  bm_->next_block_id_.Store(1);
+
+  // Now simulate being notified by some other component (e.g. tablet metadata)
+  // of the presence of a block ID.
+  bm_->NotifyBlockId(BlockId(max_so_far));
+
+  // Once notified, new blocks should be assigned higher IDs.
+  ASSERT_OK(bm_->CreateBlock(test_block_opts_, &writer));
+  ASSERT_LT(max_so_far, writer->id());
+  max_so_far = writer->id();
+
+  // Notifications of lower or invalid block IDs should not disrupt ordering.
+  bm_->NotifyBlockId(BlockId(1));
+  bm_->NotifyBlockId(BlockId());
+  ASSERT_OK(bm_->CreateBlock(test_block_opts_, &writer));
+  ASSERT_LT(max_so_far, writer->id());
+}
+
 // Regression test for KUDU-1190, a crash at startup when a block ID has been
 // reused.
 TEST_F(LogBlockManagerTest, TestReuseBlockIds) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 2b1051b..adca9f3 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -1895,6 +1895,10 @@ Status LogBlockManager::GetAllBlockIds(vector<BlockId>* block_ids) {
   return Status::OK();
 }
 
+void LogBlockManager::NotifyBlockId(BlockId block_id) {
+  next_block_id_.StoreMax(block_id.id() + 1);
+}
+
 void LogBlockManager::AddNewContainerUnlocked(LogBlockContainer* container) {
   DCHECK(lock_.is_locked());
   InsertOrDie(&all_containers_by_name_, container->ToString(), container);

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/fs/log_block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.h b/src/kudu/fs/log_block_manager.h
index 8922e15..edcec5c 100644
--- a/src/kudu/fs/log_block_manager.h
+++ b/src/kudu/fs/log_block_manager.h
@@ -205,6 +205,8 @@ class LogBlockManager : public BlockManager {
 
   Status GetAllBlockIds(std::vector<BlockId>* block_ids) override;
 
+  void NotifyBlockId(BlockId block_id) override;
+
   FsErrorManager* error_manager() override { return error_manager_; }
 
  private:
@@ -216,6 +218,7 @@ class LogBlockManager : public BlockManager {
   FRIEND_TEST(LogBlockManagerTest, TestLookupBlockLimit);
   FRIEND_TEST(LogBlockManagerTest, TestMetadataTruncation);
   FRIEND_TEST(LogBlockManagerTest, TestParseKernelRelease);
+  FRIEND_TEST(LogBlockManagerTest, TestBumpBlockIds);
   FRIEND_TEST(LogBlockManagerTest, TestReuseBlockIds);
   FRIEND_TEST(LogBlockManagerTest, TestFailMultipleTransactionsPerContainer);
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index 772054a..c396bf8 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
 #include <cstdint>
+#include <iterator>
 #include <memory>
 #include <ostream>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
-#include <utility>
 #include <vector>
 
 #include <glog/logging.h>
@@ -44,23 +45,31 @@
 #include "kudu/consensus/log.h"
 #include "kudu/consensus/log_util.h"
 #include "kudu/consensus/opid.pb.h"
+#include "kudu/fs/block_id.h"
+#include "kudu/fs/block_manager.h"
+#include "kudu/fs/data_dirs.h"
+#include "kudu/fs/fs.pb.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/strings/substitute.h"
 #include "kudu/gutil/strings/util.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
 #include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/integration-tests/external_mini_cluster-itest-base.h"
+#include "kudu/integration-tests/external_mini_cluster_fs_inspector.h"
 #include "kudu/integration-tests/test_workload.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
 #include "kudu/tablet/metadata.pb.h"
 #include "kudu/tablet/tablet.pb.h"
+#include "kudu/tablet/tablet_metadata.h"
 #include "kudu/tserver/tserver.pb.h"
 #include "kudu/util/atomic.h"
 #include "kudu/util/env.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/path_util.h"
 #include "kudu/util/random.h"
 #include "kudu/util/random_util.h"
 #include "kudu/util/status.h"
@@ -68,6 +77,8 @@
 #include "kudu/util/test_util.h"
 #include "kudu/util/thread.h"
 
+METRIC_DECLARE_gauge_uint64(tablets_num_failed);
+
 using std::string;
 using std::unique_ptr;
 using std::vector;
@@ -81,15 +92,22 @@ using client::KuduSession;
 using client::KuduTable;
 using client::KuduUpdate;
 using client::sp::shared_ptr;
+using cluster::ExternalTabletServer;
+using cluster::ExternalMiniClusterOptions;
 using clock::Clock;
 using clock::HybridClock;
 using consensus::ConsensusMetadata;
 using consensus::ConsensusMetadataManager;
 using consensus::OpId;
 using consensus::RECEIVED_OPID;
+using fs::BlockManager;
+using itest::ExternalMiniClusterFsInspector;
 using log::AppendNoOpsToLogSync;
 using log::Log;
 using log::LogOptions;
+using strings::Substitute;
+using tablet::TabletMetadata;
+using tablet::TabletSuperBlockPB;
 
 namespace {
 // Generate a row key such that an increasing sequence (0...N) ends up spreading writes
@@ -101,44 +119,141 @@ int IntToKey(int i) {
 } // anonymous namespace
 
 class TsRecoveryITest : public ExternalMiniClusterITestBase,
-                        public ::testing::WithParamInterface<const char*> {
+                        public ::testing::WithParamInterface<string> {
  public:
-  TsRecoveryITest() { extra_tserver_flags_.emplace_back(GetParam()); }
 
  protected:
-  void StartClusterOneTs(const vector<string>& extra_tserver_flags,
-                         const vector<string>& extra_master_flags = {});
+  void StartClusterOneTs(vector<string> extra_tserver_flags = {},
+                         vector<string> extra_master_flags = {});
 
-  vector<string> extra_tserver_flags_ = {};
 };
 
-void TsRecoveryITest::StartClusterOneTs(const vector<string>& extra_tserver_flags,
-                                        const vector<string>& extra_master_flags) {
-  StartCluster(extra_tserver_flags, extra_master_flags, 1 /* replicas */);
+void TsRecoveryITest::StartClusterOneTs(vector<string> extra_tserver_flags,
+                                        vector<string> extra_master_flags) {
+  ExternalMiniClusterOptions opts;
+  opts.num_tablet_servers = 1;
+  opts.block_manager_type = GetParam();
+  opts.extra_tserver_flags = std::move(extra_tserver_flags);
+  opts.extra_master_flags = std::move(extra_master_flags);
+  StartClusterWithOpts(opts);
 }
 
-#if defined(__APPLE__)
-static const char* kBlockManagerFlags[] = {"--block_manager=file"};
-#else
-static const char* kBlockManagerFlags[] = {"--block_manager=log",
-                                           "--block_manager=file"};
-#endif
+// Test for KUDU-2202 that ensures that blocks not found in the FS layer but
+// that are referenced by a tablet will not be reused.
+TEST_P(TsRecoveryITest, TestNoBlockIDReuseIfMissingBlocks) {
+  if (GetParam() != "log") {
+    LOG(INFO) << "Missing blocks is currently only supported by the log "
+                 "block manager. Exiting early!";
+    return;
+  }
+  // Set up a basic server that flushes often so we create blocks quickly.
+  NO_FATALS(StartClusterOneTs({
+    "--flush_threshold_secs=1",
+    "--flush_threshold_mb=1"
+  }));
+
+  // Write to the tserver and wait for some blocks to be written.
+  const auto StartSingleTabletWorkload = [&] (const string& table_name) {
+    TestWorkload* write_workload(new TestWorkload(cluster_.get()));
+    write_workload->set_table_name(table_name);
+    write_workload->set_num_tablets(1);
+    write_workload->set_num_replicas(1);
+    write_workload->Setup();
+    write_workload->Start();
+    return write_workload;
+  };
+
+  unique_ptr<TestWorkload> write_workload(StartSingleTabletWorkload("foo"));
+  unique_ptr<ExternalMiniClusterFsInspector> inspect(
+      new ExternalMiniClusterFsInspector(cluster_.get()));
+  vector<string> tablets = inspect->ListTabletsOnTS(0);
+  ASSERT_EQ(1, tablets.size());
+  const string tablet_id = tablets[0];
+
+  // Get the block ids for a given tablet.
+  const auto BlocksForTablet = [&] (const string& tablet_id,
+                                    vector<BlockId>* live_block_ids,
+                                    vector<BlockId>* orphaned_block_ids = nullptr) {
+    TabletSuperBlockPB superblock_pb;
+    RETURN_NOT_OK(inspect->ReadTabletSuperBlockOnTS(0, tablet_id, &superblock_pb));
+    if (orphaned_block_ids) {
+      orphaned_block_ids->clear();
+      for (const auto& pb : superblock_pb.orphaned_blocks()) {
+        orphaned_block_ids->push_back(BlockId::FromPB(pb));
+      }
+    }
+    live_block_ids->clear();
+    vector<BlockIdPB> block_id_pbs = TabletMetadata::CollectBlockIdPBs(superblock_pb);
+    for (const auto& pb : block_id_pbs) {
+      live_block_ids->push_back(BlockId::FromPB(pb));
+    }
+    return Status::OK();
+  };
+
+  // Wait until we have some live blocks and some orphaned blocks and collect
+  // all of the blocks ids.
+  vector<BlockId> block_ids;
+  vector<BlockId> orphaned_block_ids;
+  ASSERT_EVENTUALLY([&] () {
+    ASSERT_OK(BlocksForTablet(tablet_id, &block_ids, &orphaned_block_ids));
+    ASSERT_TRUE(!block_ids.empty());
+    ASSERT_TRUE(!orphaned_block_ids.empty());
+    block_ids.insert(block_ids.end(), orphaned_block_ids.begin(), orphaned_block_ids.end());
+  });
+  write_workload->StopAndJoin();
+  ExternalTabletServer* ts = cluster_->tablet_server(0);
+  ts->Shutdown();
 
-// Passes block manager types to the recovery test so we get some extra
-// testing to cover non-default block manager types.
-// Note that this could actually be any other flags you want to pass to
-// the cluster startup, as long as they don't conflict with any test
-// specific flags.
-INSTANTIATE_TEST_CASE_P(BlockManagerType,
-                        TsRecoveryITest,
-                        ::testing::ValuesIn(kBlockManagerFlags));
+  // Now empty the data directories of blocks. The server will start up, not
+  // read any blocks, but should still avoid the old tablet's blocks.
+  for (const string& dir : ts->data_dirs()) {
+    const string& data_dir = JoinPathSegments(dir, "data");
+    vector<string> children;
+    ASSERT_OK(env_->GetChildren(data_dir, &children));
+    for (const string& child : children) {
+      if (child != "." && child != ".." && child != fs::kInstanceMetadataFileName) {
+        ASSERT_OK(env_->DeleteFile(JoinPathSegments(data_dir, child)));
+      }
+    }
+  }
+  ASSERT_OK(ts->Restart());
+
+  // Sanity check that the tablet goes into a failed state with its missing blocks.
+  int64_t failed_on_ts = 0;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(itest::GetInt64Metric(ts->bound_http_hostport(),
+        &METRIC_ENTITY_server, nullptr, &METRIC_tablets_num_failed, "value", &failed_on_ts));
+    ASSERT_EQ(1, failed_on_ts);
+  });
+
+  // Create a new tablet and collect its blocks.
+  write_workload.reset(StartSingleTabletWorkload("bar"));
+  tablets = inspect->ListTabletsOnTS(0);
+  ASSERT_EQ(2, tablets.size());
+  const string& new_tablet_id = tablets[0] == tablet_id ? tablets[1] : tablets[0];
+  vector<BlockId> new_block_ids;
+  ASSERT_EVENTUALLY([&] () {
+    ASSERT_OK(BlocksForTablet(new_tablet_id, &new_block_ids));
+    ASSERT_TRUE(!new_block_ids.empty());
+  });
 
+  // Compare the tablets' block IDs and ensure there is no overlap.
+  vector<BlockId> block_id_intersection;
+  std::set_intersection(block_ids.begin(), block_ids.end(),
+                        new_block_ids.begin(), new_block_ids.end(),
+                        std::back_inserter(block_id_intersection));
+  {
+    SCOPED_TRACE(Substitute("First tablet's blocks: $0", BlockId::JoinStrings(block_ids)));
+    SCOPED_TRACE(Substitute("Second tablet's blocks: $0", BlockId::JoinStrings(new_block_ids)));
+    SCOPED_TRACE(Substitute("Intersection: $0", BlockId::JoinStrings(block_id_intersection)));
+    ASSERT_TRUE(block_id_intersection.empty());
+  }
+}
 // Test crashing a server just before appending a COMMIT message.
 // We then restart the server and ensure that all rows successfully
 // inserted before the crash are recovered.
 TEST_P(TsRecoveryITest, TestRestartWithOrphanedReplicates) {
-  // Add the block manager type.
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_));
+  NO_FATALS(StartClusterOneTs());
 
   TestWorkload work(cluster_.get());
   work.set_num_replicas(1);
@@ -180,7 +295,7 @@ TEST_P(TsRecoveryITest, TestRestartWithOrphanedReplicates) {
 // bootstrap to fail if that message only included errors and no
 // successful operations.
 TEST_P(TsRecoveryITest, TestRestartWithPendingCommitFromFailedOp) {
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_));
+  NO_FATALS(StartClusterOneTs());
   ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(0),
                               "fault_crash_before_append_commit", "0.01"));
 
@@ -219,9 +334,7 @@ TEST_P(TsRecoveryITest, TestRestartWithPendingCommitFromFailedOp) {
 
 // Test that we replay from the recovery directory, if it exists.
 TEST_P(TsRecoveryITest, TestCrashDuringLogReplay) {
-  vector<string> extra_tserver_flags_with_crash = extra_tserver_flags_;
-  extra_tserver_flags_with_crash.emplace_back("--fault_crash_during_log_replay=0.05");
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_with_crash));
+  NO_FATALS(StartClusterOneTs({ "--fault_crash_during_log_replay=0.05" }));
 
   TestWorkload work(cluster_.get());
   work.set_num_replicas(1);
@@ -268,9 +381,10 @@ TEST_P(TsRecoveryITest, TestCrashDuringLogReplay) {
 // but before writing its header, the TS would previously crash on restart.
 // Instead, it should ignore the uninitialized segment.
 TEST_P(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
-  extra_tserver_flags_.emplace_back("--log_segment_size_mb=1");
-  extra_tserver_flags_.emplace_back("--log_compression_codec=NO_COMPRESSION");
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_));
+  NO_FATALS(StartClusterOneTs({
+    "--log_segment_size_mb=1",
+    "--log_compression_codec=NO_COMPRESSION"
+  }));
   TestWorkload work(cluster_.get());
   work.set_num_replicas(1);
   work.set_write_timeout_millis(1000);
@@ -308,8 +422,7 @@ TEST_P(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
 TEST_P(TsRecoveryITest, TestChangeMaxCellSize) {
   // Prevent the master from tombstoning the evicted tablet so we can observe
   // its FAILED state.
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_,
-      { "--master_tombstone_evicted_tablet_replicas=false" }));
+  NO_FATALS(StartClusterOneTs({}, { "--master_tombstone_evicted_tablet_replicas=false" }));
   TestWorkload work(cluster_.get());
   work.set_num_replicas(1);
   work.set_payload_bytes(10000);
@@ -362,7 +475,7 @@ TEST_P(TsRecoveryITestDeathTest, TestRecoverFromOpIdOverflow) {
 
   // Create the initial tablet files on disk, then shut down the cluster so we
   // can meddle with the WAL.
-  NO_FATALS(StartClusterOneTs(extra_tserver_flags_));
+  NO_FATALS(StartClusterOneTs());
   TestWorkload workload(cluster_.get());
   workload.set_num_replicas(1);
   workload.Setup();
@@ -655,4 +768,9 @@ TEST_P(Kudu969Test, Test) {
   NO_FATALS(v.CheckCluster());
 }
 
+// Passes block manager types to the recovery test so we get some extra
+// testing to cover non-default block manager types.
+INSTANTIATE_TEST_CASE_P(BlockManagerType, TsRecoveryITest,
+    ::testing::ValuesIn(BlockManager::block_manager_types()));
+
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/diskrowset.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index b05015f..bc3c99a 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -338,7 +338,7 @@ Status RollingDiskRowSetWriter::RollWriter() {
   // Close current writer if it is open
   RETURN_NOT_OK(FinishCurrentWriter());
 
-  RETURN_NOT_OK(tablet_metadata_->CreateRowSet(&cur_drs_metadata_, schema_));
+  RETURN_NOT_OK(tablet_metadata_->CreateRowSet(&cur_drs_metadata_));
 
   cur_writer_.reset(new DiskRowSetWriter(cur_drs_metadata_.get(), &schema_, bloom_sizing_));
   RETURN_NOT_OK(cur_writer_->Open());

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/metadata-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/metadata-test.cc b/src/kudu/tablet/metadata-test.cc
index cf419ec..c213540 100644
--- a/src/kudu/tablet/metadata-test.cc
+++ b/src/kudu/tablet/metadata-test.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -22,7 +23,6 @@
 #include <gtest/gtest.h>
 
 #include "kudu/fs/block_id.h"
-#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/tablet/rowset_metadata.h"
@@ -31,6 +31,7 @@
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
+using std::unique_ptr;
 using std::vector;
 using std::string;
 using strings::Substitute;
@@ -54,7 +55,7 @@ class MetadataTest : public KuduTest {
  protected:
   vector<BlockId> all_blocks_;
   scoped_refptr<TabletMetadata> tablet_meta_;
-  gscoped_ptr<RowSetMetadata> meta_;
+  unique_ptr<RowSetMetadata> meta_;
 };
 
 // Swap out some deltas from the middle of the list

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/rowset_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_metadata.cc b/src/kudu/tablet/rowset_metadata.cc
index 7f26988..f459310 100644
--- a/src/kudu/tablet/rowset_metadata.cc
+++ b/src/kudu/tablet/rowset_metadata.cc
@@ -30,6 +30,7 @@
 #include "kudu/gutil/map-util.h"
 #include "kudu/tablet/metadata.pb.h"
 
+using std::unique_ptr;
 using std::vector;
 using strings::Substitute;
 
@@ -41,16 +42,16 @@ namespace tablet {
 // ============================================================================
 Status RowSetMetadata::Load(TabletMetadata* tablet_metadata,
                             const RowSetDataPB& pb,
-                            gscoped_ptr<RowSetMetadata>* metadata) {
-  gscoped_ptr<RowSetMetadata> ret(new RowSetMetadata(tablet_metadata));
+                            unique_ptr<RowSetMetadata>* metadata) {
+  unique_ptr<RowSetMetadata> ret(new RowSetMetadata(tablet_metadata));
   RETURN_NOT_OK(ret->InitFromPB(pb));
-  metadata->reset(ret.release());
+  *metadata = std::move(ret);
   return Status::OK();
 }
 
 Status RowSetMetadata::CreateNew(TabletMetadata* tablet_metadata,
                                  int64_t id,
-                                 gscoped_ptr<RowSetMetadata>* metadata) {
+                                 unique_ptr<RowSetMetadata>* metadata) {
   metadata->reset(new RowSetMetadata(tablet_metadata, id));
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/rowset_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_metadata.h b/src/kudu/tablet/rowset_metadata.h
index 9f1e62e..c546cb5 100644
--- a/src/kudu/tablet/rowset_metadata.h
+++ b/src/kudu/tablet/rowset_metadata.h
@@ -14,11 +14,12 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#ifndef KUDU_TABLET_ROWSET_METADATA_H
-#define KUDU_TABLET_ROWSET_METADATA_H
+
+#pragma once
 
 #include <cstdint>
 #include <map>
+#include <memory>
 #include <mutex>
 #include <string>
 #include <vector>
@@ -30,7 +31,6 @@
 #include "kudu/common/schema.h"
 #include "kudu/fs/block_id.h"
 #include "kudu/fs/fs_manager.h"
-#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/tablet/tablet_metadata.h"
@@ -75,12 +75,12 @@ class RowSetMetadata {
   // Create a new RowSetMetadata
   static Status CreateNew(TabletMetadata* tablet_metadata,
                           int64_t id,
-                          gscoped_ptr<RowSetMetadata>* metadata);
+                          std::unique_ptr<RowSetMetadata>* metadata);
 
   // Load metadata from a protobuf which was previously read from disk.
   static Status Load(TabletMetadata* tablet_metadata,
                      const RowSetDataPB& pb,
-                     gscoped_ptr<RowSetMetadata>* metadata);
+                     std::unique_ptr<RowSetMetadata>* metadata);
 
   Status Flush();
 
@@ -270,4 +270,3 @@ class RowSetMetadataUpdate {
 
 } // namespace tablet
 } // namespace kudu
-#endif /* KUDU_TABLET_ROWSET_METADATA_H */

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/tablet-test-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet-test-util.h b/src/kudu/tablet/tablet-test-util.h
index 7256502..1881df8 100644
--- a/src/kudu/tablet/tablet-test-util.h
+++ b/src/kudu/tablet/tablet-test-util.h
@@ -124,8 +124,7 @@ class KuduRowSetTest : public KuduTabletTest {
 
   virtual void SetUp() OVERRIDE {
     KuduTabletTest::SetUp();
-    ASSERT_OK(tablet()->metadata()->CreateRowSet(&rowset_meta_,
-                                                       SchemaBuilder(schema_).Build()));
+    ASSERT_OK(tablet()->metadata()->CreateRowSet(&rowset_meta_));
   }
 
   Status FlushMetadata() {

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 873eb10..d848c1f 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -69,6 +69,7 @@ using kudu::pb_util::SecureShortDebugString;
 using std::memory_order_relaxed;
 using std::shared_ptr;
 using std::string;
+using std::unique_ptr;
 using std::vector;
 using strings::Substitute;
 
@@ -393,17 +394,31 @@ Status TabletMetadata::LoadFromSuperBlock(const TabletSuperBlockPB& superblock)
 
     rowsets_.clear();
     for (const RowSetDataPB& rowset_pb : superblock.rowsets()) {
-      gscoped_ptr<RowSetMetadata> rowset_meta;
+      unique_ptr<RowSetMetadata> rowset_meta;
       RETURN_NOT_OK(RowSetMetadata::Load(this, rowset_pb, &rowset_meta));
       next_rowset_idx_ = std::max(next_rowset_idx_, rowset_meta->id() + 1);
       rowsets_.push_back(shared_ptr<RowSetMetadata>(rowset_meta.release()));
     }
 
+    // Determine the largest block ID known to the tablet metadata so we can
+    // notify the block manager of blocks it may have missed (e.g. if a data
+    // directory failed and the blocks on it were not read).
+    BlockId max_block_id;
+    const auto& block_ids = CollectBlockIds();
+    for (BlockId block_id : block_ids) {
+      max_block_id = std::max(max_block_id, block_id);
+    }
+
     for (const BlockIdPB& block_pb : superblock.orphaned_blocks()) {
-      orphaned_blocks.push_back(BlockId::FromPB(block_pb));
+      BlockId orphaned_block_id = BlockId::FromPB(block_pb);
+      max_block_id = std::max(max_block_id, orphaned_block_id);
+      orphaned_blocks.push_back(orphaned_block_id);
     }
     AddOrphanedBlocksUnlocked(orphaned_blocks);
 
+    // Notify the block manager of the highest block ID seen.
+    fs_manager()->block_manager()->NotifyBlockId(max_block_id);
+
     if (superblock.has_data_dir_group()) {
       RETURN_NOT_OK_PREPEND(fs_manager_->dd_manager()->LoadDataDirGroupFromPB(
           tablet_id_, superblock.data_dir_group()), "Failed to load DataDirGroup from superblock");
@@ -663,10 +678,9 @@ Status TabletMetadata::ToSuperBlockUnlocked(TabletSuperBlockPB* super_block,
   return Status::OK();
 }
 
-Status TabletMetadata::CreateRowSet(shared_ptr<RowSetMetadata> *rowset,
-                                    const Schema& schema) {
+Status TabletMetadata::CreateRowSet(shared_ptr<RowSetMetadata>* rowset) {
   AtomicWord rowset_idx = Barrier_AtomicIncrement(&next_rowset_idx_, 1) - 1;
-  gscoped_ptr<RowSetMetadata> scoped_rsm;
+  unique_ptr<RowSetMetadata> scoped_rsm;
   RETURN_NOT_OK(RowSetMetadata::CreateNew(this, rowset_idx, &scoped_rsm));
   rowset->reset(DCHECK_NOTNULL(scoped_rsm.release()));
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/kudu/blob/47b81c45/src/kudu/tablet/tablet_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h
index 447e6ce..e94101e 100644
--- a/src/kudu/tablet/tablet_metadata.h
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -216,7 +216,7 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> {
   // Create a new RowSetMetadata for this tablet.
   // Does not add the new rowset to the list of rowsets. Use one of the Update()
   // calls to do so.
-  Status CreateRowSet(std::shared_ptr<RowSetMetadata> *rowset, const Schema& schema);
+  Status CreateRowSet(std::shared_ptr<RowSetMetadata>* rowset);
 
   const RowSetMetadataVector& rowsets() const { return rowsets_; }