You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2017/09/25 18:52:04 UTC

kudu git commit: KUDU-2055 [part 1]: Add fs::BlockDeletionTransaction API

Repository: kudu
Updated Branches:
  refs/heads/master 9c1997ad3 -> 26a32fc4b


KUDU-2055 [part 1]: Add fs::BlockDeletionTransaction API

Similar to 'BlockCreationTransaction', this patch adds a new layer of
abstraction at Block Manager to coalesce blocks deletions in a logical
operation, e.g. compaction.

By coalescing blocks deletions, the number of holes punched in LBM will
be reduced from one per block to one per log block container in the
best case (that the individual holes in one container are all
contiguous). This should overall optimize the performance of operation
that involves batch deletions, such as tablet deletion and hole
repunching.

This patch is the first part of a series. It only adds the new
abstraction 'BlockDeletionTransaction', and doesn't yet change any
deletion semantics or improve performance.

Change-Id: Iecb252b3a5665d5471bb82301d0c8012a68de959
Reviewed-on: http://gerrit.cloudera.org:8080/7656
Tested-by: Kudu Jenkins
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/26a32fc4
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/26a32fc4
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/26a32fc4

Branch: refs/heads/master
Commit: 26a32fc4b01a2bb9c552ae586060bc0ba97b53c1
Parents: 9c1997a
Author: hahao <ha...@cloudera.com>
Authored: Thu Sep 21 11:40:36 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Mon Sep 25 18:51:48 2017 +0000

----------------------------------------------------------------------
 src/kudu/cfile/bloomfile.cc            |  2 +-
 src/kudu/cfile/cfile-test.cc           |  2 +-
 src/kudu/cfile/cfile_writer.cc         |  2 +-
 src/kudu/cfile/cfile_writer.h          |  2 +
 src/kudu/fs/block_manager-test.cc      | 35 ++++++++++++++--
 src/kudu/fs/block_manager.h            | 63 +++++++++++++++++++++++++++--
 src/kudu/fs/log_block_manager-test.cc  | 10 ++---
 src/kudu/tablet/deltafile.cc           |  2 +-
 src/kudu/tablet/diskrowset.cc          |  6 ++-
 src/kudu/tablet/multi_column_writer.cc |  2 +-
 src/kudu/tserver/tablet_copy_client.cc |  3 +-
 11 files changed, 109 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/cfile/bloomfile.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bloomfile.cc b/src/kudu/cfile/bloomfile.cc
index d4a6d90..cb08687 100644
--- a/src/kudu/cfile/bloomfile.cc
+++ b/src/kudu/cfile/bloomfile.cc
@@ -113,7 +113,7 @@ Status BloomFileWriter::Start() {
 }
 
 Status BloomFileWriter::Finish() {
-  BlockCreationTransaction transaction;
+  BlockCreationTransaction transaction(writer_->block()->block_manager());
   RETURN_NOT_OK(FinishAndReleaseBlock(&transaction));
   return transaction.CommitCreatedBlocks();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/cfile/cfile-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile-test.cc b/src/kudu/cfile/cfile-test.cc
index 82e71a2..d30787d 100644
--- a/src/kudu/cfile/cfile-test.cc
+++ b/src/kudu/cfile/cfile-test.cc
@@ -934,7 +934,7 @@ TEST_P(TestCFileBothCacheTypes, TestReleaseBlock) {
   WriterOptions opts;
   CFileWriter w(opts, GetTypeInfo(STRING), false, std::move(sink));
   ASSERT_OK(w.Start());
-  fs::BlockCreationTransaction transaction;
+  fs::BlockCreationTransaction transaction(fs_manager_->block_manager());
   ASSERT_OK(w.FinishAndReleaseBlock(&transaction));
   ASSERT_EQ(1, transaction.created_blocks().size());
   ASSERT_EQ(WritableBlock::FINALIZED, transaction.created_blocks()[0]->state());

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/cfile/cfile_writer.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_writer.cc b/src/kudu/cfile/cfile_writer.cc
index c438585..9dd30ac 100644
--- a/src/kudu/cfile/cfile_writer.cc
+++ b/src/kudu/cfile/cfile_writer.cc
@@ -203,7 +203,7 @@ Status CFileWriter::Start() {
 
 Status CFileWriter::Finish() {
   TRACE_EVENT0("cfile", "CFileWriter::Finish");
-  BlockCreationTransaction transaction;
+  BlockCreationTransaction transaction(block_->block_manager());
   RETURN_NOT_OK(FinishAndReleaseBlock(&transaction));
   return transaction.CommitCreatedBlocks();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/cfile/cfile_writer.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_writer.h b/src/kudu/cfile/cfile_writer.h
index f9f703b..d23ecef 100644
--- a/src/kudu/cfile/cfile_writer.h
+++ b/src/kudu/cfile/cfile_writer.h
@@ -176,6 +176,8 @@ class CFileWriter {
 
   std::string ToString() const { return block_->id().ToString(); }
 
+  fs::WritableBlock* block() const { return block_.get(); }
+
   // Wrapper for AddBlock() to append the dictionary block to the end of a Cfile.
   Status AppendDictBlock(const std::vector<Slice> &data_slices,
                          BlockPointer *block_ptr,

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/fs/block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager-test.cc b/src/kudu/fs/block_manager-test.cc
index bb363db..34a102c 100644
--- a/src/kudu/fs/block_manager-test.cc
+++ b/src/kudu/fs/block_manager-test.cc
@@ -132,7 +132,7 @@ class BlockManagerTest : public KuduTest {
     int num_blocks = num_dirs * num_blocks_per_dir;
 
     // Write 'num_blocks' blocks to this data dir group.
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < num_blocks; i++) {
       unique_ptr<WritableBlock> written_block;
       ASSERT_OK(bm_->CreateBlock(opts, &written_block));
@@ -349,7 +349,7 @@ void BlockManagerTest<LogBlockManager>::RunMultipathTest(const vector<string>& p
   ASSERT_OK(dd_manager_->CreateDataDirGroup("multipath_test"));
 
   const char* kTestData = "test data";
-  BlockCreationTransaction transaction;
+  BlockCreationTransaction transaction(bm_.get());
   // Creates (numPaths * 2) containers.
   for (int j = 0; j < paths.size() * 2; j++) {
     unique_ptr<WritableBlock> block;
@@ -1065,7 +1065,7 @@ TYPED_TEST(BlockManagerTest, TestBlockTransaction) {
   // Create a BlockCreationTransaction. In this transaction,
   // create some blocks and commit the writes all together.
   const string kTestData = "test data";
-  BlockCreationTransaction creation_transaction;
+  BlockCreationTransaction creation_transaction(this->bm_.get());
   vector<BlockId> created_blocks;
   for (int i = 0; i < 20; i++) {
     unique_ptr<WritableBlock> writer;
@@ -1091,6 +1091,35 @@ TYPED_TEST(BlockManagerTest, TestBlockTransaction) {
     ASSERT_OK(reader->Read(0, data));
     ASSERT_EQ(kTestData, data);
   }
+
+  // Create a BlockDeletionTransaction. In this transaction,
+  // randomly delete almost half of the created blocks.
+  BlockDeletionTransaction deletion_transaction(this->bm_.get());
+  for (const auto& block : created_blocks) {
+    if (rand() % 2) deletion_transaction.AddDeletedBlock(block);
+  }
+  vector<BlockId> deleted_blocks;
+  ASSERT_OK(deletion_transaction.CommitDeletedBlocks(&deleted_blocks));
+  for (const auto& block : deleted_blocks) {
+    created_blocks.erase(std::remove(created_blocks.begin(), created_blocks.end(), block),
+                         created_blocks.end());
+    ASSERT_TRUE(this->bm_->OpenBlock(block, nullptr).IsNotFound());
+  }
+
+  // Delete the rest of created blocks. But force the operations to fail,
+  // in order to test that the first failure properly propagates.
+  FLAGS_crash_on_eio = false;
+  FLAGS_env_inject_eio = 1.0;
+  for (const auto& block : created_blocks) {
+    deletion_transaction.AddDeletedBlock(block);
+  }
+  deleted_blocks.clear();
+  Status s = deletion_transaction.CommitDeletedBlocks(&deleted_blocks);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STR_CONTAINS(s.ToString(), Substitute("only deleted $0 blocks, "
+                                               "first failure",
+                                               deleted_blocks.size()));
+  FLAGS_env_inject_eio = 0;
 }
 
 } // namespace fs

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/fs/block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager.h b/src/kudu/fs/block_manager.h
index e6da595..d682546 100644
--- a/src/kudu/fs/block_manager.h
+++ b/src/kudu/fs/block_manager.h
@@ -24,7 +24,10 @@
 #include <utility>
 #include <vector>
 
+#include "kudu/fs/block_id.h"
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/util/array_view.h"
 #include "kudu/util/status.h"
 
 namespace kudu {
@@ -268,6 +271,10 @@ class BlockManager {
 //  2) to be able to track all blocks created in one logical operation.
 class BlockCreationTransaction {
  public:
+  explicit BlockCreationTransaction(BlockManager* bm)
+      : bm_(bm) {
+  }
+
   void AddCreatedBlock(std::unique_ptr<WritableBlock> block) {
     created_blocks_.emplace_back(std::move(block));
   }
@@ -279,10 +286,7 @@ class BlockCreationTransaction {
       return Status::OK();
     }
 
-    // We assume every block is using the same block manager, so any
-    // block's manager will do.
-    BlockManager* bm = created_blocks_[0]->block_manager();
-    Status s = bm->CloseBlocks(created_blocks_);
+    Status s = bm_->CloseBlocks(created_blocks_);
     if (s.ok()) created_blocks_.clear();
     return s;
   }
@@ -292,9 +296,60 @@ class BlockCreationTransaction {
   }
 
  private:
+  // The owning BlockManager. Must outlive the BlockCreationTransaction.
+  BlockManager* bm_;
   std::vector<std::unique_ptr<WritableBlock>> created_blocks_;
 };
 
+// Group a set of block deletions together in a transaction. Similar to
+// BlockCreationTransaction, this has two major motivations:
+//  1) the underlying block manager can optimize deletions for a batch
+//     of blocks if possible to achieve better performance.
+//  2) to be able to track all blocks deleted in one logical operation.
+class BlockDeletionTransaction {
+ public:
+  explicit BlockDeletionTransaction(BlockManager* bm)
+      : bm_(bm) {
+  }
+
+  void AddDeletedBlock(BlockId block) {
+    deleted_blocks_.emplace_back(block);
+  }
+
+  // Deletes a group of blocks given the block IDs, the actual deletion will take
+  // place after the last open reader or writer is closed for each block that needs
+  // be to deleted. The 'deleted' out parameter will be set with the list of block
+  // IDs that were successfully deleted, regardless of the value of returned 'status'
+  // is OK or error.
+  //
+  // Returns the first deletion failure that was seen, if any.
+  Status CommitDeletedBlocks(std::vector<BlockId>* deleted) {
+    Status first_failure;
+    for (BlockId block : deleted_blocks_) {
+      Status s = bm_->DeleteBlock(block);
+      // If we get NotFound, then the block was already deleted.
+      if (!s.ok() && !s.IsNotFound()) {
+        if (first_failure.ok()) first_failure = s;
+      } else {
+        deleted->emplace_back(block);
+      }
+    }
+
+    if (!first_failure.ok()) {
+      first_failure = first_failure.CloneAndPrepend(strings::Substitute("only deleted $0 blocks, "
+                                                                        "first failure",
+                                                                        deleted->size()));
+    }
+    deleted_blocks_.clear();
+    return first_failure;
+  }
+
+ private:
+  // The owning BlockManager. Must outlive the BlockDeletionTransaction.
+  BlockManager* bm_;
+  std::vector<BlockId> deleted_blocks_;
+};
+
 // Compute an upper bound for a file cache embedded within a block manager
 // using resource limits obtained from the system.
 int64_t GetFileCacheCapacityForBlockManager(Env* env);

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index b870623..634f7db 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -290,7 +290,7 @@ TEST_F(LogBlockManagerTest, MetricsTest) {
   BlockId saved_id;
   {
     Random rand(SeedRandom());
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < 10; i++) {
       unique_ptr<WritableBlock> b;
       ASSERT_OK(bm_->CreateBlock(test_block_opts_, &b));
@@ -377,7 +377,7 @@ TEST_F(LogBlockManagerTest, TestReuseBlockIds) {
 
   // Create 4 containers, with the first four block IDs in the sequence.
   {
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < 4; i++) {
       unique_ptr<WritableBlock> writer;
       ASSERT_OK(bm_->CreateBlock(test_block_opts_, &writer));
@@ -806,7 +806,7 @@ TEST_F(LogBlockManagerTest, StartupBenchmark) {
   const int kNumBlocks = AllowSlowTests() ? 1000000 : 1000;
   // Creates 'kNumBlocks' blocks with minimal data.
   {
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < kNumBlocks; i++) {
       unique_ptr<WritableBlock> block;
       ASSERT_OK_FAST(bm_->CreateBlock(test_block_opts_, &block));
@@ -986,7 +986,7 @@ TEST_F(LogBlockManagerTest, TestRepairPreallocateExcessSpace) {
 
   // Create several full containers.
   {
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < kNumContainers; i++) {
       unique_ptr<WritableBlock> block;
       ASSERT_OK(bm_->CreateBlock(test_block_opts_, &block));
@@ -1179,7 +1179,7 @@ TEST_F(LogBlockManagerTest, TestRepairPartialRecords) {
 
   // Create some containers.
   {
-    BlockCreationTransaction transaction;
+    BlockCreationTransaction transaction(bm_.get());
     for (int i = 0; i < kNumContainers; i++) {
       unique_ptr<WritableBlock> block;
       ASSERT_OK(bm_->CreateBlock(test_block_opts_, &block));

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/tablet/deltafile.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/deltafile.cc b/src/kudu/tablet/deltafile.cc
index 32004d2..37f9f51 100644
--- a/src/kudu/tablet/deltafile.cc
+++ b/src/kudu/tablet/deltafile.cc
@@ -113,7 +113,7 @@ Status DeltaFileWriter::Start() {
 }
 
 Status DeltaFileWriter::Finish() {
-  BlockCreationTransaction transaction;
+  BlockCreationTransaction transaction(writer_->block()->block_manager());
   RETURN_NOT_OK(FinishAndReleaseBlock(&transaction));
   return transaction.CommitCreatedBlocks();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/tablet/diskrowset.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index d67644a..60d808a 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -227,7 +227,8 @@ Status DiskRowSetWriter::AppendBlock(const RowBlock &block) {
 
 Status DiskRowSetWriter::Finish() {
   TRACE_EVENT0("tablet", "DiskRowSetWriter::Finish");
-  BlockCreationTransaction transaction;
+  FsManager* fs = rowset_metadata_->fs_manager();
+  BlockCreationTransaction transaction(fs->block_manager());
   RETURN_NOT_OK(FinishAndReleaseBlocks(&transaction));
   return transaction.CommitCreatedBlocks();
 }
@@ -315,7 +316,8 @@ RollingDiskRowSetWriter::RollingDiskRowSetWriter(
       row_idx_in_cur_drs_(0),
       can_roll_(false),
       written_count_(0),
-      written_size_(0) {
+      written_size_(0),
+      block_transaction_(tablet_metadata->fs_manager()->block_manager()) {
   CHECK(schema.has_column_ids());
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/tablet/multi_column_writer.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/multi_column_writer.cc b/src/kudu/tablet/multi_column_writer.cc
index 1ec669b..9412bdb 100644
--- a/src/kudu/tablet/multi_column_writer.cc
+++ b/src/kudu/tablet/multi_column_writer.cc
@@ -119,7 +119,7 @@ Status MultiColumnWriter::AppendBlock(const RowBlock& block) {
 }
 
 Status MultiColumnWriter::Finish() {
-  BlockCreationTransaction transaction;
+  BlockCreationTransaction transaction(fs_->block_manager());
   RETURN_NOT_OK(FinishAndReleaseBlocks(&transaction));
   return transaction.CommitCreatedBlocks();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/26a32fc4/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 6cab0f7..fbf3244 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -156,7 +156,8 @@ TabletCopyClient::TabletCopyClient(std::string tablet_id,
       session_idle_timeout_millis_(FLAGS_tablet_copy_begin_session_timeout_ms),
       start_time_micros_(0),
       rng_(GetRandomSeed32()),
-      tablet_copy_metrics_(tablet_copy_metrics) {
+      tablet_copy_metrics_(tablet_copy_metrics),
+      transaction_(fs_manager->block_manager()) {
   if (tablet_copy_metrics_) {
     tablet_copy_metrics_->open_client_sessions->Increment();
   }