You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2017/10/21 01:10:16 UTC

kudu git commit: log block manager: return early if read-only

Repository: kudu
Updated Branches:
  refs/heads/master 4172983fd -> ac18aca75


log block manager: return early if read-only

A container may be used for multiple block transactions, and when any
transaction is committed, the blocks in that transaction are closed.  If
blocks in a container fail to close, the container is marked read-only,
as this may cause on-disk inconsistencies.

If there are multiple transactions affecting the same container, a
container may be marked read-only by one transaction while still being
written to by another, leading to a DCHECK failure.

This patch changes this by storing a Status to indicate read-only
instead of an AtomicBool. Functions that previously DCHECKed the
container was not read-only now return early with the stored error and
appropriate error-handling. Additionally, a couple of checks for
read-only have been moved out of the blocks and into to the containers
themselves.

A test is added that shares a single container among multiple
transactions, injects failure in one, and ensures the failure is visible
across the other transactions.

Change-Id: I06412a36ed62fe1e3a2307a4a2e7a0a7e3d65702
Reviewed-on: http://gerrit.cloudera.org:8080/8338
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Hao Hao <ha...@cloudera.com>
Tested-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/ac18aca7
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/ac18aca7
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/ac18aca7

Branch: refs/heads/master
Commit: ac18aca755daae6b9f2826b71f4a458e2b0fefa1
Parents: 4172983
Author: Andrew Wong <aw...@cloudera.com>
Authored: Wed Oct 18 10:58:06 2017 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Sat Oct 21 01:10:02 2017 +0000

----------------------------------------------------------------------
 src/kudu/fs/log_block_manager-test.cc | 56 ++++++++++++++++++++
 src/kudu/fs/log_block_manager.cc      | 85 ++++++++++++++++--------------
 src/kudu/fs/log_block_manager.h       | 26 +++++++--
 3 files changed, 123 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index 9a23c0c..929cb02 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -834,6 +834,62 @@ TEST_F(LogBlockManagerTest, StartupBenchmark) {
 }
 #endif
 
+TEST_F(LogBlockManagerTest, TestFailMultipleTransactionsPerContainer) {
+  // Create multiple transactions that will share a container.
+  const int kNumTransactions = 3;
+  vector<unique_ptr<BlockCreationTransaction>> block_transactions;
+  for (int i = 0; i < kNumTransactions; i++) {
+    block_transactions.emplace_back(bm_->NewCreationTransaction());
+  }
+
+  // Repeatedly add new blocks for the transactions. Finalizing each block
+  // makes the block's container available, allowing the same container to be
+  // reused by the next block.
+  const int kNumBlocks = 10;
+  for (int i = 0; i < kNumBlocks; i++) {
+    unique_ptr<WritableBlock> block;
+    ASSERT_OK_FAST(bm_->CreateBlock(test_block_opts_, &block));
+    ASSERT_OK_FAST(block->Append("x"));
+    ASSERT_OK_FAST(block->Finalize());
+    block_transactions[i % kNumTransactions]->AddCreatedBlock(std::move(block));
+  }
+  ASSERT_EQ(1, bm_->all_containers_by_name_.size());
+
+  // Briefly inject an error while committing one of the transactions. This
+  // should make the container read-only, preventing the remaining transactions
+  // from proceeding.
+  FLAGS_crash_on_eio = false;
+  FLAGS_env_inject_eio = 1.0;
+  Status s = block_transactions[0]->CommitCreatedBlocks();
+  ASSERT_TRUE(s.IsIOError());
+  FLAGS_env_inject_eio = 0;
+
+  // Now try to add some more blocks.
+  for (int i = 0; i < kNumTransactions; i++) {
+    unique_ptr<WritableBlock> block;
+    ASSERT_OK_FAST(bm_->CreateBlock(test_block_opts_, &block));
+
+    // The first write will fail, as the container has been marked read-only.
+    // This will leave the container unavailable and force the creation of a
+    // new container.
+    Status s = block->Append("x");
+    if (i == 0) {
+      ASSERT_TRUE(s.IsIOError());
+    } else {
+      ASSERT_OK_FAST(s);
+      ASSERT_OK_FAST(block->Finalize());
+    }
+    block_transactions[i]->AddCreatedBlock(std::move(block));
+  }
+  ASSERT_EQ(2, bm_->all_containers_by_name_.size());
+
+  // At this point, all of the transactions have blocks in read-only containers
+  // and, thus, will be unable to commit.
+  for (const auto& block_transaction : block_transactions) {
+    ASSERT_TRUE(block_transaction->CommitCreatedBlocks().IsIOError());
+  }
+}
+
 TEST_F(LogBlockManagerTest, TestLookupBlockLimit) {
   int64_t limit_1024 = LogBlockManager::LookupBlockLimit(1024);
   int64_t limit_2048 = LogBlockManager::LookupBlockLimit(2048);

http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index da9b49b..9bfe88d 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -58,7 +58,6 @@
 #include "kudu/gutil/walltime.h"
 #include "kudu/util/alignment.h"
 #include "kudu/util/array_view.h"
-#include "kudu/util/atomic.h"
 #include "kudu/util/env.h"
 #include "kudu/util/file_cache.h"
 #include "kudu/util/flag_tags.h"
@@ -461,21 +460,33 @@ class LogBlockContainer {
   // truncates the container if full and marks the container as available.
   void FinalizeBlock(int64_t block_offset, int64_t block_length);
 
-  // Run a task on this container's data directory thread pool.
+  // Runs a task on this container's data directory thread pool.
   //
   // Normally the task is performed asynchronously. However, if submission to
   // the pool fails, it runs synchronously on the current thread.
   void ExecClosure(const Closure& task);
 
-  // Mark the container to be read-only.
-  void SetReadOnly();
-
   // Produces a debug-friendly string representation of this container.
   string ToString() const;
 
+  // Makes the container read-only and stores the responsible error.
+  void SetReadOnly(const Status& error);
+
   // Handles errors if the input status is not OK.
   void HandleError(const Status& s) const;
 
+  // Returns whether or not the container has been marked read-only.
+  bool read_only() const {
+    return !read_only_status().ok();
+  }
+
+  // Returns the error that caused the container to become read-only, or OK if
+  // the container has not been marked read-only.
+  Status read_only_status() const {
+    std::lock_guard<simple_spinlock> l(read_only_lock_);
+    return read_only_status_;
+  }
+
   // Simple accessors.
   LogBlockManager* block_manager() const { return block_manager_; }
   int64_t next_block_offset() const { return next_block_offset_.Load(); }
@@ -491,7 +502,6 @@ class LogBlockContainer {
   const LogBlockManagerMetrics* metrics() const { return metrics_; }
   DataDir* data_dir() const { return data_dir_; }
   const PathInstanceMetadataPB* instance() const { return data_dir_->instance()->metadata(); }
-  bool read_only() const { return read_only_.Load(); }
 
  private:
   LogBlockContainer(LogBlockManager* block_manager, DataDir* data_dir,
@@ -570,7 +580,8 @@ class LogBlockContainer {
   // If true, only read operations are allowed. Existing blocks may
   // not be deleted until the next restart, and new blocks may not
   // be added.
-  AtomicBool read_only_;
+  mutable simple_spinlock read_only_lock_;
+  Status read_only_status_;
 
   DISALLOW_COPY_AND_ASSIGN(LogBlockContainer);
 };
@@ -592,8 +603,7 @@ LogBlockContainer::LogBlockContainer(
       live_bytes_(0),
       live_bytes_aligned_(0),
       live_blocks_(0),
-      metrics_(block_manager->metrics()),
-      read_only_(false) {
+      metrics_(block_manager->metrics()) {
 }
 
 void LogBlockContainer::HandleError(const Status& s) const {
@@ -736,6 +746,8 @@ Status LogBlockContainer::Open(LogBlockManager* block_manager,
 }
 
 Status LogBlockContainer::TruncateDataToNextBlockOffset() {
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
+
   if (full()) {
     VLOG(2) << Substitute("Truncating container $0 to offset $1",
                           ToString(), next_block_offset());
@@ -882,7 +894,6 @@ Status LogBlockContainer::ProcessRecord(
 
 Status LogBlockContainer::DoCloseBlocks(const vector<LogWritableBlock*>& blocks,
                                         SyncMode mode) {
-  DCHECK(!read_only());
   auto sync_blocks = [&]() -> Status {
     if (mode == SYNC) {
       VLOG(3) << "Syncing data file " << data_file_->filename();
@@ -912,16 +923,16 @@ Status LogBlockContainer::DoCloseBlocks(const vector<LogWritableBlock*>& blocks,
 
   Status s = sync_blocks();
   if (!s.ok()) {
-    // Mark container as read-only to forbid further writes in case of
-    // failure. Because the on-disk state may contain partial/complete
-    // data/metadata at this point, it is not safe to either overwrite
-    // it or append to it.
-    SetReadOnly();
+    // Make container read-only to forbid further writes in case of failure.
+    // Because the on-disk state may contain partial/incomplete data/metadata at
+    // this point, it is not safe to either overwrite it or append to it.
+    SetReadOnly(s);
   }
   return s;
 }
 
 Status LogBlockContainer::PunchHole(int64_t offset, int64_t length) {
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   DCHECK_GE(offset, 0);
   DCHECK_GE(length, 0);
 
@@ -939,7 +950,7 @@ Status LogBlockContainer::WriteData(int64_t offset, const Slice& data) {
 }
 
 Status LogBlockContainer::WriteVData(int64_t offset, ArrayView<const Slice> data) {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   DCHECK_GE(offset, next_block_offset());
 
   RETURN_NOT_OK_HANDLE_ERROR(data_file_->WriteV(offset, data));
@@ -969,7 +980,7 @@ Status LogBlockContainer::ReadVData(int64_t offset, ArrayView<Slice> results) co
 }
 
 Status LogBlockContainer::AppendMetadata(const BlockRecordPB& pb) {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   // Note: We don't check for sufficient disk space for metadata writes in
   // order to allow for block deletion on full disks.
   RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Append(pb));
@@ -977,7 +988,7 @@ Status LogBlockContainer::AppendMetadata(const BlockRecordPB& pb) {
 }
 
 Status LogBlockContainer::FlushData(int64_t offset, int64_t length) {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   DCHECK_GE(offset, 0);
   DCHECK_GE(length, 0);
   RETURN_NOT_OK_HANDLE_ERROR(data_file_->Flush(RWFile::FLUSH_ASYNC, offset, length));
@@ -985,13 +996,13 @@ Status LogBlockContainer::FlushData(int64_t offset, int64_t length) {
 }
 
 Status LogBlockContainer::FlushMetadata() {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Flush());
   return Status::OK();
 }
 
 Status LogBlockContainer::SyncData() {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   if (FLAGS_enable_data_block_fsync) {
     if (metrics_) metrics_->generic_metrics.total_disk_sync->Increment();
     RETURN_NOT_OK_HANDLE_ERROR(data_file_->Sync());
@@ -1000,7 +1011,7 @@ Status LogBlockContainer::SyncData() {
 }
 
 Status LogBlockContainer::SyncMetadata() {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   if (FLAGS_enable_data_block_fsync) {
     if (metrics_) metrics_->generic_metrics.total_disk_sync->Increment();
     RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Sync());
@@ -1023,7 +1034,7 @@ Status LogBlockContainer::ReopenMetadataWriter() {
 
 Status LogBlockContainer::EnsurePreallocated(int64_t block_start_offset,
                                              size_t next_append_length) {
-  DCHECK(!read_only());
+  RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
   DCHECK_GE(block_start_offset, 0);
 
   if (!FLAGS_log_container_preallocate_bytes) {
@@ -1068,7 +1079,6 @@ void LogBlockContainer::FinalizeBlock(int64_t block_offset, int64_t block_length
 }
 
 void LogBlockContainer::UpdateNextBlockOffset(int64_t block_offset, int64_t block_length) {
-  DCHECK(!read_only());
   DCHECK_GE(block_offset, 0);
 
   // The log block manager maintains block contiguity as an invariant, which
@@ -1094,7 +1104,6 @@ void LogBlockContainer::UpdateNextBlockOffset(int64_t block_offset, int64_t bloc
 }
 
 void LogBlockContainer::BlockCreated(const scoped_refptr<LogBlock>& block) {
-  DCHECK(!read_only());
   DCHECK_GE(block->offset(), 0);
 
   total_bytes_.IncrementBy(block->fs_aligned_length());
@@ -1105,7 +1114,6 @@ void LogBlockContainer::BlockCreated(const scoped_refptr<LogBlock>& block) {
 }
 
 void LogBlockContainer::BlockDeleted(const scoped_refptr<LogBlock>& block) {
-  DCHECK(!read_only());
   DCHECK_GE(block->offset(), 0);
 
   live_bytes_.IncrementBy(-block->length());
@@ -1124,8 +1132,12 @@ string LogBlockContainer::ToString() const {
   return s;
 }
 
-void LogBlockContainer::SetReadOnly() {
-  read_only_.Store(true);
+void LogBlockContainer::SetReadOnly(const Status& error) {
+  DCHECK(!error.ok());
+  LOG(WARNING) << Substitute("Container $0 being marked read-only: $1",
+                             ToString(), error.ToString());
+  std::lock_guard<simple_spinlock> l(read_only_lock_);
+  read_only_status_ = error;
 }
 
 ///////////////////////////////////////////////////////////
@@ -1328,8 +1340,8 @@ Status LogWritableBlock::Abort() {
             block_length_);
       }
     }
-    return Status::Aborted(Substitute("container $0 is read-only",
-                                      container_->ToString()));
+    return container_->read_only_status().CloneAndPrepend(
+        Substitute("container $0 is read-only", container_->ToString()));
   }
 
   // Close the block and then delete it. Theoretically, we could do nothing
@@ -1381,14 +1393,6 @@ Status LogWritableBlock::AppendV(ArrayView<const Slice> data) {
   DCHECK(state_ == CLEAN || state_ == DIRTY)
       << "Invalid state: " << state_;
 
-  // This could be true if Finalize() is successful but DoCloseBlocks() is not.
-  // That means the container is read-only but marked as available. Another
-  // block can actually try to write to it.
-  if (container_->read_only()) {
-    return Status::IllegalState(Substitute("container $0 is read-only",
-                                           container_->ToString()));
-  }
-
   // Calculate the amount of data to write
   size_t data_size = accumulate(data.begin(), data.end(), static_cast<size_t>(0),
                                 [&](int sum, const Slice& curr) {
@@ -1835,10 +1839,8 @@ Status LogBlockManager::DeleteBlock(const BlockId& block_id) {
     }
 
     lb = it->second;
-    if (lb->container()->read_only()) {
-      return Status::IllegalState(Substitute("container $0 is read-only",
-                                             lb->container()->ToString()));
-    }
+    HANDLE_DISK_FAILURE(lb->container()->read_only_status(),
+        error_manager_->RunErrorNotificationCb(lb->container()->data_dir()));
 
     // Return early if deleting a block in a failed directory.
     set<int> failed_dirs = dd_manager_->GetFailedDataDirs();
@@ -1963,6 +1965,7 @@ void LogBlockManager::MakeContainerAvailableUnlocked(LogBlockContainer* containe
   if (container->full() || container->read_only()) {
     return;
   }
+  VLOG(3) << Substitute("container $0 being made available", container->ToString());
   available_containers_by_data_dir_[container->data_dir()].push_front(container);
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.h b/src/kudu/fs/log_block_manager.h
index 4b0f7b2..b37a62d 100644
--- a/src/kudu/fs/log_block_manager.h
+++ b/src/kudu/fs/log_block_manager.h
@@ -101,9 +101,26 @@ struct LogBlockManagerMetrics;
 // deleted in a GC.
 //
 // Care is taken to keep the in-memory representation of the block manager
-// in synch with its persistent representation. To wit, a block is only
-// made available in memory if _all_ on-disk operations (including any
-// necessary synchronization calls) are successful.
+// in sync with its persistent representation. To wit, a block is only made
+// available in memory if _all_ on-disk operations (including any necessary
+// synchronization calls) are successful.
+//
+// Writes to containers are batched together through the use of block
+// transactions: each writer will take ownership of an "available" container,
+// write a block to the container, and release ownership of the container once
+// the writer "finalizes" the block, making the container available to other
+// writers. This can happen concurrently; multiple transactions can interleave
+// writes to single container, provided each writer finalizes its block before
+// the next writer reaches for a container. Once any of the writers is
+// completely done with its IO, it can commit its transaction, syncing its
+// blocks and the container to disk (potentially as others are writing!).
+//
+// In order to maintain on-disk consistency, if the above commit fails, the
+// entire container is marked read-only, and any future writes to the container
+// will fail. There is a tradeoff here to note--having concurrent writers
+// grants better utilization for each container; however a failure to sync by
+// any of the writers will cause the others to fail and potentially corrupt the
+// underlying container.
 //
 // When a new block is created, a container is selected from the data
 // directory group appropriate for the block, as indicated by hints in
@@ -156,6 +173,8 @@ struct LogBlockManagerMetrics;
 //   similarly recoverable errors).
 // - Evaluate and implement a solution for data integrity (e.g. per-block
 //   checksum).
+// - Change the availability semantics to only mark a container as available if
+//   the current writer has committed and synced its transaction.
 
 // The log-backed block manager.
 class LogBlockManager : public BlockManager {
@@ -197,6 +216,7 @@ class LogBlockManager : public BlockManager {
   FRIEND_TEST(LogBlockManagerTest, TestMetadataTruncation);
   FRIEND_TEST(LogBlockManagerTest, TestParseKernelRelease);
   FRIEND_TEST(LogBlockManagerTest, TestReuseBlockIds);
+  FRIEND_TEST(LogBlockManagerTest, TestFailMultipleTransactionsPerContainer);
 
   friend class internal::LogBlockContainer;
   friend class internal::LogBlockDeletionTransaction;