You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2017/10/21 01:10:16 UTC
kudu git commit: log block manager: return early if read-only
Repository: kudu
Updated Branches:
refs/heads/master 4172983fd -> ac18aca75
log block manager: return early if read-only
A container may be used for multiple block transactions, and when any
transaction is committed, the blocks in that transaction are closed. If
blocks in a container fail to close, the container is marked read-only,
as this may cause on-disk inconsistencies.
If there are multiple transactions affecting the same container, a
container may be marked read-only by one transaction while still being
written to by another, leading to a DCHECK failure.
This patch changes this by storing a Status to indicate read-only
instead of an AtomicBool. Functions that previously DCHECKed the
container was not read-only now return early with the stored error and
appropriate error-handling. Additionally, a couple of checks for
read-only have been moved out of the blocks and into to the containers
themselves.
A test is added that shares a single container among multiple
transactions, injects failure in one, and ensures the failure is visible
across the other transactions.
Change-Id: I06412a36ed62fe1e3a2307a4a2e7a0a7e3d65702
Reviewed-on: http://gerrit.cloudera.org:8080/8338
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Hao Hao <ha...@cloudera.com>
Tested-by: Adar Dembo <ad...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/ac18aca7
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/ac18aca7
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/ac18aca7
Branch: refs/heads/master
Commit: ac18aca755daae6b9f2826b71f4a458e2b0fefa1
Parents: 4172983
Author: Andrew Wong <aw...@cloudera.com>
Authored: Wed Oct 18 10:58:06 2017 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Sat Oct 21 01:10:02 2017 +0000
----------------------------------------------------------------------
src/kudu/fs/log_block_manager-test.cc | 56 ++++++++++++++++++++
src/kudu/fs/log_block_manager.cc | 85 ++++++++++++++++--------------
src/kudu/fs/log_block_manager.h | 26 +++++++--
3 files changed, 123 insertions(+), 44 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index 9a23c0c..929cb02 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -834,6 +834,62 @@ TEST_F(LogBlockManagerTest, StartupBenchmark) {
}
#endif
+TEST_F(LogBlockManagerTest, TestFailMultipleTransactionsPerContainer) {
+ // Create multiple transactions that will share a container.
+ const int kNumTransactions = 3;
+ vector<unique_ptr<BlockCreationTransaction>> block_transactions;
+ for (int i = 0; i < kNumTransactions; i++) {
+ block_transactions.emplace_back(bm_->NewCreationTransaction());
+ }
+
+ // Repeatedly add new blocks for the transactions. Finalizing each block
+ // makes the block's container available, allowing the same container to be
+ // reused by the next block.
+ const int kNumBlocks = 10;
+ for (int i = 0; i < kNumBlocks; i++) {
+ unique_ptr<WritableBlock> block;
+ ASSERT_OK_FAST(bm_->CreateBlock(test_block_opts_, &block));
+ ASSERT_OK_FAST(block->Append("x"));
+ ASSERT_OK_FAST(block->Finalize());
+ block_transactions[i % kNumTransactions]->AddCreatedBlock(std::move(block));
+ }
+ ASSERT_EQ(1, bm_->all_containers_by_name_.size());
+
+ // Briefly inject an error while committing one of the transactions. This
+ // should make the container read-only, preventing the remaining transactions
+ // from proceeding.
+ FLAGS_crash_on_eio = false;
+ FLAGS_env_inject_eio = 1.0;
+ Status s = block_transactions[0]->CommitCreatedBlocks();
+ ASSERT_TRUE(s.IsIOError());
+ FLAGS_env_inject_eio = 0;
+
+ // Now try to add some more blocks.
+ for (int i = 0; i < kNumTransactions; i++) {
+ unique_ptr<WritableBlock> block;
+ ASSERT_OK_FAST(bm_->CreateBlock(test_block_opts_, &block));
+
+ // The first write will fail, as the container has been marked read-only.
+ // This will leave the container unavailable and force the creation of a
+ // new container.
+ Status s = block->Append("x");
+ if (i == 0) {
+ ASSERT_TRUE(s.IsIOError());
+ } else {
+ ASSERT_OK_FAST(s);
+ ASSERT_OK_FAST(block->Finalize());
+ }
+ block_transactions[i]->AddCreatedBlock(std::move(block));
+ }
+ ASSERT_EQ(2, bm_->all_containers_by_name_.size());
+
+ // At this point, all of the transactions have blocks in read-only containers
+ // and, thus, will be unable to commit.
+ for (const auto& block_transaction : block_transactions) {
+ ASSERT_TRUE(block_transaction->CommitCreatedBlocks().IsIOError());
+ }
+}
+
TEST_F(LogBlockManagerTest, TestLookupBlockLimit) {
int64_t limit_1024 = LogBlockManager::LookupBlockLimit(1024);
int64_t limit_2048 = LogBlockManager::LookupBlockLimit(2048);
http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index da9b49b..9bfe88d 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -58,7 +58,6 @@
#include "kudu/gutil/walltime.h"
#include "kudu/util/alignment.h"
#include "kudu/util/array_view.h"
-#include "kudu/util/atomic.h"
#include "kudu/util/env.h"
#include "kudu/util/file_cache.h"
#include "kudu/util/flag_tags.h"
@@ -461,21 +460,33 @@ class LogBlockContainer {
// truncates the container if full and marks the container as available.
void FinalizeBlock(int64_t block_offset, int64_t block_length);
- // Run a task on this container's data directory thread pool.
+ // Runs a task on this container's data directory thread pool.
//
// Normally the task is performed asynchronously. However, if submission to
// the pool fails, it runs synchronously on the current thread.
void ExecClosure(const Closure& task);
- // Mark the container to be read-only.
- void SetReadOnly();
-
// Produces a debug-friendly string representation of this container.
string ToString() const;
+ // Makes the container read-only and stores the responsible error.
+ void SetReadOnly(const Status& error);
+
// Handles errors if the input status is not OK.
void HandleError(const Status& s) const;
+ // Returns whether or not the container has been marked read-only.
+ bool read_only() const {
+ return !read_only_status().ok();
+ }
+
+ // Returns the error that caused the container to become read-only, or OK if
+ // the container has not been marked read-only.
+ Status read_only_status() const {
+ std::lock_guard<simple_spinlock> l(read_only_lock_);
+ return read_only_status_;
+ }
+
// Simple accessors.
LogBlockManager* block_manager() const { return block_manager_; }
int64_t next_block_offset() const { return next_block_offset_.Load(); }
@@ -491,7 +502,6 @@ class LogBlockContainer {
const LogBlockManagerMetrics* metrics() const { return metrics_; }
DataDir* data_dir() const { return data_dir_; }
const PathInstanceMetadataPB* instance() const { return data_dir_->instance()->metadata(); }
- bool read_only() const { return read_only_.Load(); }
private:
LogBlockContainer(LogBlockManager* block_manager, DataDir* data_dir,
@@ -570,7 +580,8 @@ class LogBlockContainer {
// If true, only read operations are allowed. Existing blocks may
// not be deleted until the next restart, and new blocks may not
// be added.
- AtomicBool read_only_;
+ mutable simple_spinlock read_only_lock_;
+ Status read_only_status_;
DISALLOW_COPY_AND_ASSIGN(LogBlockContainer);
};
@@ -592,8 +603,7 @@ LogBlockContainer::LogBlockContainer(
live_bytes_(0),
live_bytes_aligned_(0),
live_blocks_(0),
- metrics_(block_manager->metrics()),
- read_only_(false) {
+ metrics_(block_manager->metrics()) {
}
void LogBlockContainer::HandleError(const Status& s) const {
@@ -736,6 +746,8 @@ Status LogBlockContainer::Open(LogBlockManager* block_manager,
}
Status LogBlockContainer::TruncateDataToNextBlockOffset() {
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
+
if (full()) {
VLOG(2) << Substitute("Truncating container $0 to offset $1",
ToString(), next_block_offset());
@@ -882,7 +894,6 @@ Status LogBlockContainer::ProcessRecord(
Status LogBlockContainer::DoCloseBlocks(const vector<LogWritableBlock*>& blocks,
SyncMode mode) {
- DCHECK(!read_only());
auto sync_blocks = [&]() -> Status {
if (mode == SYNC) {
VLOG(3) << "Syncing data file " << data_file_->filename();
@@ -912,16 +923,16 @@ Status LogBlockContainer::DoCloseBlocks(const vector<LogWritableBlock*>& blocks,
Status s = sync_blocks();
if (!s.ok()) {
- // Mark container as read-only to forbid further writes in case of
- // failure. Because the on-disk state may contain partial/complete
- // data/metadata at this point, it is not safe to either overwrite
- // it or append to it.
- SetReadOnly();
+ // Make container read-only to forbid further writes in case of failure.
+ // Because the on-disk state may contain partial/incomplete data/metadata at
+ // this point, it is not safe to either overwrite it or append to it.
+ SetReadOnly(s);
}
return s;
}
Status LogBlockContainer::PunchHole(int64_t offset, int64_t length) {
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
DCHECK_GE(offset, 0);
DCHECK_GE(length, 0);
@@ -939,7 +950,7 @@ Status LogBlockContainer::WriteData(int64_t offset, const Slice& data) {
}
Status LogBlockContainer::WriteVData(int64_t offset, ArrayView<const Slice> data) {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
DCHECK_GE(offset, next_block_offset());
RETURN_NOT_OK_HANDLE_ERROR(data_file_->WriteV(offset, data));
@@ -969,7 +980,7 @@ Status LogBlockContainer::ReadVData(int64_t offset, ArrayView<Slice> results) co
}
Status LogBlockContainer::AppendMetadata(const BlockRecordPB& pb) {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
// Note: We don't check for sufficient disk space for metadata writes in
// order to allow for block deletion on full disks.
RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Append(pb));
@@ -977,7 +988,7 @@ Status LogBlockContainer::AppendMetadata(const BlockRecordPB& pb) {
}
Status LogBlockContainer::FlushData(int64_t offset, int64_t length) {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
DCHECK_GE(offset, 0);
DCHECK_GE(length, 0);
RETURN_NOT_OK_HANDLE_ERROR(data_file_->Flush(RWFile::FLUSH_ASYNC, offset, length));
@@ -985,13 +996,13 @@ Status LogBlockContainer::FlushData(int64_t offset, int64_t length) {
}
Status LogBlockContainer::FlushMetadata() {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Flush());
return Status::OK();
}
Status LogBlockContainer::SyncData() {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
if (FLAGS_enable_data_block_fsync) {
if (metrics_) metrics_->generic_metrics.total_disk_sync->Increment();
RETURN_NOT_OK_HANDLE_ERROR(data_file_->Sync());
@@ -1000,7 +1011,7 @@ Status LogBlockContainer::SyncData() {
}
Status LogBlockContainer::SyncMetadata() {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
if (FLAGS_enable_data_block_fsync) {
if (metrics_) metrics_->generic_metrics.total_disk_sync->Increment();
RETURN_NOT_OK_HANDLE_ERROR(metadata_file_->Sync());
@@ -1023,7 +1034,7 @@ Status LogBlockContainer::ReopenMetadataWriter() {
Status LogBlockContainer::EnsurePreallocated(int64_t block_start_offset,
size_t next_append_length) {
- DCHECK(!read_only());
+ RETURN_NOT_OK_HANDLE_ERROR(read_only_status());
DCHECK_GE(block_start_offset, 0);
if (!FLAGS_log_container_preallocate_bytes) {
@@ -1068,7 +1079,6 @@ void LogBlockContainer::FinalizeBlock(int64_t block_offset, int64_t block_length
}
void LogBlockContainer::UpdateNextBlockOffset(int64_t block_offset, int64_t block_length) {
- DCHECK(!read_only());
DCHECK_GE(block_offset, 0);
// The log block manager maintains block contiguity as an invariant, which
@@ -1094,7 +1104,6 @@ void LogBlockContainer::UpdateNextBlockOffset(int64_t block_offset, int64_t bloc
}
void LogBlockContainer::BlockCreated(const scoped_refptr<LogBlock>& block) {
- DCHECK(!read_only());
DCHECK_GE(block->offset(), 0);
total_bytes_.IncrementBy(block->fs_aligned_length());
@@ -1105,7 +1114,6 @@ void LogBlockContainer::BlockCreated(const scoped_refptr<LogBlock>& block) {
}
void LogBlockContainer::BlockDeleted(const scoped_refptr<LogBlock>& block) {
- DCHECK(!read_only());
DCHECK_GE(block->offset(), 0);
live_bytes_.IncrementBy(-block->length());
@@ -1124,8 +1132,12 @@ string LogBlockContainer::ToString() const {
return s;
}
-void LogBlockContainer::SetReadOnly() {
- read_only_.Store(true);
+void LogBlockContainer::SetReadOnly(const Status& error) {
+ DCHECK(!error.ok());
+ LOG(WARNING) << Substitute("Container $0 being marked read-only: $1",
+ ToString(), error.ToString());
+ std::lock_guard<simple_spinlock> l(read_only_lock_);
+ read_only_status_ = error;
}
///////////////////////////////////////////////////////////
@@ -1328,8 +1340,8 @@ Status LogWritableBlock::Abort() {
block_length_);
}
}
- return Status::Aborted(Substitute("container $0 is read-only",
- container_->ToString()));
+ return container_->read_only_status().CloneAndPrepend(
+ Substitute("container $0 is read-only", container_->ToString()));
}
// Close the block and then delete it. Theoretically, we could do nothing
@@ -1381,14 +1393,6 @@ Status LogWritableBlock::AppendV(ArrayView<const Slice> data) {
DCHECK(state_ == CLEAN || state_ == DIRTY)
<< "Invalid state: " << state_;
- // This could be true if Finalize() is successful but DoCloseBlocks() is not.
- // That means the container is read-only but marked as available. Another
- // block can actually try to write to it.
- if (container_->read_only()) {
- return Status::IllegalState(Substitute("container $0 is read-only",
- container_->ToString()));
- }
-
// Calculate the amount of data to write
size_t data_size = accumulate(data.begin(), data.end(), static_cast<size_t>(0),
[&](int sum, const Slice& curr) {
@@ -1835,10 +1839,8 @@ Status LogBlockManager::DeleteBlock(const BlockId& block_id) {
}
lb = it->second;
- if (lb->container()->read_only()) {
- return Status::IllegalState(Substitute("container $0 is read-only",
- lb->container()->ToString()));
- }
+ HANDLE_DISK_FAILURE(lb->container()->read_only_status(),
+ error_manager_->RunErrorNotificationCb(lb->container()->data_dir()));
// Return early if deleting a block in a failed directory.
set<int> failed_dirs = dd_manager_->GetFailedDataDirs();
@@ -1963,6 +1965,7 @@ void LogBlockManager::MakeContainerAvailableUnlocked(LogBlockContainer* containe
if (container->full() || container->read_only()) {
return;
}
+ VLOG(3) << Substitute("container $0 being made available", container->ToString());
available_containers_by_data_dir_[container->data_dir()].push_front(container);
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/ac18aca7/src/kudu/fs/log_block_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.h b/src/kudu/fs/log_block_manager.h
index 4b0f7b2..b37a62d 100644
--- a/src/kudu/fs/log_block_manager.h
+++ b/src/kudu/fs/log_block_manager.h
@@ -101,9 +101,26 @@ struct LogBlockManagerMetrics;
// deleted in a GC.
//
// Care is taken to keep the in-memory representation of the block manager
-// in synch with its persistent representation. To wit, a block is only
-// made available in memory if _all_ on-disk operations (including any
-// necessary synchronization calls) are successful.
+// in sync with its persistent representation. To wit, a block is only made
+// available in memory if _all_ on-disk operations (including any necessary
+// synchronization calls) are successful.
+//
+// Writes to containers are batched together through the use of block
+// transactions: each writer will take ownership of an "available" container,
+// write a block to the container, and release ownership of the container once
+// the writer "finalizes" the block, making the container available to other
+// writers. This can happen concurrently; multiple transactions can interleave
+// writes to single container, provided each writer finalizes its block before
+// the next writer reaches for a container. Once any of the writers is
+// completely done with its IO, it can commit its transaction, syncing its
+// blocks and the container to disk (potentially as others are writing!).
+//
+// In order to maintain on-disk consistency, if the above commit fails, the
+// entire container is marked read-only, and any future writes to the container
+// will fail. There is a tradeoff here to note--having concurrent writers
+// grants better utilization for each container; however a failure to sync by
+// any of the writers will cause the others to fail and potentially corrupt the
+// underlying container.
//
// When a new block is created, a container is selected from the data
// directory group appropriate for the block, as indicated by hints in
@@ -156,6 +173,8 @@ struct LogBlockManagerMetrics;
// similarly recoverable errors).
// - Evaluate and implement a solution for data integrity (e.g. per-block
// checksum).
+// - Change the availability semantics to only mark a container as available if
+// the current writer has committed and synced its transaction.
// The log-backed block manager.
class LogBlockManager : public BlockManager {
@@ -197,6 +216,7 @@ class LogBlockManager : public BlockManager {
FRIEND_TEST(LogBlockManagerTest, TestMetadataTruncation);
FRIEND_TEST(LogBlockManagerTest, TestParseKernelRelease);
FRIEND_TEST(LogBlockManagerTest, TestReuseBlockIds);
+ FRIEND_TEST(LogBlockManagerTest, TestFailMultipleTransactionsPerContainer);
friend class internal::LogBlockContainer;
friend class internal::LogBlockDeletionTransaction;