You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2017/04/28 23:53:38 UTC

kudu git commit: KUDU-1978: avoid corruption when deleting misaligned blocks

Repository: kudu
Updated Branches:
  refs/heads/master 401baedf1 -> 63c68add1


KUDU-1978: avoid corruption when deleting misaligned blocks

The gist of the problem: when punching out blocks, we don't actually know
where they end. The existing code assumes that proper alignment is always
maintained, but when faced with a repeated sequence of blocks misaligned on
both start and end offsets, this may not be true. It is conceivable (though
not certain) that KUDU-1793 can cause such sequences.

To fix, we could maintain per-block end offsets, but that'd be expensive for
such a rare case (remember: we have millions of blocks). Instead, we'll look
for blocks misaligned on their start offsets and skip the align up step when
punching them out. This means we won't reclaim all of their space, but it's
better than corrupting data belonging to the next block.

A black box reproduction of KUDU-1793 is virtually impossible, so here's a
white box implementation instead, via augmentation of the LBMCorruptor's
misaligned block generator. Without the fix (though with the removal of the
DCHECK_EQ), the new test fails 100% of the time. With the fix, it hasn't
failed in 1000 runs.

Change-Id: I6ed6e349ab10d8d04722cd7ef99e7a06554f51f1
Reviewed-on: http://gerrit.cloudera.org:8080/6715
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <da...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/63c68add
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/63c68add
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/63c68add

Branch: refs/heads/master
Commit: 63c68add1145a64e3a546eccf7cc98c16bba638a
Parents: 401baed
Author: Adar Dembo <ad...@cloudera.com>
Authored: Thu Apr 20 19:12:38 2017 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Fri Apr 28 23:53:24 2017 +0000

----------------------------------------------------------------------
 src/kudu/fs/log_block_manager-test-util.cc | 73 ++++++++++++-------
 src/kudu/fs/log_block_manager-test-util.h  |  5 +-
 src/kudu/fs/log_block_manager-test.cc      | 97 +++++++++++++++++++++++++
 src/kudu/fs/log_block_manager.cc           | 47 ++++++++----
 4 files changed, 180 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/63c68add/src/kudu/fs/log_block_manager-test-util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test-util.cc b/src/kudu/fs/log_block_manager-test-util.cc
index 253778c..6359574 100644
--- a/src/kudu/fs/log_block_manager-test-util.cc
+++ b/src/kudu/fs/log_block_manager-test-util.cc
@@ -249,46 +249,65 @@ Status LBMCorruptor::AddMalformedRecordToContainer() {
 }
 
 Status LBMCorruptor::AddMisalignedBlockToContainer() {
-  const int kBlockSize = 16 * 1024;
   const Container* c;
   RETURN_NOT_OK(GetRandomContainer(ANY, &c));
 
-  // Ensure the container's data file has enough space for the new block. We're
-  // not going to fill that space, but this ensures that the block's record
-  // isn't considered malformed.
-  uint64_t block_offset;
-  {
-    unique_ptr<RWFile> data_file;
-    RWFileOptions opts;
-    opts.mode = Env::OPEN_EXISTING;
-    RETURN_NOT_OK(env_->NewRWFile(opts, c->data_filename, &data_file));
-    uint64_t fs_block_size;
-    RETURN_NOT_OK(env_->GetBlockSize(c->data_filename, &fs_block_size));
-    uint64_t initial_data_size;
-    RETURN_NOT_OK(data_file->Size(&initial_data_size));
+  uint64_t fs_block_size;
+  RETURN_NOT_OK(env_->GetBlockSize(c->data_filename, &fs_block_size));
 
-    // Ensure the offset of the new block isn't aligned with the filesystem
-    // block size.
-    block_offset = initial_data_size;
-    if (block_offset % fs_block_size == 0) {
-      block_offset += 1;
-    }
+  unique_ptr<RWFile> data_file;
+  RWFileOptions opts;
+  opts.mode = Env::OPEN_EXISTING;
+  RETURN_NOT_OK(env_->NewRWFile(opts, c->data_filename, &data_file));
+  uint64_t initial_data_size;
+  RETURN_NOT_OK(data_file->Size(&initial_data_size));
 
-    RETURN_NOT_OK(data_file->PreAllocate(
-        initial_data_size, (block_offset - initial_data_size) + kBlockSize,
-        RWFile::CHANGE_FILE_SIZE));
-    RETURN_NOT_OK(data_file->Close());
+  // Pick a random offset beyond the end of the file to place the new block,
+  // ensuring that the offset isn't aligned with the filesystem block size.
+  //
+  // In accordance with KUDU-1793 (which sparked the entire concept of
+  // misaligned blocks in the first place), misaligned blocks may not intrude
+  // on the aligned space of the blocks that came before them. To avoid having
+  // to read the container's records just to corrupt it, we'll arbitrarily add
+  // a fs_block_size gap before this misaligned block, to ensure that it
+  // doesn't violate the previous block's alignment.
+  uint64_t block_offset =
+      initial_data_size + fs_block_size + rand_.Uniform(fs_block_size);
+  if (block_offset % fs_block_size == 0) {
+    block_offset++;
   }
 
-  unique_ptr<WritablePBContainerFile> metadata_writer;
-  RETURN_NOT_OK(OpenMetadataWriter(*c, &metadata_writer));
+  // Ensure the file is preallocated at least up to the offset, in case we
+  // decide to write a zero-length block to the end of it.
+  uint64_t length_beyond_eof = block_offset - initial_data_size;
+  if (length_beyond_eof > 0) {
+    RETURN_NOT_OK(data_file->PreAllocate(initial_data_size, length_beyond_eof,
+                                         RWFile::CHANGE_FILE_SIZE));
+  }
 
+  // Populate the block with repeated sequences of its id so that readers who
+  // wish to verify its contents can do so easily. To avoid a truncated
+  // sequence at the end of the block, we also ensure that the block's length
+  // is a multiple of the id's type.
   BlockId block_id(rand_.Next64());
+  uint64_t raw_block_id = block_id.id();
+  uint64_t block_length = rand_.Uniform(fs_block_size * 4);
+  block_length -= block_length % sizeof(raw_block_id);
+  uint8_t data[block_length];
+  for (int i = 0; i < ARRAYSIZE(data); i += sizeof(raw_block_id)) {
+    memcpy(&data[i], &raw_block_id, sizeof(raw_block_id));
+  }
+  RETURN_NOT_OK(data_file->Write(block_offset, Slice(data, ARRAYSIZE(data))));
+  RETURN_NOT_OK(data_file->Close());
+
+  // Having written out the block, write a corresponding metadata record.
+  unique_ptr<WritablePBContainerFile> metadata_writer;
+  RETURN_NOT_OK(OpenMetadataWriter(*c, &metadata_writer));
   BlockRecordPB record;
   block_id.CopyToPB(record.mutable_block_id());
   record.set_op_type(CREATE);
   record.set_offset(block_offset);
-  record.set_length(kBlockSize);
+  record.set_length(block_length);
   record.set_timestamp_us(0);
 
   RETURN_NOT_OK(metadata_writer->Append(record));

http://git-wip-us.apache.org/repos/asf/kudu/blob/63c68add/src/kudu/fs/log_block_manager-test-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test-util.h b/src/kudu/fs/log_block_manager-test-util.h
index 20273a0..a4076aa 100644
--- a/src/kudu/fs/log_block_manager-test-util.h
+++ b/src/kudu/fs/log_block_manager-test-util.h
@@ -62,8 +62,9 @@ class LBMCorruptor {
   // Returns an error if a container could not be found.
   Status AddMalformedRecordToContainer();
 
-  // Adds a misaligned block to a container (chosen at random). This
-  // inconsistency is non-fatal and irreparable.
+  // Adds a misaligned block to a container (chosen at random). The block
+  // contains repeated 8-byte sequences of its block id. This inconsistency is
+  // non-fatal and irreparable.
   //
   // Returns an error if a container could not be found.
   Status AddMisalignedBlockToContainer();

http://git-wip-us.apache.org/repos/asf/kudu/blob/63c68add/src/kudu/fs/log_block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager-test.cc b/src/kudu/fs/log_block_manager-test.cc
index 953c766..0f01fc5 100644
--- a/src/kudu/fs/log_block_manager-test.cc
+++ b/src/kudu/fs/log_block_manager-test.cc
@@ -760,6 +760,103 @@ TEST_F(LogBlockManagerTest, TestContainerBlockLimiting) {
   NO_FATALS(AssertNumContainers(4));
 }
 
+TEST_F(LogBlockManagerTest, TestMisalignedBlocksFuzz) {
+  FLAGS_log_container_preallocate_bytes = 0;
+  const int kNumBlocks = 100;
+
+  // Create one container.
+  unique_ptr<WritableBlock> block;
+  ASSERT_OK(bm_->CreateBlock(&block));
+  ASSERT_OK(block->Close());
+  string container_name;
+  NO_FATALS(GetOnlyContainer(&container_name));
+
+  // Add a mixture of regular and misaligned blocks to it.
+  LBMCorruptor corruptor(env_, { test_dir_ }, SeedRandom());
+  ASSERT_OK(corruptor.Init());
+  int num_misaligned_blocks = 0;
+  for (int i = 0; i < kNumBlocks; i++) {
+    if (rand() % 2) {
+      ASSERT_OK(corruptor.AddMisalignedBlockToContainer());
+
+      // Need to reopen the block manager after each corruption because the
+      // container metadata writers do not expect the metadata files to have
+      // been changed underneath them.
+      FsReport report;
+      ASSERT_OK(ReopenBlockManager(&report));
+      ASSERT_FALSE(report.HasFatalErrors());
+      num_misaligned_blocks++;
+    } else {
+      unique_ptr<WritableBlock> block;
+      ASSERT_OK(bm_->CreateBlock(&block));
+
+      // Append at least once to ensure that the data file grows.
+      //
+      // The LBM considers the last record of a container to be malformed if
+      // it's zero-length and if the file hasn't grown enough to catch up it.
+      // This combination (zero-length block at the end of a full container
+      // without any remaining preallocated space) is nearly impossible in real
+      // life, so we avoid it in testing too.
+      int num_appends = (rand() % 8) + 1;
+      uint64_t raw_block_id = block->id().id();
+      Slice s(reinterpret_cast<const uint8_t*>(&raw_block_id),
+              sizeof(raw_block_id));
+      for (int j = 0; j < num_appends; j++) {
+        // The corruptor writes the block ID repeatedly into each misaligned
+        // block, so we'll make our regular blocks do the same thing.
+        ASSERT_OK(block->Append(s));
+      }
+      ASSERT_OK(block->Close());
+    }
+  }
+  FsReport report;
+  ASSERT_OK(ReopenBlockManager(&report));
+  ASSERT_FALSE(report.HasFatalErrors()) << report.ToString();
+  ASSERT_EQ(num_misaligned_blocks, report.misaligned_block_check->entries.size());
+  for (const auto& mb : report.misaligned_block_check->entries) {
+    ASSERT_EQ(container_name, mb.container);
+  }
+
+  // Delete about half of them, chosen randomly.
+  vector<BlockId> block_ids;
+  ASSERT_OK(bm_->GetAllBlockIds(&block_ids));
+  for (const auto& id : block_ids) {
+    if (rand() % 2) {
+      ASSERT_OK(bm_->DeleteBlock(id));
+    }
+  }
+
+  // Wait for the block manager to punch out all of the holes. It's easiest to
+  // do this by reopening it; shutdown will wait for outstanding hole punches.
+  //
+  // On reopen, some misaligned blocks should be gone from the report.
+  ASSERT_OK(ReopenBlockManager(&report));
+  ASSERT_FALSE(report.HasFatalErrors());
+  ASSERT_GT(report.misaligned_block_check->entries.size(), 0);
+  ASSERT_LT(report.misaligned_block_check->entries.size(), num_misaligned_blocks);
+  for (const auto& mb : report.misaligned_block_check->entries) {
+    ASSERT_EQ(container_name, mb.container);
+  }
+
+  // Read and verify the contents of each remaining block.
+  ASSERT_OK(bm_->GetAllBlockIds(&block_ids));
+  for (const auto& id : block_ids) {
+    uint64_t raw_block_id = id.id();
+    unique_ptr<ReadableBlock> b;
+    ASSERT_OK(bm_->OpenBlock(id, &b));
+    uint64_t size;
+    ASSERT_OK(b->Size(&size));
+    ASSERT_EQ(0, size % sizeof(raw_block_id));
+    uint8_t buf[size];
+    Slice s;
+    ASSERT_OK(b->Read(0, size, &s, buf));
+    for (int i = 0; i < size; i += sizeof(raw_block_id)) {
+      ASSERT_EQ(raw_block_id, *reinterpret_cast<uint64_t*>(buf + i));
+    }
+    ASSERT_OK(b->Close());
+  }
+}
+
 TEST_F(LogBlockManagerTest, TestRepairPreallocateExcessSpace) {
   FLAGS_log_container_preallocate_bytes = 0;
   FLAGS_log_container_max_size = 1;

http://git-wip-us.apache.org/repos/asf/kudu/blob/63c68add/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 24ffd76..cda36d4 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -210,7 +210,7 @@ class LogBlockContainer {
   // is required to record the deletion in container metadata.
   //
   // The on-disk effects of this call are made durable only after SyncData().
-  Status DeleteBlock(int64_t offset, int64_t length);
+  Status PunchHole(int64_t offset, int64_t length);
 
   // Preallocate enough space to ensure that an append of 'next_append_length'
   // can be satisfied by this container. The offset of the beginning of this
@@ -299,6 +299,9 @@ class LogBlockContainer {
   // Produces a debug-friendly string representation of this container.
   string ToString() const;
 
+  // Returns a block's length aligned to the nearest filesystem block size;
+  int64_t GetAlignedBlockLength(int64_t block_offset, int64_t block_length) const;
+
   // Simple accessors.
   LogBlockManager* block_manager() const { return block_manager_; }
   int64_t total_bytes_written() const { return total_bytes_written_; }
@@ -656,22 +659,15 @@ Status LogBlockContainer::FinishBlock(const Status& s, WritableBlock* block) {
   return Status::OK();
 }
 
-Status LogBlockContainer::DeleteBlock(int64_t offset, int64_t length) {
+Status LogBlockContainer::PunchHole(int64_t offset, int64_t length) {
   DCHECK_GE(offset, 0);
   DCHECK_GE(length, 0);
 
-  // Guaranteed by UpdateBytesWritten().
-  DCHECK_EQ(0, offset % instance()->filesystem_block_size_bytes());
-
   // It is invalid to punch a zero-size hole.
   if (length) {
-    // Round up to the nearest filesystem block so that the kernel will
-    // actually reclaim disk space.
-    //
     // It's OK if we exceed the file's total size; the kernel will truncate
     // our request.
-    return data_file_->PunchHole(offset, KUDU_ALIGN_UP(
-        length, instance()->filesystem_block_size_bytes()));
+    return data_file_->PunchHole(offset, length);
   }
   return Status::OK();
 }
@@ -801,6 +797,27 @@ string LogBlockContainer::ToString() const {
   return s;
 }
 
+
+int64_t LogBlockContainer::GetAlignedBlockLength(int64_t block_offset,
+                                                 int64_t block_length) const {
+  uint64_t fs_block_size =
+      data_dir_->instance()->metadata()->filesystem_block_size_bytes();
+
+  // Nearly all blocks are placed on a filesystem block boundary, which means
+  // their length post-alignment is simply their length aligned up to the
+  // nearest fs block size.
+  //
+  // However, due to KUDU-1793, some blocks may start or end at misaligned
+  // offsets. We don't maintain enough state to precisely pinpoint such a
+  // block's (aligned) end offset in this case, so we'll just undercount it.
+  // This should be safe, although it may mean unreclaimed disk space (i.e.
+  // when GetAlignedBlockLength() is used in hole punching).
+  if (PREDICT_TRUE(block_offset % fs_block_size == 0)) {
+    return KUDU_ALIGN_UP(block_length, fs_block_size);
+  }
+  return block_length;
+}
+
 ////////////////////////////////////////////////////////////
 // LogBlock
 ////////////////////////////////////////////////////////////
@@ -866,11 +883,15 @@ static void DeleteBlockAsync(LogBlockContainer* container,
                              BlockId block_id,
                              int64_t offset,
                              int64_t length) {
+  // Use the block's aligned length so that the filesystem can reclaim maximal
+  // disk space.
+  //
   // We don't call SyncData() to synchronize the deletion because it's
   // expensive, and in the worst case, we'll just leave orphaned data
   // behind to be cleaned up in the next GC.
   VLOG(3) << "Freeing space belonging to block " << block_id;
-  WARN_NOT_OK(container->DeleteBlock(offset, length),
+  WARN_NOT_OK(container->PunchHole(
+      offset, container->GetAlignedBlockLength(offset, length)),
               Substitute("Could not delete block $0", block_id.ToString()));
 }
 
@@ -1840,8 +1861,8 @@ void LogBlockManager::ProcessBlockRecord(const BlockRecordPB& record,
       VLOG(2) << Substitute("Found DELETE block $0", block_id.ToString());
       report->stats.live_block_count--;
       report->stats.live_block_bytes -= lb->length();
-      report->stats.live_block_bytes_aligned -= KUDU_ALIGN_UP(
-          lb->length(), container->instance()->filesystem_block_size_bytes());
+      report->stats.live_block_bytes_aligned -=
+          container->GetAlignedBlockLength(lb->offset(), lb->length());
       break;
     default:
       // TODO(adar): treat as a different kind of inconsistency?