You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/04/18 06:42:36 UTC

[4/4] incubator-impala git commit: IMPALA-5124: add tests for scratch read errors

IMPALA-5124: add tests for scratch read errors

Adds tests for read errors from permissions (i.e. open() fails),
corrupt data (integrity check fails) and truncated files (read() fails).

Fixes a couple of bugs:
* Truncated reads were not detected in TmpFilemgr
* IoMgr buffers weren't returned on error paths (this isn't a true leak
  but results in DCHECKs being hit).

Change-Id: I3f2b93588dd47f70a4863ecad3b5556c3634ccb4
Reviewed-on: http://gerrit.cloudera.org:8080/6562
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/42002b91
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/42002b91
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/42002b91

Branch: refs/heads/master
Commit: 42002b91cb2db42be984b6ed95bc08c6dbd70c74
Parents: cb900df
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Apr 5 14:31:33 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Apr 18 06:34:47 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/bufferpool/buffer-pool-test.cc | 92 +++++++++++++++++++++-
 be/src/runtime/tmp-file-mgr.cc                | 27 +++++--
 common/thrift/generate_error_codes.py         |  3 +
 3 files changed, 112 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/be/src/runtime/bufferpool/buffer-pool-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc
index a40feac..dae7bcd 100644
--- a/be/src/runtime/bufferpool/buffer-pool-test.cc
+++ b/be/src/runtime/bufferpool/buffer-pool-test.cc
@@ -296,7 +296,7 @@ class BufferPoolTest : public ::testing::Test {
     directory_iterator dir_it(SCRATCH_DIR);
     for (; dir_it != directory_iterator(); ++dir_it) {
       ++num_files;
-      chmod(dir_it->path().c_str(), 0);
+      EXPECT_EQ(0, chmod(dir_it->path().c_str(), 0));
     }
     return num_files;
   }
@@ -309,6 +309,25 @@ class BufferPoolTest : public ::testing::Test {
     LOG(INFO) << "Injected fault by removing file permissions " << path;
   }
 
+  /// Write out a bunch of nonsense to replace the file's current data.
+  static void CorruptBackingFile(const string& path) {
+    EXPECT_GT(path.size(), 0);
+    FILE* file = fopen(path.c_str(), "rb+");
+    EXPECT_EQ(0, fseek(file, 0, SEEK_END));
+    int64_t size = ftell(file);
+    EXPECT_EQ(0, fseek(file, 0, SEEK_SET));
+    for (int64_t i = 0; i < size; ++i) fputc(123, file);
+    fclose(file);
+    LOG(INFO) << "Injected fault by corrupting file " << path;
+  }
+
+  /// Truncate the file to 0 bytes.
+  static void TruncateBackingFile(const string& path) {
+    EXPECT_GT(path.size(), 0);
+    EXPECT_EQ(0, truncate(path.c_str(), 0));
+    LOG(INFO) << "Injected fault by truncating file " << path;
+  }
+
   // Return the path of the temporary file backing the page.
   static string TmpFilePath(PageHandle* page) {
     return page->page_->write_handle->TmpFilePath();
@@ -1194,7 +1213,7 @@ void BufferPoolTest::TestWriteError(int write_delay_ms) {
   WaitForAllWrites(&client);
   // Repin the pages
   PinAll(&pool, &client, &pages);
-  // Remove the backing storage so that future writes will fail
+  // Remove permissions to the backing storage so that future writes will fail
   ASSERT_GT(RemoveScratchPerms(), 0);
   // Give the first write a chance to fail before the second write starts.
   const int INTERVAL_MS = 10;
@@ -1247,7 +1266,7 @@ TEST_F(BufferPoolTest, TmpFileAllocateError) {
   // Unpin a page, which will trigger a write.
   pool.Unpin(&client, &pages[0]);
   WaitForAllWrites(&client);
-  // Remove temporary files - subsequent operations will fail.
+  // Remove permissions to the temporary files - subsequent operations will fail.
   ASSERT_GT(RemoveScratchPerms(), 0);
   // The write error will happen asynchronously.
   pool.Unpin(&client, &pages[1]);
@@ -1382,6 +1401,73 @@ TEST_F(BufferPoolTest, WriteErrorBlacklist) {
   }
 }
 
+// Test error handling when on-disk data is corrupted and the read fails.
+TEST_F(BufferPoolTest, ScratchReadError) {
+  // Only allow one buffer in memory.
+  const int64_t TOTAL_MEM = TEST_BUFFER_LEN;
+  BufferPool pool(TEST_BUFFER_LEN, TOTAL_MEM);
+  global_reservations_.InitRootTracker(NewProfile(), TOTAL_MEM);
+
+  // Simulate different types of error.
+  enum ErrType {
+    CORRUPT_DATA, // Overwrite real spilled data with bogus data.
+    NO_PERMS, // Remove permissions on the scratch file.
+    TRUNCATE // Truncate the scratch file, destroying spilled data.
+  };
+  for (ErrType error_type : {CORRUPT_DATA, NO_PERMS, TRUNCATE}) {
+    ClientHandle client;
+    ASSERT_OK(pool.RegisterClient("test client", NewFileGroup(), &global_reservations_,
+        nullptr, TOTAL_MEM, NewProfile(), &client));
+    ASSERT_TRUE(client.IncreaseReservation(TOTAL_MEM));
+    PageHandle page;
+    ASSERT_OK(pool.CreatePage(&client, TEST_BUFFER_LEN, &page));
+    // Unpin a page, which will trigger a write.
+    pool.Unpin(&client, &page);
+    WaitForAllWrites(&client);
+
+    // Force eviction of the page.
+    ASSERT_OK(AllocateAndFree(&pool, &client, TEST_BUFFER_LEN));
+
+    string tmp_file = TmpFilePath(&page);
+    if (error_type == CORRUPT_DATA) {
+      CorruptBackingFile(tmp_file);
+    } else if (error_type == NO_PERMS) {
+      DisableBackingFile(tmp_file);
+    } else {
+      DCHECK_EQ(error_type, TRUNCATE);
+      TruncateBackingFile(tmp_file);
+    }
+    Status status = pool.Pin(&client, &page);
+    if (error_type == CORRUPT_DATA && !FLAGS_disk_spill_encryption) {
+      // Without encryption we can't detect that the data changed.
+      EXPECT_OK(status);
+    } else {
+      // Otherwise the read should fail.
+      EXPECT_FALSE(status.ok());
+    }
+    // Should be able to destroy the page, even though we hit an error.
+    pool.DestroyPage(&client, &page);
+
+    // If the backing file is still enabled, we should still be able to pin and unpin
+    // pages as normal.
+    ASSERT_OK(pool.CreatePage(&client, TEST_BUFFER_LEN, &page));
+    WriteData(page, 1);
+    pool.Unpin(&client, &page);
+    WaitForAllWrites(&client);
+    if (error_type == NO_PERMS) {
+      // The error prevents read/write of scratch files - this will fail.
+      EXPECT_FALSE(pool.Pin(&client, &page).ok());
+    } else {
+      // The error does not prevent read/write of scratch files.
+      ASSERT_OK(AllocateAndFree(&pool, &client, TEST_BUFFER_LEN));
+      ASSERT_OK(pool.Pin(&client, &page));
+      VerifyData(page, 1);
+    }
+    pool.DestroyPage(&client, &page);
+    pool.DeregisterClient(&client);
+  }
+}
+
 /// Test that the buffer pool fails cleanly when all scratch directories are inaccessible
 /// at runtime.
 TEST_F(BufferPoolTest, NoDirsAllocationError) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/be/src/runtime/tmp-file-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 4650e69..37a05cf 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -372,6 +372,7 @@ Status TmpFileMgr::FileGroup::Read(WriteHandle* handle, MemRange buffer) {
   DCHECK(handle->write_range_ != nullptr);
   DCHECK(!handle->is_cancelled_);
   DCHECK_EQ(buffer.len(), handle->len());
+  Status status;
 
   // Don't grab 'lock_' in this method - it is not necessary because we don't touch
   // any members that it protects and could block other threads for the duration of
@@ -384,23 +385,35 @@ Status TmpFileMgr::FileGroup::Read(WriteHandle* handle, MemRange buffer) {
   scan_range->Reset(nullptr, handle->write_range_->file(), handle->write_range_->len(),
       handle->write_range_->offset(), handle->write_range_->disk_id(), false,
       DiskIoMgr::BufferOpts::ReadInto(buffer.data(), buffer.len()));
-  DiskIoMgr::BufferDescriptor* io_mgr_buffer;
+  DiskIoMgr::BufferDescriptor* io_mgr_buffer = nullptr;
   {
     SCOPED_TIMER(disk_read_timer_);
     read_counter_->Add(1);
     bytes_read_counter_->Add(buffer.len());
-    RETURN_IF_ERROR(io_mgr_->Read(io_ctx_, scan_range, &io_mgr_buffer));
+    status = io_mgr_->Read(io_ctx_, scan_range, &io_mgr_buffer);
+    if (!status.ok()) goto exit;
   }
 
   if (FLAGS_disk_spill_encryption) {
-    RETURN_IF_ERROR(handle->CheckHashAndDecrypt(buffer));
+    status = handle->CheckHashAndDecrypt(buffer);
+    if (!status.ok()) goto exit;
   }
 
-  DCHECK_EQ(io_mgr_buffer->buffer(), buffer.data());
-  DCHECK_EQ(io_mgr_buffer->len(), buffer.len());
   DCHECK(io_mgr_buffer->eosr());
-  io_mgr_buffer->Return();
-  return Status::OK();
+  DCHECK_LE(io_mgr_buffer->len(), buffer.len());
+  if (io_mgr_buffer->len() < buffer.len()) {
+    // The read was truncated - this is an error.
+    status = Status(TErrorCode::SCRATCH_READ_TRUNCATED, buffer.len(),
+        handle->write_range_->file(), handle->write_range_->offset(),
+        io_mgr_buffer->len());
+    goto exit;
+  }
+  DCHECK_EQ(io_mgr_buffer->buffer(), buffer.data());
+
+exit:
+  // Always return the buffer before exiting to avoid leaking it.
+  if (io_mgr_buffer != nullptr) io_mgr_buffer->Return();
+  return status;
 }
 
 Status TmpFileMgr::FileGroup::RestoreData(

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 4c32768..39036c5 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -312,6 +312,9 @@ error_codes = (
   ("SCRATCH_ALLOCATION_FAILED", 101, "Could not create files in any configured scratch "
    "directories (--scratch_dirs). See logs for previous errors that may have prevented "
    "creating or writing scratch files."),
+
+  ("SCRATCH_READ_TRUNCATED", 102, "Error reading $0 bytes from scratch file '$1' at "
+   "offset $2: could only read $3 bytes"),
 )
 
 import sys