You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/04/18 06:42:36 UTC
[4/4] incubator-impala git commit: IMPALA-5124: add tests for scratch
read errors
IMPALA-5124: add tests for scratch read errors
Adds tests for read errors from permissions (i.e. open() fails),
corrupt data (integrity check fails) and truncated files (read() fails).
Fixes a couple of bugs:
* Truncated reads were not detected in TmpFilemgr
* IoMgr buffers weren't returned on error paths (this isn't a true leak
but results in DCHECKs being hit).
Change-Id: I3f2b93588dd47f70a4863ecad3b5556c3634ccb4
Reviewed-on: http://gerrit.cloudera.org:8080/6562
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/42002b91
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/42002b91
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/42002b91
Branch: refs/heads/master
Commit: 42002b91cb2db42be984b6ed95bc08c6dbd70c74
Parents: cb900df
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Apr 5 14:31:33 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Apr 18 06:34:47 2017 +0000
----------------------------------------------------------------------
be/src/runtime/bufferpool/buffer-pool-test.cc | 92 +++++++++++++++++++++-
be/src/runtime/tmp-file-mgr.cc | 27 +++++--
common/thrift/generate_error_codes.py | 3 +
3 files changed, 112 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/be/src/runtime/bufferpool/buffer-pool-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc
index a40feac..dae7bcd 100644
--- a/be/src/runtime/bufferpool/buffer-pool-test.cc
+++ b/be/src/runtime/bufferpool/buffer-pool-test.cc
@@ -296,7 +296,7 @@ class BufferPoolTest : public ::testing::Test {
directory_iterator dir_it(SCRATCH_DIR);
for (; dir_it != directory_iterator(); ++dir_it) {
++num_files;
- chmod(dir_it->path().c_str(), 0);
+ EXPECT_EQ(0, chmod(dir_it->path().c_str(), 0));
}
return num_files;
}
@@ -309,6 +309,25 @@ class BufferPoolTest : public ::testing::Test {
LOG(INFO) << "Injected fault by removing file permissions " << path;
}
+ /// Write out a bunch of nonsense to replace the file's current data.
+ static void CorruptBackingFile(const string& path) {
+ EXPECT_GT(path.size(), 0);
+ FILE* file = fopen(path.c_str(), "rb+");
+ EXPECT_EQ(0, fseek(file, 0, SEEK_END));
+ int64_t size = ftell(file);
+ EXPECT_EQ(0, fseek(file, 0, SEEK_SET));
+ for (int64_t i = 0; i < size; ++i) fputc(123, file);
+ fclose(file);
+ LOG(INFO) << "Injected fault by corrupting file " << path;
+ }
+
+ /// Truncate the file to 0 bytes.
+ static void TruncateBackingFile(const string& path) {
+ EXPECT_GT(path.size(), 0);
+ EXPECT_EQ(0, truncate(path.c_str(), 0));
+ LOG(INFO) << "Injected fault by truncating file " << path;
+ }
+
// Return the path of the temporary file backing the page.
static string TmpFilePath(PageHandle* page) {
return page->page_->write_handle->TmpFilePath();
@@ -1194,7 +1213,7 @@ void BufferPoolTest::TestWriteError(int write_delay_ms) {
WaitForAllWrites(&client);
// Repin the pages
PinAll(&pool, &client, &pages);
- // Remove the backing storage so that future writes will fail
+ // Remove permissions to the backing storage so that future writes will fail
ASSERT_GT(RemoveScratchPerms(), 0);
// Give the first write a chance to fail before the second write starts.
const int INTERVAL_MS = 10;
@@ -1247,7 +1266,7 @@ TEST_F(BufferPoolTest, TmpFileAllocateError) {
// Unpin a page, which will trigger a write.
pool.Unpin(&client, &pages[0]);
WaitForAllWrites(&client);
- // Remove temporary files - subsequent operations will fail.
+ // Remove permissions to the temporary files - subsequent operations will fail.
ASSERT_GT(RemoveScratchPerms(), 0);
// The write error will happen asynchronously.
pool.Unpin(&client, &pages[1]);
@@ -1382,6 +1401,73 @@ TEST_F(BufferPoolTest, WriteErrorBlacklist) {
}
}
+// Test error handling when on-disk data is corrupted and the read fails.
+TEST_F(BufferPoolTest, ScratchReadError) {
+ // Only allow one buffer in memory.
+ const int64_t TOTAL_MEM = TEST_BUFFER_LEN;
+ BufferPool pool(TEST_BUFFER_LEN, TOTAL_MEM);
+ global_reservations_.InitRootTracker(NewProfile(), TOTAL_MEM);
+
+ // Simulate different types of error.
+ enum ErrType {
+ CORRUPT_DATA, // Overwrite real spilled data with bogus data.
+ NO_PERMS, // Remove permissions on the scratch file.
+ TRUNCATE // Truncate the scratch file, destroying spilled data.
+ };
+ for (ErrType error_type : {CORRUPT_DATA, NO_PERMS, TRUNCATE}) {
+ ClientHandle client;
+ ASSERT_OK(pool.RegisterClient("test client", NewFileGroup(), &global_reservations_,
+ nullptr, TOTAL_MEM, NewProfile(), &client));
+ ASSERT_TRUE(client.IncreaseReservation(TOTAL_MEM));
+ PageHandle page;
+ ASSERT_OK(pool.CreatePage(&client, TEST_BUFFER_LEN, &page));
+ // Unpin a page, which will trigger a write.
+ pool.Unpin(&client, &page);
+ WaitForAllWrites(&client);
+
+ // Force eviction of the page.
+ ASSERT_OK(AllocateAndFree(&pool, &client, TEST_BUFFER_LEN));
+
+ string tmp_file = TmpFilePath(&page);
+ if (error_type == CORRUPT_DATA) {
+ CorruptBackingFile(tmp_file);
+ } else if (error_type == NO_PERMS) {
+ DisableBackingFile(tmp_file);
+ } else {
+ DCHECK_EQ(error_type, TRUNCATE);
+ TruncateBackingFile(tmp_file);
+ }
+ Status status = pool.Pin(&client, &page);
+ if (error_type == CORRUPT_DATA && !FLAGS_disk_spill_encryption) {
+ // Without encryption we can't detect that the data changed.
+ EXPECT_OK(status);
+ } else {
+ // Otherwise the read should fail.
+ EXPECT_FALSE(status.ok());
+ }
+ // Should be able to destroy the page, even though we hit an error.
+ pool.DestroyPage(&client, &page);
+
+ // If the backing file is still enabled, we should still be able to pin and unpin
+ // pages as normal.
+ ASSERT_OK(pool.CreatePage(&client, TEST_BUFFER_LEN, &page));
+ WriteData(page, 1);
+ pool.Unpin(&client, &page);
+ WaitForAllWrites(&client);
+ if (error_type == NO_PERMS) {
+ // The error prevents read/write of scratch files - this will fail.
+ EXPECT_FALSE(pool.Pin(&client, &page).ok());
+ } else {
+ // The error does not prevent read/write of scratch files.
+ ASSERT_OK(AllocateAndFree(&pool, &client, TEST_BUFFER_LEN));
+ ASSERT_OK(pool.Pin(&client, &page));
+ VerifyData(page, 1);
+ }
+ pool.DestroyPage(&client, &page);
+ pool.DeregisterClient(&client);
+ }
+}
+
/// Test that the buffer pool fails cleanly when all scratch directories are inaccessible
/// at runtime.
TEST_F(BufferPoolTest, NoDirsAllocationError) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/be/src/runtime/tmp-file-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 4650e69..37a05cf 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -372,6 +372,7 @@ Status TmpFileMgr::FileGroup::Read(WriteHandle* handle, MemRange buffer) {
DCHECK(handle->write_range_ != nullptr);
DCHECK(!handle->is_cancelled_);
DCHECK_EQ(buffer.len(), handle->len());
+ Status status;
// Don't grab 'lock_' in this method - it is not necessary because we don't touch
// any members that it protects and could block other threads for the duration of
@@ -384,23 +385,35 @@ Status TmpFileMgr::FileGroup::Read(WriteHandle* handle, MemRange buffer) {
scan_range->Reset(nullptr, handle->write_range_->file(), handle->write_range_->len(),
handle->write_range_->offset(), handle->write_range_->disk_id(), false,
DiskIoMgr::BufferOpts::ReadInto(buffer.data(), buffer.len()));
- DiskIoMgr::BufferDescriptor* io_mgr_buffer;
+ DiskIoMgr::BufferDescriptor* io_mgr_buffer = nullptr;
{
SCOPED_TIMER(disk_read_timer_);
read_counter_->Add(1);
bytes_read_counter_->Add(buffer.len());
- RETURN_IF_ERROR(io_mgr_->Read(io_ctx_, scan_range, &io_mgr_buffer));
+ status = io_mgr_->Read(io_ctx_, scan_range, &io_mgr_buffer);
+ if (!status.ok()) goto exit;
}
if (FLAGS_disk_spill_encryption) {
- RETURN_IF_ERROR(handle->CheckHashAndDecrypt(buffer));
+ status = handle->CheckHashAndDecrypt(buffer);
+ if (!status.ok()) goto exit;
}
- DCHECK_EQ(io_mgr_buffer->buffer(), buffer.data());
- DCHECK_EQ(io_mgr_buffer->len(), buffer.len());
DCHECK(io_mgr_buffer->eosr());
- io_mgr_buffer->Return();
- return Status::OK();
+ DCHECK_LE(io_mgr_buffer->len(), buffer.len());
+ if (io_mgr_buffer->len() < buffer.len()) {
+ // The read was truncated - this is an error.
+ status = Status(TErrorCode::SCRATCH_READ_TRUNCATED, buffer.len(),
+ handle->write_range_->file(), handle->write_range_->offset(),
+ io_mgr_buffer->len());
+ goto exit;
+ }
+ DCHECK_EQ(io_mgr_buffer->buffer(), buffer.data());
+
+exit:
+ // Always return the buffer before exiting to avoid leaking it.
+ if (io_mgr_buffer != nullptr) io_mgr_buffer->Return();
+ return status;
}
Status TmpFileMgr::FileGroup::RestoreData(
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/42002b91/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 4c32768..39036c5 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -312,6 +312,9 @@ error_codes = (
("SCRATCH_ALLOCATION_FAILED", 101, "Could not create files in any configured scratch "
"directories (--scratch_dirs). See logs for previous errors that may have prevented "
"creating or writing scratch files."),
+
+ ("SCRATCH_READ_TRUNCATED", 102, "Error reading $0 bytes from scratch file '$1' at "
+ "offset $2: could only read $3 bytes"),
)
import sys