You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2021/03/10 16:45:02 UTC

[impala] 02/02: IMPALA-10555: Fix Hit DCHECK in TmpFileGroup::RecoverWriteError

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit b2088b30d7a4bc46878ca4815a89561551c05b15
Author: Yida Wu <wy...@gmail.com>
AuthorDate: Sun Feb 28 08:53:39 2021 -0800

    IMPALA-10555: Fix Hit DCHECK in TmpFileGroup::RecoverWriteError
    
    The DCHECK error happens when there is an IO error during the spilling
    process if the scratch directory is in a remote filesystem and doing
    an error recovery(rewrite). Because currently the DCHECK only consider
    the file number of local scratch files, it leads to a file number
    requirement mismatch in the DCHECK.
    Because the implementation of spilling to the local fs and the remote fs
    are quite different, for simplify, we don't recover write error
    for spilling to a remote fs in the current version. Instead, the errors
    generated during spilling to remote would be returned directly to the
    upper layer. So, we avoid the DCHECK logic for spilling to remote.
    
    Tests:
    * Added a unit test: TmpFileMgrTest::TestRemoteRemoveBuffer.
    * Ran Unit Tests:
    $IMPALA_HOME/be/build/latest/runtime/tmp-file-mgr-test
    
    Change-Id: Ifd9aea4bf2fff634ea9a30bf6e87987be4e1c611
    Reviewed-on: http://gerrit.cloudera.org:8080/17140
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/tmp-file-mgr-test.cc | 33 +++++++++++++++++++++++++++++++++
 be/src/runtime/tmp-file-mgr.cc      |  5 +++++
 2 files changed, 38 insertions(+)

diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index e598b91..c191d6d 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -143,6 +143,10 @@ class TmpFileMgrTest : public ::testing::Test {
     }
   }
 
+  void RemoveDirs(const vector<string>& dirs) {
+    ASSERT_OK(FileSystemUtil::RemovePaths(dirs));
+  }
+
   /// Helper to call the private CreateFiles() method and return
   /// the created files.
   static Status CreateFiles(
@@ -1785,4 +1789,33 @@ TEST_F(TmpFileMgrTest, TestSpillingWithRemoteDefaultFS) {
   test_env_->TearDownQueries();
 }
 
+/// Test writing a single record to a remote dir with local buffer, create an IO error
+/// to trigger writing failed.
+TEST_F(TmpFileMgrTest, TestRemoteRemoveBuffer) {
+  vector<string> tmp_dirs({LOCAL_BUFFER_PATH});
+  TmpFileMgr tmp_file_mgr;
+  RemoveAndCreateDirs(tmp_dirs);
+  tmp_dirs.push_back(REMOTE_URL);
+
+  ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, true, "", false, metrics_.get()));
+  TUniqueId id;
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  string data = "arbitrary data";
+  MemRange data_mem_range(reinterpret_cast<uint8_t*>(&data[0]), data.size());
+
+  unique_ptr<TmpWriteHandle> handle;
+  WriteRange::WriteDoneCallback callback = [this](const Status& status) {
+    EXPECT_FALSE(status.ok());
+    EXPECT_TRUE(status.IsDiskIoError());
+    SignalCallback(status);
+  };
+  // Remove the buffer before the write to create an IO Error.
+  vector<string> buffer_dirs({LOCAL_BUFFER_PATH});
+  RemoveDirs(buffer_dirs);
+  ASSERT_OK(file_group.Write(data_mem_range, callback, &handle));
+  WaitForWrite(handle.get());
+  WaitForCallbacks(1);
+  file_group.Close();
+  test_env_->TearDownQueries();
+}
 } // namespace impala
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index afc2436..e0a65a6 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -1379,6 +1379,11 @@ Status TmpFileGroup::RecoverWriteError(
     return write_status;
   }
 
+  // We don't recover the errors generated during spilling to a remote file.
+  if (handle->file_->disk_type() != io::DiskFileType::LOCAL) {
+    return write_status;
+  }
+
   // Save and report the error before retrying so that the failure isn't silent.
   {
     lock_guard<SpinLock> lock(lock_);