You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/10/08 00:50:36 UTC

[impala] branch master updated: IMPALA-10945: Fix S3 scratch path behavior

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 45fd332  IMPALA-10945: Fix S3 scratch path behavior
45fd332 is described below

commit 45fd3320ad4f68ca86998dff0c9504aa896a278a
Author: Yida Wu <wy...@gmail.com>
AuthorDate: Sun Oct 3 08:30:46 2021 -0700

    IMPALA-10945: Fix S3 scratch path behavior
    
    IMPALA-10429 "Support Spill to HDFS" introduces a new behavior to
    S3 scratch path. It added a path verification for the S3 path,
    however, the HdfsFsCache::GetNameNodeFromPath in the verification
    forces the input to have at least a directory after the authority,
    like "s3a://authority/dir", otherwise it will return an error and
    lead to a failure on the TmpFileMgr initialization. Therefore, it
    changes the previous behavior which was able to support
    "s3a://authority", and may affect current users.
    
    This patch resumes the behavior of the s3 scratch path to allow
    "s3a://authority". The solution is to pass the path of the user's
    input combined with a scratch suffix "/impala-scratch" to the
    verification function, therefore, at least one directory is contained
    in the path.
    
    Tests:
    Ran core tests.
    Added logic to run two types of path format in TmpFileMgrTest:
    "s3a://authority" and "s3a://authority/dir".
    
    Change-Id: I028f375b9f535f8641261cc02f921497e076aa9b
    Reviewed-on: http://gerrit.cloudera.org:8080/17901
    Reviewed-by: Abhishek Rawat <ar...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/tmp-file-mgr-test.cc | 183 +++++++++++++++++++-----------------
 be/src/runtime/tmp-file-mgr.cc      |   6 +-
 2 files changed, 98 insertions(+), 91 deletions(-)

diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index 57cd9a7..d75b2f8 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -1015,101 +1015,108 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimitParsingRemotePath) {
       "/tmp/local-buffer-dir2", "/tmp/local-buffer-dir3"});
 
   // Successful cases for HDFS paths.
-  auto& dirs1 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:20500/tmp,/tmp/local-buffer-dir"));
-  EXPECT_NE(nullptr, dirs1);
-
-  auto& dirs2 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:20500/tmp:100,/tmp/local-buffer-dir"));
-  EXPECT_NE(nullptr, dirs2);
-  EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs2->path());
-  EXPECT_EQ(100, dirs2->bytes_limit());
-
-  auto& dirs3 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:20500/tmp:1KB:1,/tmp/local-buffer-dir"));
-  EXPECT_NE(nullptr, dirs3);
-  EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs3->path());
-  EXPECT_EQ(1024, dirs3->bytes_limit());
-
-  // Multiple local paths with one remote path.
-  auto tmp_mgr_4 = CreateTmpFileMgr("hdfs://localhost:20500/tmp,/tmp/local-buffer-dir1,"
-                                    "/tmp/local-buffer-dir2,/tmp/local-buffer-dir3");
-  auto& dirs4_local = GetTmpDirs(tmp_mgr_4);
-  auto& dirs4_remote = GetTmpRemoteDir(tmp_mgr_4);
-  EXPECT_NE(nullptr, dirs4_remote);
-  EXPECT_EQ(2, dirs4_local.size());
-  EXPECT_EQ("/tmp/local-buffer-dir2/impala-scratch", dirs4_local[0]->path());
-  EXPECT_EQ("/tmp/local-buffer-dir3/impala-scratch", dirs4_local[1]->path());
-
-  // Fails the parsing due to no port number for the HDFS path.
-  auto tmp_mgr_5 = CreateTmpFileMgr("hdfs://localhost/tmp,/tmp/local-buffer-dir");
-  auto& dirs5_local = GetTmpDirs(tmp_mgr_5);
-  auto& dirs5_remote = GetTmpRemoteDir(tmp_mgr_5);
-  EXPECT_EQ(1, dirs5_local.size());
-  EXPECT_EQ(nullptr, dirs5_remote);
-
-  // Fails to init due to no local buffer for the remote scratch space.
-  // We expect a non-null tmp_mgr, but the init process should fail.
-  EXPECT_NE(nullptr, CreateTmpFileMgr("hdfs://localhost:20500/tmp", false));
-
-  // Parse successfully, but the parsed HDFS path is unable to connect.
-  // These cases would fail the initialization of TmpFileMgr.
-  auto& dirs7 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:1/tmp::1,/tmp/local-buffer-dir", false));
-  EXPECT_EQ(nullptr, dirs7);
-
-  auto& dirs8 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:/tmp::,/tmp/local-buffer-dir", false));
-  EXPECT_EQ(nullptr, dirs8);
-
-  auto& dirs9 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost/tmp::1,/tmp/local-buffer-dir", false));
-  EXPECT_EQ(nullptr, dirs9);
-
-  auto& dirs10 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost/tmp:1,/tmp/local-buffer-dir", false));
-  EXPECT_EQ(nullptr, dirs10);
-
-  // Multiple remote paths, should support only one.
-  auto& dirs11 = GetTmpRemoteDir(
-      CreateTmpFileMgr("hdfs://localhost:20500/tmp,hdfs://localhost:20501/tmp,"
-                       "/tmp/local-buffer-dir"));
-  EXPECT_NE(nullptr, dirs11);
-  EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs11->path());
-
-  // The order of the buffer and the remote dir should not affect the result.
-  auto& dirs12 = GetTmpRemoteDir(
-      CreateTmpFileMgr("/tmp/local-buffer-dir, hdfs://localhost:20500/tmp,"
-                       "hdfs://localhost:20501/tmp"));
-  EXPECT_NE(nullptr, dirs12);
-  EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs12->path());
+  // Two types of paths, one with directory, one without.
+  vector<string> hdfs_paths{"hdfs://localhost:20500", "hdfs://localhost:20500/tmp"};
+  for (string hdfs_path : hdfs_paths) {
+    string full_hdfs_path = hdfs_path + "/impala-scratch";
+    auto& dirs1 = GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ",/tmp/local-buffer-dir"));
+    EXPECT_NE(nullptr, dirs1);
+
+    auto& dirs2 =
+        GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ":100,/tmp/local-buffer-dir"));
+    EXPECT_NE(nullptr, dirs2);
+    EXPECT_EQ(full_hdfs_path, dirs2->path());
+    EXPECT_EQ(100, dirs2->bytes_limit());
+
+    auto& dirs3 =
+        GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ":1KB:1,/tmp/local-buffer-dir"));
+    EXPECT_NE(nullptr, dirs3);
+    EXPECT_EQ(full_hdfs_path, dirs3->path());
+    EXPECT_EQ(1024, dirs3->bytes_limit());
+
+    // Multiple local paths with one remote path.
+    auto tmp_mgr_4 = CreateTmpFileMgr(hdfs_path
+        + ",/tmp/local-buffer-dir1,"
+          "/tmp/local-buffer-dir2,/tmp/local-buffer-dir3");
+    auto& dirs4_local = GetTmpDirs(tmp_mgr_4);
+    auto& dirs4_remote = GetTmpRemoteDir(tmp_mgr_4);
+    EXPECT_NE(nullptr, dirs4_remote);
+    EXPECT_EQ(full_hdfs_path, dirs4_remote->path());
+    EXPECT_EQ(2, dirs4_local.size());
+    EXPECT_EQ("/tmp/local-buffer-dir2/impala-scratch", dirs4_local[0]->path());
+    EXPECT_EQ("/tmp/local-buffer-dir3/impala-scratch", dirs4_local[1]->path());
+
+    // Fails the parsing due to no port number for the HDFS path.
+    auto tmp_mgr_5 = CreateTmpFileMgr("hdfs://localhost/tmp,/tmp/local-buffer-dir");
+    auto& dirs5_local = GetTmpDirs(tmp_mgr_5);
+    auto& dirs5_remote = GetTmpRemoteDir(tmp_mgr_5);
+    EXPECT_EQ(1, dirs5_local.size());
+    EXPECT_EQ(nullptr, dirs5_remote);
+
+    // Fails to init due to no local buffer for the remote scratch space.
+    // We expect a non-null tmp_mgr, but the init process should fail.
+    EXPECT_NE(nullptr, CreateTmpFileMgr(hdfs_path, false));
+
+    // Parse successfully, but the parsed HDFS path is unable to connect.
+    // These cases would fail the initialization of TmpFileMgr.
+    auto& dirs7 = GetTmpRemoteDir(
+        CreateTmpFileMgr("hdfs://localhost:1/tmp::1,/tmp/local-buffer-dir", false));
+    EXPECT_EQ(nullptr, dirs7);
+
+    auto& dirs8 = GetTmpRemoteDir(
+        CreateTmpFileMgr("hdfs://localhost:/tmp::,/tmp/local-buffer-dir", false));
+    EXPECT_EQ(nullptr, dirs8);
+
+    auto& dirs9 = GetTmpRemoteDir(
+        CreateTmpFileMgr("hdfs://localhost/tmp::1,/tmp/local-buffer-dir", false));
+    EXPECT_EQ(nullptr, dirs9);
+
+    auto& dirs10 = GetTmpRemoteDir(
+        CreateTmpFileMgr("hdfs://localhost/tmp:1,/tmp/local-buffer-dir", false));
+    EXPECT_EQ(nullptr, dirs10);
+
+    // Multiple remote paths, should support only one.
+    auto& dirs11 = GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path
+        + ",hdfs://localhost:20501/tmp,"
+          "/tmp/local-buffer-dir"));
+    EXPECT_NE(nullptr, dirs11);
+    EXPECT_EQ(full_hdfs_path, dirs11->path());
+
+    // The order of the buffer and the remote dir should not affect the result.
+    auto& dirs12 = GetTmpRemoteDir(CreateTmpFileMgr(
+        "/tmp/local-buffer-dir, " + hdfs_path + ",hdfs://localhost:20501/tmp"));
+    EXPECT_NE(nullptr, dirs12);
+    EXPECT_EQ(full_hdfs_path, dirs12->path());
+  }
 
   // Successful cases for parsing S3 paths.
   // Create a fake s3 connection in order to pass the connection verification.
   HdfsFsCache::HdfsFsMap fake_hdfs_conn_map;
   hdfsFS fake_conn = reinterpret_cast<hdfsFS>(1);
   fake_hdfs_conn_map.insert(make_pair("s3a://fake_host/", fake_conn));
-  auto& dirs13 = GetTmpRemoteDir(
-      CreateTmpFileMgr("/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only",
-          true, &fake_hdfs_conn_map));
-  EXPECT_NE(nullptr, dirs13);
-  EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs13->path());
-
-  auto& dirs14 = GetTmpRemoteDir(
-      CreateTmpFileMgr("/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only:100",
-          true, &fake_hdfs_conn_map));
-  EXPECT_NE(nullptr, dirs14);
-  EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs14->path());
-  EXPECT_EQ(100, dirs14->bytes_limit());
-
-  auto& dirs15 = GetTmpRemoteDir(CreateTmpFileMgr(
-      "/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only:1KB:1", true,
-      &fake_hdfs_conn_map));
-  EXPECT_NE(nullptr, dirs15);
-  EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs15->path());
-  EXPECT_EQ(1024, dirs15->bytes_limit());
+  // Two types of paths, one with directory, one without.
+  vector<string> s3_paths{"s3a://fake_host", "s3a://fake_host/dir"};
+  for (string s3_path : s3_paths) {
+    string full_s3_path = s3_path + "/impala-scratch";
+    auto& dirs13 = GetTmpRemoteDir(
+        CreateTmpFileMgr("/tmp/local-buffer-dir, " + s3_path, true, &fake_hdfs_conn_map));
+    EXPECT_NE(nullptr, dirs13);
+    EXPECT_EQ(full_s3_path, dirs13->path());
+
+    auto& dirs14 = GetTmpRemoteDir(CreateTmpFileMgr(
+        "/tmp/local-buffer-dir, " + s3_path + ":100", true, &fake_hdfs_conn_map));
+    EXPECT_NE(nullptr, dirs14);
+    EXPECT_EQ(full_s3_path, dirs14->path());
+    EXPECT_EQ(100, dirs14->bytes_limit());
+
+    auto& dirs15 = GetTmpRemoteDir(CreateTmpFileMgr(
+        "/tmp/local-buffer-dir, " + s3_path + ":1KB:1", true, &fake_hdfs_conn_map));
+    EXPECT_NE(nullptr, dirs15);
+    EXPECT_EQ(full_s3_path, dirs15->path());
+    EXPECT_EQ(1024, dirs15->bytes_limit());
+  }
 
-  // Failure cases for parsing S3 paths.
+  // Failure cases for parsing S3 paths, S3 paths don't allow the port number.
   auto& dirs16 = GetTmpRemoteDir(CreateTmpFileMgr(
       "/tmp/local-buffer-dir, s3a://fake_host:1234/for-parsing-test-only:1KB:1", true,
       &fake_hdfs_conn_map));
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index d494284..ce3e21c 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -720,7 +720,7 @@ Status TmpDirS3::VerifyAndCreate(MetricGroup* metrics, vector<bool>* is_tmp_dir_
   DCHECK(is_parsed_);
   hdfsFS hdfs_conn;
   RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(
-      parsed_raw_path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_), tmp_mgr->s3a_options()));
+      path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_), tmp_mgr->s3a_options()));
   return Status::OK();
 }
 
@@ -755,8 +755,8 @@ Status TmpDirHdfs::VerifyAndCreate(MetricGroup* metrics, vector<bool>* is_tmp_di
   hdfsFS hdfs_conn;
   // If the HDFS path doesn't exist, it would fail while uploading, so we
   // create the HDFS path if it doesn't exist.
-  RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(
-      parsed_raw_path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_)));
+  RETURN_IF_ERROR(
+      HdfsFsCache::instance()->GetConnection(path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_)));
   if (hdfsExists(hdfs_conn, path_.c_str()) != 0) {
     if (hdfsCreateDirectory(hdfs_conn, path_.c_str()) != 0) {
       return Status(GetHdfsErrorMsg("HDFS create path failed: ", path_));