You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/10/08 00:50:36 UTC
[impala] branch master updated: IMPALA-10945: Fix S3 scratch path
behavior
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 45fd332 IMPALA-10945: Fix S3 scratch path behavior
45fd332 is described below
commit 45fd3320ad4f68ca86998dff0c9504aa896a278a
Author: Yida Wu <wy...@gmail.com>
AuthorDate: Sun Oct 3 08:30:46 2021 -0700
IMPALA-10945: Fix S3 scratch path behavior
IMPALA-10429 "Support Spill to HDFS" introduces a new behavior to
S3 scratch path. It added a path verification for the S3 path,
however, the HdfsFsCache::GetNameNodeFromPath in the verification
forces the input to have at least a directory after the authority,
like "s3a://authority/dir", otherwise it will return an error and
lead to a failure on the TmpFileMgr initialization. Therefore, it
changes the previous behavior which was able to support
"s3a://authority", and may affect current users.
This patch resumes the behavior of the s3 scratch path to allow
"s3a://authority". The solution is to pass the path of the user's
input combined with a scratch suffix "/impala-scratch" to the
verification function, therefore, at least one directory is contained
in the path.
Tests:
Ran core tests.
Added logic to run two types of path format in TmpFileMgrTest:
"s3a://authority" and "s3a://authority/dir".
Change-Id: I028f375b9f535f8641261cc02f921497e076aa9b
Reviewed-on: http://gerrit.cloudera.org:8080/17901
Reviewed-by: Abhishek Rawat <ar...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/runtime/tmp-file-mgr-test.cc | 183 +++++++++++++++++++-----------------
be/src/runtime/tmp-file-mgr.cc | 6 +-
2 files changed, 98 insertions(+), 91 deletions(-)
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index 57cd9a7..d75b2f8 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -1015,101 +1015,108 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimitParsingRemotePath) {
"/tmp/local-buffer-dir2", "/tmp/local-buffer-dir3"});
// Successful cases for HDFS paths.
- auto& dirs1 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:20500/tmp,/tmp/local-buffer-dir"));
- EXPECT_NE(nullptr, dirs1);
-
- auto& dirs2 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:20500/tmp:100,/tmp/local-buffer-dir"));
- EXPECT_NE(nullptr, dirs2);
- EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs2->path());
- EXPECT_EQ(100, dirs2->bytes_limit());
-
- auto& dirs3 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:20500/tmp:1KB:1,/tmp/local-buffer-dir"));
- EXPECT_NE(nullptr, dirs3);
- EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs3->path());
- EXPECT_EQ(1024, dirs3->bytes_limit());
-
- // Multiple local paths with one remote path.
- auto tmp_mgr_4 = CreateTmpFileMgr("hdfs://localhost:20500/tmp,/tmp/local-buffer-dir1,"
- "/tmp/local-buffer-dir2,/tmp/local-buffer-dir3");
- auto& dirs4_local = GetTmpDirs(tmp_mgr_4);
- auto& dirs4_remote = GetTmpRemoteDir(tmp_mgr_4);
- EXPECT_NE(nullptr, dirs4_remote);
- EXPECT_EQ(2, dirs4_local.size());
- EXPECT_EQ("/tmp/local-buffer-dir2/impala-scratch", dirs4_local[0]->path());
- EXPECT_EQ("/tmp/local-buffer-dir3/impala-scratch", dirs4_local[1]->path());
-
- // Fails the parsing due to no port number for the HDFS path.
- auto tmp_mgr_5 = CreateTmpFileMgr("hdfs://localhost/tmp,/tmp/local-buffer-dir");
- auto& dirs5_local = GetTmpDirs(tmp_mgr_5);
- auto& dirs5_remote = GetTmpRemoteDir(tmp_mgr_5);
- EXPECT_EQ(1, dirs5_local.size());
- EXPECT_EQ(nullptr, dirs5_remote);
-
- // Fails to init due to no local buffer for the remote scratch space.
- // We expect a non-null tmp_mgr, but the init process should fail.
- EXPECT_NE(nullptr, CreateTmpFileMgr("hdfs://localhost:20500/tmp", false));
-
- // Parse successfully, but the parsed HDFS path is unable to connect.
- // These cases would fail the initialization of TmpFileMgr.
- auto& dirs7 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:1/tmp::1,/tmp/local-buffer-dir", false));
- EXPECT_EQ(nullptr, dirs7);
-
- auto& dirs8 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:/tmp::,/tmp/local-buffer-dir", false));
- EXPECT_EQ(nullptr, dirs8);
-
- auto& dirs9 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost/tmp::1,/tmp/local-buffer-dir", false));
- EXPECT_EQ(nullptr, dirs9);
-
- auto& dirs10 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost/tmp:1,/tmp/local-buffer-dir", false));
- EXPECT_EQ(nullptr, dirs10);
-
- // Multiple remote paths, should support only one.
- auto& dirs11 = GetTmpRemoteDir(
- CreateTmpFileMgr("hdfs://localhost:20500/tmp,hdfs://localhost:20501/tmp,"
- "/tmp/local-buffer-dir"));
- EXPECT_NE(nullptr, dirs11);
- EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs11->path());
-
- // The order of the buffer and the remote dir should not affect the result.
- auto& dirs12 = GetTmpRemoteDir(
- CreateTmpFileMgr("/tmp/local-buffer-dir, hdfs://localhost:20500/tmp,"
- "hdfs://localhost:20501/tmp"));
- EXPECT_NE(nullptr, dirs12);
- EXPECT_EQ("hdfs://localhost:20500/tmp/impala-scratch", dirs12->path());
+ // Two types of paths, one with directory, one without.
+ vector<string> hdfs_paths{"hdfs://localhost:20500", "hdfs://localhost:20500/tmp"};
+ for (string hdfs_path : hdfs_paths) {
+ string full_hdfs_path = hdfs_path + "/impala-scratch";
+ auto& dirs1 = GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ",/tmp/local-buffer-dir"));
+ EXPECT_NE(nullptr, dirs1);
+
+ auto& dirs2 =
+ GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ":100,/tmp/local-buffer-dir"));
+ EXPECT_NE(nullptr, dirs2);
+ EXPECT_EQ(full_hdfs_path, dirs2->path());
+ EXPECT_EQ(100, dirs2->bytes_limit());
+
+ auto& dirs3 =
+ GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path + ":1KB:1,/tmp/local-buffer-dir"));
+ EXPECT_NE(nullptr, dirs3);
+ EXPECT_EQ(full_hdfs_path, dirs3->path());
+ EXPECT_EQ(1024, dirs3->bytes_limit());
+
+ // Multiple local paths with one remote path.
+ auto tmp_mgr_4 = CreateTmpFileMgr(hdfs_path
+ + ",/tmp/local-buffer-dir1,"
+ "/tmp/local-buffer-dir2,/tmp/local-buffer-dir3");
+ auto& dirs4_local = GetTmpDirs(tmp_mgr_4);
+ auto& dirs4_remote = GetTmpRemoteDir(tmp_mgr_4);
+ EXPECT_NE(nullptr, dirs4_remote);
+ EXPECT_EQ(full_hdfs_path, dirs4_remote->path());
+ EXPECT_EQ(2, dirs4_local.size());
+ EXPECT_EQ("/tmp/local-buffer-dir2/impala-scratch", dirs4_local[0]->path());
+ EXPECT_EQ("/tmp/local-buffer-dir3/impala-scratch", dirs4_local[1]->path());
+
+ // Fails the parsing due to no port number for the HDFS path.
+ auto tmp_mgr_5 = CreateTmpFileMgr("hdfs://localhost/tmp,/tmp/local-buffer-dir");
+ auto& dirs5_local = GetTmpDirs(tmp_mgr_5);
+ auto& dirs5_remote = GetTmpRemoteDir(tmp_mgr_5);
+ EXPECT_EQ(1, dirs5_local.size());
+ EXPECT_EQ(nullptr, dirs5_remote);
+
+ // Fails to init due to no local buffer for the remote scratch space.
+ // We expect a non-null tmp_mgr, but the init process should fail.
+ EXPECT_NE(nullptr, CreateTmpFileMgr(hdfs_path, false));
+
+ // Parse successfully, but the parsed HDFS path is unable to connect.
+ // These cases would fail the initialization of TmpFileMgr.
+ auto& dirs7 = GetTmpRemoteDir(
+ CreateTmpFileMgr("hdfs://localhost:1/tmp::1,/tmp/local-buffer-dir", false));
+ EXPECT_EQ(nullptr, dirs7);
+
+ auto& dirs8 = GetTmpRemoteDir(
+ CreateTmpFileMgr("hdfs://localhost:/tmp::,/tmp/local-buffer-dir", false));
+ EXPECT_EQ(nullptr, dirs8);
+
+ auto& dirs9 = GetTmpRemoteDir(
+ CreateTmpFileMgr("hdfs://localhost/tmp::1,/tmp/local-buffer-dir", false));
+ EXPECT_EQ(nullptr, dirs9);
+
+ auto& dirs10 = GetTmpRemoteDir(
+ CreateTmpFileMgr("hdfs://localhost/tmp:1,/tmp/local-buffer-dir", false));
+ EXPECT_EQ(nullptr, dirs10);
+
+ // Multiple remote paths, should support only one.
+ auto& dirs11 = GetTmpRemoteDir(CreateTmpFileMgr(hdfs_path
+ + ",hdfs://localhost:20501/tmp,"
+ "/tmp/local-buffer-dir"));
+ EXPECT_NE(nullptr, dirs11);
+ EXPECT_EQ(full_hdfs_path, dirs11->path());
+
+ // The order of the buffer and the remote dir should not affect the result.
+ auto& dirs12 = GetTmpRemoteDir(CreateTmpFileMgr(
+ "/tmp/local-buffer-dir, " + hdfs_path + ",hdfs://localhost:20501/tmp"));
+ EXPECT_NE(nullptr, dirs12);
+ EXPECT_EQ(full_hdfs_path, dirs12->path());
+ }
// Successful cases for parsing S3 paths.
// Create a fake s3 connection in order to pass the connection verification.
HdfsFsCache::HdfsFsMap fake_hdfs_conn_map;
hdfsFS fake_conn = reinterpret_cast<hdfsFS>(1);
fake_hdfs_conn_map.insert(make_pair("s3a://fake_host/", fake_conn));
- auto& dirs13 = GetTmpRemoteDir(
- CreateTmpFileMgr("/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only",
- true, &fake_hdfs_conn_map));
- EXPECT_NE(nullptr, dirs13);
- EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs13->path());
-
- auto& dirs14 = GetTmpRemoteDir(
- CreateTmpFileMgr("/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only:100",
- true, &fake_hdfs_conn_map));
- EXPECT_NE(nullptr, dirs14);
- EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs14->path());
- EXPECT_EQ(100, dirs14->bytes_limit());
-
- auto& dirs15 = GetTmpRemoteDir(CreateTmpFileMgr(
- "/tmp/local-buffer-dir, s3a://fake_host/for-parsing-test-only:1KB:1", true,
- &fake_hdfs_conn_map));
- EXPECT_NE(nullptr, dirs15);
- EXPECT_EQ("s3a://fake_host/for-parsing-test-only/impala-scratch", dirs15->path());
- EXPECT_EQ(1024, dirs15->bytes_limit());
+ // Two types of paths, one with directory, one without.
+ vector<string> s3_paths{"s3a://fake_host", "s3a://fake_host/dir"};
+ for (string s3_path : s3_paths) {
+ string full_s3_path = s3_path + "/impala-scratch";
+ auto& dirs13 = GetTmpRemoteDir(
+ CreateTmpFileMgr("/tmp/local-buffer-dir, " + s3_path, true, &fake_hdfs_conn_map));
+ EXPECT_NE(nullptr, dirs13);
+ EXPECT_EQ(full_s3_path, dirs13->path());
+
+ auto& dirs14 = GetTmpRemoteDir(CreateTmpFileMgr(
+ "/tmp/local-buffer-dir, " + s3_path + ":100", true, &fake_hdfs_conn_map));
+ EXPECT_NE(nullptr, dirs14);
+ EXPECT_EQ(full_s3_path, dirs14->path());
+ EXPECT_EQ(100, dirs14->bytes_limit());
+
+ auto& dirs15 = GetTmpRemoteDir(CreateTmpFileMgr(
+ "/tmp/local-buffer-dir, " + s3_path + ":1KB:1", true, &fake_hdfs_conn_map));
+ EXPECT_NE(nullptr, dirs15);
+ EXPECT_EQ(full_s3_path, dirs15->path());
+ EXPECT_EQ(1024, dirs15->bytes_limit());
+ }
- // Failure cases for parsing S3 paths.
+ // Failure cases for parsing S3 paths, S3 paths don't allow the port number.
auto& dirs16 = GetTmpRemoteDir(CreateTmpFileMgr(
"/tmp/local-buffer-dir, s3a://fake_host:1234/for-parsing-test-only:1KB:1", true,
&fake_hdfs_conn_map));
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index d494284..ce3e21c 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -720,7 +720,7 @@ Status TmpDirS3::VerifyAndCreate(MetricGroup* metrics, vector<bool>* is_tmp_dir_
DCHECK(is_parsed_);
hdfsFS hdfs_conn;
RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(
- parsed_raw_path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_), tmp_mgr->s3a_options()));
+ path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_), tmp_mgr->s3a_options()));
return Status::OK();
}
@@ -755,8 +755,8 @@ Status TmpDirHdfs::VerifyAndCreate(MetricGroup* metrics, vector<bool>* is_tmp_di
hdfsFS hdfs_conn;
// If the HDFS path doesn't exist, it would fail while uploading, so we
// create the HDFS path if it doesn't exist.
- RETURN_IF_ERROR(HdfsFsCache::instance()->GetConnection(
- parsed_raw_path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_)));
+ RETURN_IF_ERROR(
+ HdfsFsCache::instance()->GetConnection(path_, &hdfs_conn, &(tmp_mgr->hdfs_conns_)));
if (hdfsExists(hdfs_conn, path_.c_str()) != 0) {
if (hdfsCreateDirectory(hdfs_conn, path_.c_str()) != 0) {
return Status(GetHdfsErrorMsg("HDFS create path failed: ", path_));