You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by db...@apache.org on 2022/11/16 10:46:48 UTC
[impala] branch master updated: IMPALA-11683: Support Aliyun OSS File System
This is an automated email from the ASF dual-hosted git repository.
dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new c95342669 IMPALA-11683: Support Aliyun OSS File System
c95342669 is described below
commit c953426692625f3a7f1334307f195cfcb7584035
Author: yacai <he...@alibaba-inc.com>
AuthorDate: Mon Oct 24 16:31:45 2022 +0800
IMPALA-11683: Support Aliyun OSS File System
This patch adds support for OSS (Aliyun Object Storage Service).
It relies on the hadoop-aliyun module, so the implementation is similar to
that of other remote FileSystems.
Tests:
- Prepare:
Initialize OSS-related environment variables:
OSS_ACCESS_KEY_ID, OSS_SECRET_ACCESS_KEY, OSS_ACCESS_ENDPOINT.
Compile and create HDFS test data on an ECS instance. Upload the test data
to an OSS bucket.
- Modify all locations in HMS DB to point to the OSS bucket.
Remove some HDFS caching parameters. Run CORE tests.
Change-Id: I267e6531da58e3ac97029fea4c5e075724587910
Reviewed-on: http://gerrit.cloudera.org:8080/19165
Reviewed-by: Quanlong Huang <hu...@gmail.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/hdfs-table-sink.cc | 1 +
bin/impala-config.sh | 26 ++++++++++++++++++++++
.../java/org/apache/impala/catalog/HdfsTable.java | 6 +++++
.../org/apache/impala/common/FileSystemUtil.java | 13 +++++++++++
.../java/org/apache/impala/testutil/TestUtils.java | 2 +-
java/executor-deps/pom.xml | 6 +++++
java/pom.xml | 1 +
testdata/bin/create-load-data.sh | 8 +++----
.../common/etc/hadoop/conf/core-site.xml.py | 10 +++++++++
tests/common/impala_test_suite.py | 4 ++++
tests/common/skip.py | 9 ++++----
tests/custom_cluster/test_hdfs_fd_caching.py | 9 ++++----
tests/custom_cluster/test_metastore_service.py | 6 ++---
tests/util/filesystem_utils.py | 3 ++-
14 files changed, 87 insertions(+), 17 deletions(-)
diff --git a/be/src/exec/hdfs-table-sink.cc b/be/src/exec/hdfs-table-sink.cc
index 75b8a5bf7..37e6dcc25 100644
--- a/be/src/exec/hdfs-table-sink.cc
+++ b/be/src/exec/hdfs-table-sink.cc
@@ -459,6 +459,7 @@ Status HdfsTableSink::CreateNewTmpFile(RuntimeState* state,
if (IsS3APath(tmp_hdfs_file_name_cstr) ||
IsABFSPath(tmp_hdfs_file_name_cstr) ||
IsADLSPath(tmp_hdfs_file_name_cstr) ||
+ IsOSSPath(tmp_hdfs_file_name_cstr) ||
IsGcsPath(tmp_hdfs_file_name_cstr) ||
IsCosPath(tmp_hdfs_file_name_cstr) ||
IsSFSPath(tmp_hdfs_file_name_cstr) ||
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 212e87b49..188f21879 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -225,6 +225,7 @@ export ARCH_NAME=$(uname -p)
# other branches to override them in impala-config-branch.sh for cleaner patches.
export IMPALA_BOUNCY_CASTLE_VERSION=1.68
export IMPALA_COS_VERSION=3.1.0-8.0.8
+export IMPALA_OSS_VERSION=3.1.1
export IMPALA_DERBY_VERSION=10.14.2.0
export IMPALA_GUAVA_VERSION=31.1-jre
export IMPALA_HUDI_VERSION=0.5.0-incubating
@@ -490,6 +491,10 @@ export COS_SECRET_ID="${COS_SECRET_ID-}"
export COS_SECRET_KEY="${COS_SECRET_KEY-}"
export COS_REGION="${COS_REGION-}"
export COS_BUCKET="${COS_BUCKET-}"
+export OSS_ACCESS_KEY_ID="${OSS_ACCESS_KEY_ID-}"
+export OSS_SECRET_ACCESS_KEY="${OSS_SECRET_ACCESS_KEY-}"
+export OSS_ACCESS_ENDPOINT="${OSS_ACCESS_ENDPOINT-}"
+export OSS_BUCKET="${OSS_BUCKET-}"
export HDFS_REPLICATION="${HDFS_REPLICATION-3}"
export ISILON_NAMENODE="${ISILON_NAMENODE-}"
# Internal and external interfaces that test cluster services will listen on. The
@@ -678,6 +683,26 @@ elif [ "${TARGET_FILESYSTEM}" = "cosn" ]; then
fi
DEFAULT_FS="cosn://${COS_BUCKET}"
export DEFAULT_FS
+elif [ "${TARGET_FILESYSTEM}" = "oss" ]; then
+ # Basic error checking
+ if [[ "${OSS_ACCESS_KEY_ID}" = "" ]]; then
+ echo "OSS_ACCESS_KEY_ID cannot be an empty string for OSS"
+ return 1
+ fi
+ if [[ "${OSS_SECRET_ACCESS_KEY}" = "" ]]; then
+ echo "OSS_SECRET_ACCESS_KEY cannot be an empty string for OSS"
+ return 1
+ fi
+ if [[ "${OSS_ACCESS_ENDPOINT}" = "" ]]; then
+ echo "OSS_ACCESS_ENDPOINT cannot be an empty string for OSS"
+ return 1
+ fi
+ if [[ "${OSS_BUCKET}" = "" ]]; then
+ echo "OSS_BUCKET cannot be an empty string for OSS"
+ return 1
+ fi
+ DEFAULT_FS="oss://${OSS_BUCKET}"
+ export DEFAULT_FS
elif [ "${TARGET_FILESYSTEM}" = "isilon" ]; then
if [ "${ISILON_NAMENODE}" = "" ]; then
echo "In order to access the Isilon filesystem, ISILON_NAMENODE"
@@ -946,6 +971,7 @@ echo "IMPALA_KUDU_VERSION = $IMPALA_KUDU_VERSION"
echo "IMPALA_RANGER_VERSION = $IMPALA_RANGER_VERSION"
echo "IMPALA_ICEBERG_VERSION = $IMPALA_ICEBERG_VERSION"
echo "IMPALA_COS_VERSION = $IMPALA_COS_VERSION"
+echo "IMPALA_OSS_VERSION = $IMPALA_OSS_VERSION"
# Kerberos things. If the cluster exists and is kerberized, source
# the required environment. This is required for any hadoop tool to
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 9ca396294..7a9913828 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -867,6 +867,12 @@ public class HdfsTable extends Table implements FeFsTable {
// So calling getPermissions() on COS files make no sense. Assume all COS files have
// READ_WRITE permissions.
if (FileSystemUtil.isCOSFileSystem(fs)) return true;
+
+ // In OSS, file owner and group are persisted, but the permissions model is not
+ // enforced. Authorization occurs at the level of the entire Aliyun account via Aliyun
+ // Resource Access Management.
+ // The append operation is not supported.
+ if (FileSystemUtil.isOSSFileSystem(fs)) return true;
return false;
}
diff --git a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
index 82cf9cfcf..2eca3b76f 100644
--- a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
+++ b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
@@ -72,6 +72,7 @@ public class FileSystemUtil {
public static final String SCHEME_ALLUXIO = "alluxio";
public static final String SCHEME_GCS = "gs";
public static final String SCHEME_COS = "cosn";
+ public static final String SCHEME_OSS = "oss";
public static final String SCHEME_SFS = "sfs";
/**
@@ -101,6 +102,7 @@ public class FileSystemUtil {
.add(SCHEME_OFS)
.add(SCHEME_GCS)
.add(SCHEME_COS)
+ .add(SCHEME_OSS)
.build();
/**
@@ -116,6 +118,7 @@ public class FileSystemUtil {
.add(SCHEME_OFS)
.add(SCHEME_GCS)
.add(SCHEME_COS)
+ .add(SCHEME_OSS)
.build();
/**
@@ -132,6 +135,7 @@ public class FileSystemUtil {
.add(SCHEME_OFS)
.add(SCHEME_GCS)
.add(SCHEME_COS)
+ .add(SCHEME_OSS)
.build();
/**
@@ -439,6 +443,13 @@ public class FileSystemUtil {
return hasScheme(fs, SCHEME_COS);
}
+ /**
+ * Returns true iff the filesystem is an OssFileSystem.
+ */
+ public static boolean isOSSFileSystem(FileSystem fs) {
+ return hasScheme(fs, SCHEME_OSS);
+ }
+
/**
* Returns true iff the filesystem is AdlFileSystem.
*/
@@ -549,6 +560,7 @@ public class FileSystemUtil {
ALLUXIO,
GCS,
COS,
+ OSS,
SFS;
private static final Map<String, FsType> SCHEME_TO_FS_MAPPING =
@@ -564,6 +576,7 @@ public class FileSystemUtil {
.put(SCHEME_ALLUXIO, ALLUXIO)
.put(SCHEME_GCS, GCS)
.put(SCHEME_COS, COS)
+ .put(SCHEME_OSS, OSS)
.build();
/**
diff --git a/fe/src/test/java/org/apache/impala/testutil/TestUtils.java b/fe/src/test/java/org/apache/impala/testutil/TestUtils.java
index c7fde3e69..24582f8ad 100644
--- a/fe/src/test/java/org/apache/impala/testutil/TestUtils.java
+++ b/fe/src/test/java/org/apache/impala/testutil/TestUtils.java
@@ -178,7 +178,7 @@ public class TestUtils {
*/
public static final ResultFilter SCAN_NODE_SCHEME_FILTER = new ResultFilter() {
- private final String fsSchemes = "(HDFS|S3|LOCAL|ADLS)";
+ private final String fsSchemes = "(HDFS|S3|LOCAL|ADLS|OSS)";
private final Pattern scanNodeFsScheme = Pattern.compile("SCAN " + fsSchemes);
// We don't match the size because the FILE_SIZE_FILTER could remove it
private final Pattern scanNodeInputMetadata =
diff --git a/java/executor-deps/pom.xml b/java/executor-deps/pom.xml
index 232e8ef1f..4f1ea5029 100644
--- a/java/executor-deps/pom.xml
+++ b/java/executor-deps/pom.xml
@@ -125,6 +125,12 @@ under the License.
<version>${cos.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-aliyun</artifactId>
+ <version>${oss.version}</version>
+ </dependency>
+
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
diff --git a/java/pom.xml b/java/pom.xml
index 5a76a72f4..41ce4f0a7 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -53,6 +53,7 @@ under the License.
<kite.version>${env.IMPALA_KITE_VERSION}</kite.version>
<knox.version>${env.IMPALA_KNOX_VERSION}</knox.version>
<cos.version>${env.IMPALA_COS_VERSION}</cos.version>
+ <oss.version>${env.IMPALA_OSS_VERSION}</oss.version>
<thrift.version>${env.IMPALA_THRIFT_POM_VERSION}</thrift.version>
<impala.extdatasrc.api.version>${project.version}</impala.extdatasrc.api.version>
<impala.query.event.hook.api.version>${project.version}</impala.query.event.hook.api.version>
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 78090596d..9585fa3d3 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -119,7 +119,7 @@ fi
TIMEOUT_PID=$!
SCHEMA_MISMATCH_ERROR="A schema change has been detected in the metadata, "
-SCHEMA_MISMATCH_ERROR+="but it cannot be loaded on Isilon, s3, gcs, cos or local "
+SCHEMA_MISMATCH_ERROR+="but it cannot be loaded on Isilon, s3, gcs, cos, oss or local "
SCHEMA_MISMATCH_ERROR+="filesystem, and the filesystem is ${TARGET_FILESYSTEM}".
if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then
@@ -135,10 +135,10 @@ elif [ $SKIP_SNAPSHOT_LOAD -eq 0 ]; then
if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh; then
if [[ "${TARGET_FILESYSTEM}" == "isilon" || "${TARGET_FILESYSTEM}" == "s3" || \
"${TARGET_FILESYSTEM}" == "local" || "${TARGET_FILESYSTEM}" == "gs" || \
- "${TARGET_FILESYSTEM}" == "cosn" ]] ; then
+ "${TARGET_FILESYSTEM}" == "cosn" || "${TARGET_FILESYSTEM}" == "oss" ]] ; then
echo "ERROR in $0 at line $LINENO: A schema change has been detected in the"
- echo "metadata, but it cannot be loaded on isilon, s3, gcs, cos or local and the"
- echo "target file system is ${TARGET_FILESYSTEM}. Exiting."
+ echo "metadata, but it cannot be loaded on isilon, s3, gcs, cos, oss or local"
+ echo "and the target file system is ${TARGET_FILESYSTEM}. Exiting."
# Generate an explicit JUnitXML symptom report here for easier triaging
${IMPALA_HOME}/bin/generate_junitxml.py --phase=dataload \
--step=check-schema-diff.sh --error "${SCHEMA_MISMATCH_ERROR}"
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py b/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py
index 4d5f8d8c6..499cba249 100644
--- a/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py
+++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py
@@ -108,6 +108,16 @@ CONFIG = {
'fs.cosn.bucket.region': '${COS_REGION}',
'fs.cosn.impl': 'org.apache.hadoop.fs.CosFileSystem',
'fs.AbstractFileSystem.cosn.impl': 'org.apache.hadoop.fs.CosN',
+
+ # OSS configuration
+ # Note: This is needed even when not running on OSS, because some frontend tests
+ # include OSS paths that require initializing an OSS filesystem.
+ # See ExplainTest.testScanNodeFsScheme().
+ 'fs.oss.accessKeyId': '${OSS_ACCESS_KEY_ID}',
+ 'fs.oss.accessKeySecret': '${OSS_SECRET_ACCESS_KEY}',
+ 'fs.oss.endpoint': '${OSS_ACCESS_ENDPOINT}',
+ 'fs.oss.impl': 'org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem',
+ 'fs.AbstractFileSystem.oss.impl': 'org.apache.hadoop.fs.aliyun.oss.OSS',
}
if target_filesystem == 's3':
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index bfa926dff..149b0a790 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -69,6 +69,7 @@ from tests.util.filesystem_utils import (
IS_ADLS,
IS_GCS,
IS_COS,
+ IS_OSS,
IS_HDFS,
S3_BUCKET_NAME,
S3GUARD_ENABLED,
@@ -271,6 +272,9 @@ class ImpalaTestSuite(BaseTestSuite):
elif IS_COS:
# COS is implemented via HDFS command line client
cls.filesystem_client = HadoopFsCommandLineClient("COS")
+ elif IS_OSS:
+ # OSS is implemented via HDFS command line client
+ cls.filesystem_client = HadoopFsCommandLineClient("OSS")
elif IS_OZONE:
cls.filesystem_client = HadoopFsCommandLineClient("Ozone")
diff --git a/tests/common/skip.py b/tests/common/skip.py
index 939d5d9cd..6ed5b477d 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -32,6 +32,7 @@ from tests.util.filesystem_utils import (
IS_ADLS,
IS_GCS,
IS_COS,
+ IS_OSS,
IS_EC,
IS_HDFS,
IS_ISILON,
@@ -64,9 +65,9 @@ class SkipIfFS:
reason="Empty directories are not supported on S3")
file_or_folder_name_ends_with_period = pytest.mark.skipif(IS_ABFS,
reason="ABFS does not support file / directories that end with a period")
- stress_insert_timeouts = pytest.mark.skipif(IS_COS or IS_GCS,
+ stress_insert_timeouts = pytest.mark.skipif(IS_COS or IS_GCS or IS_OSS,
reason="IMPALA-10563, IMPALA-10773")
- shutdown_idle_fails = pytest.mark.skipif(IS_COS or IS_GCS,
+ shutdown_idle_fails = pytest.mark.skipif(IS_COS or IS_GCS or IS_OSS,
reason="IMPALA-10562")
late_filters = pytest.mark.skipif(IS_ISILON, reason="IMPALA-6998")
read_past_eof = pytest.mark.skipif(IS_S3 or IS_GCS, reason="IMPALA-2512")
@@ -80,9 +81,9 @@ class SkipIfFS:
reason="Tests rely on HDFS qualified paths, IMPALA-1872")
no_partial_listing = pytest.mark.skipif(not IS_HDFS,
reason="Tests rely on HDFS partial listing.")
- variable_listing_times = pytest.mark.skipif(IS_S3 or IS_GCS or IS_COS,
+ variable_listing_times = pytest.mark.skipif(IS_S3 or IS_GCS or IS_COS or IS_OSS,
reason="Flakiness due to unpredictable listing times on S3.")
- eventually_consistent = pytest.mark.skipif(IS_ADLS or IS_COS,
+ eventually_consistent = pytest.mark.skipif(IS_ADLS or IS_COS or IS_OSS,
reason="The client is slow to realize changes to file metadata")
class SkipIfKudu:
diff --git a/tests/custom_cluster/test_hdfs_fd_caching.py b/tests/custom_cluster/test_hdfs_fd_caching.py
index d46232d2b..fd9b04f82 100644
--- a/tests/custom_cluster/test_hdfs_fd_caching.py
+++ b/tests/custom_cluster/test_hdfs_fd_caching.py
@@ -24,7 +24,8 @@ from tests.util.filesystem_utils import (
IS_ISILON,
IS_ADLS,
IS_GCS,
- IS_COS)
+ IS_COS,
+ IS_OSS)
from time import sleep
@@ -135,7 +136,7 @@ class TestHdfsFdCaching(CustomClusterTestSuite):
# Caching applies to HDFS, Ozone, S3, and ABFS files. If this is HDFS, Ozone, S3, or
# ABFS, then verify that caching works. Otherwise, verify that file handles are not
# cached.
- if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
+ if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
caching_expected = False
else:
caching_expected = True
@@ -152,7 +153,7 @@ class TestHdfsFdCaching(CustomClusterTestSuite):
handle_timeout = 5
# Only test eviction on platforms where caching is enabled.
- if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
+ if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
return
caching_expected = True
self.run_fd_caching_test(vector, caching_expected, cache_capacity, handle_timeout)
@@ -203,7 +204,7 @@ class TestHdfsFdCaching(CustomClusterTestSuite):
eviction_timeout_secs = 5
# Only test eviction on platforms where caching is enabled.
- if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
+ if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
return
# Maximum number of file handles cached.
diff --git a/tests/custom_cluster/test_metastore_service.py b/tests/custom_cluster/test_metastore_service.py
index 0611d63b6..e7c18cfb3 100644
--- a/tests/custom_cluster/test_metastore_service.py
+++ b/tests/custom_cluster/test_metastore_service.py
@@ -28,7 +28,7 @@ from hive_metastore.ttypes import SerDeInfo
from tests.util.event_processor_utils import EventProcessorUtils
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.util.filesystem_utils import (IS_S3, IS_ADLS, IS_GCS, IS_COS)
+from tests.util.filesystem_utils import (IS_S3, IS_ADLS, IS_GCS, IS_COS, IS_OSS)
class TestMetastoreService(CustomClusterTestSuite):
@@ -1174,7 +1174,7 @@ class TestMetastoreService(CustomClusterTestSuite):
assert get_parts_by_names_result.dictionary is not None
# obj_dict will only be populated when the table is on HDFS
# where block locations are available.
- if not IS_S3 and not IS_GCS and not IS_COS and not IS_ADLS:
+ if not IS_S3 and not IS_GCS and not IS_COS and not IS_ADLS and not IS_OSS:
assert len(get_parts_by_names_result.dictionary.values) > 0
else:
assert get_parts_by_names_result.dictionary is None
@@ -1202,7 +1202,7 @@ class TestMetastoreService(CustomClusterTestSuite):
assert obj_dict is not None
# obj_dict will only be populated when the table is on HDFS
# where block locations are available.
- if not IS_S3 and not IS_GCS and not IS_COS and not IS_ADLS:
+ if not IS_S3 and not IS_GCS and not IS_COS and not IS_ADLS and not IS_OSS:
assert len(obj_dict.values) > 0
def __assert_no_filemd(self, filemetadata, obj_dict):
diff --git a/tests/util/filesystem_utils.py b/tests/util/filesystem_utils.py
index 88f5bfd0a..7444db7e8 100644
--- a/tests/util/filesystem_utils.py
+++ b/tests/util/filesystem_utils.py
@@ -33,6 +33,7 @@ IS_ADLS = FILESYSTEM == "adls"
IS_ABFS = FILESYSTEM == "abfs"
IS_GCS = FILESYSTEM == "gs"
IS_COS = FILESYSTEM == "cosn"
+IS_OSS = FILESYSTEM == "oss"
IS_OZONE = FILESYSTEM == "ozone"
IS_EC = os.getenv("ERASURE_CODING") == "true"
IS_ENCRYPTED = os.getenv("USE_OZONE_ENCRYPTION") == "true"
@@ -60,7 +61,7 @@ ADLS_CLIENT_SECRET = os.getenv("azure_client_secret")
# A map of FILESYSTEM values to their corresponding Scan Node types
fs_to_name = {'s3': 'S3', 'hdfs': 'HDFS', 'local': 'LOCAL', 'adls': 'ADLS',
- 'abfs': 'ADLS', 'gs': 'GCS', 'cosn': 'COS', 'ozone': 'OZONE'}
+ 'abfs': 'ADLS', 'gs': 'GCS', 'cosn': 'COS', 'ozone': 'OZONE', 'oss': 'OSS'}
def get_fs_name(fs):