You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/05/10 15:25:00 UTC
[impala] branch master updated (d423979 -> 17daa6e)
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.
from d423979 IMPALA-5843: Use page index in Parquet files to skip pages
new a2c5d95 IMPALA-8121: part 2: use local catalog in containers
new 327b938 IMPALA-8516. Update maven for Jenkins builds
new 17daa6e IMPALA-8369 (part 2): Hive 3: switch to Tez-on-YARN execution
The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
bin/bootstrap_build.sh | 11 +-
bin/bootstrap_system.sh | 18 +-
bin/bootstrap_toolchain.py | 7 +-
bin/create-test-configuration.sh | 11 +-
bin/generate_xml_config.py | 8 +-
bin/impala-config.sh | 10 +-
bin/jenkins/critique-gerrit-review.py | 2 +-
docker/catalogd/Dockerfile | 3 +-
docker/coord_exec/Dockerfile | 3 +-
docker/coordinator/Dockerfile | 2 +-
fe/pom.xml | 26 ++-
.../apache/impala/analysis/CopyTestCaseStmt.java | 2 +-
.../java/org/apache/impala/service/JdbcTest.java | 12 +-
fe/src/test/resources/hive-site.xml.py | 5 +-
shaded-deps/pom.xml | 1 -
testdata/cluster/admin | 16 ++
.../common/etc/hadoop/conf/capacity-scheduler.xml | 223 +++++++++++++++++++++
.../common/etc/hadoop/conf/yarn-site.xml.py | 97 +++++++++
.../common/etc/hadoop/conf/yarn-site.xml.tmpl | 154 --------------
.../partition-ddl-predicates-hdfs-only.test | 12 +-
tests/common/impala_connection.py | 1 -
tests/common/impala_test_suite.py | 20 +-
tests/common/skip.py | 50 +++++
tests/hs2/test_hs2.py | 57 ++++++
tests/metadata/test_ddl.py | 18 +-
tests/metadata/test_ddl_base.py | 14 +-
tests/metadata/test_hdfs_permissions.py | 4 +-
tests/metadata/test_hms_integration.py | 10 +-
tests/metadata/test_metadata_query_statements.py | 4 +-
tests/metadata/test_recover_partitions.py | 3 +-
tests/query_test/test_hdfs_caching.py | 3 +-
tests/query_test/test_insert_behaviour.py | 8 +-
tests/query_test/test_kudu.py | 36 +++-
tests/query_test/test_queries.py | 3 +-
tests/query_test/test_udfs.py | 6 +-
35 files changed, 633 insertions(+), 227 deletions(-)
create mode 100644 testdata/cluster/node_templates/common/etc/hadoop/conf/capacity-scheduler.xml
create mode 100644 testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py
delete mode 100644 testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.tmpl
[impala] 02/03: IMPALA-8516. Update maven for Jenkins builds
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 327b93821453f6abda2cb2d4437bba10946acc00
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue May 7 11:09:17 2019 -0700
IMPALA-8516. Update maven for Jenkins builds
This changes the Jenkins builds to download and install Maven on both
Ubuntu and Redhat (previously only Redhat downloaded it; Ubuntu used the
OS-packaged version).
The version number is kept at 3.5.4 even though a newer release is
available upstream. The newer release cannot build Impala because of an
XML-parsing bug that prevents it from resolving the parquet pom [1].
This should hopefully address some of the hang issues we've seen
previously with the older version of Maven that is packaged with
Ubuntu 16.04.
[1] https://github.com/codehaus-plexus/plexus-utils/issues/65
Change-Id: I793409eb4e9f4533b75bfe089a497c0ea62ad1ff
Reviewed-on: http://gerrit.cloudera.org:8080/13268
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Todd Lipcon <to...@apache.org>
---
bin/bootstrap_build.sh | 11 ++++++++++-
bin/bootstrap_system.sh | 18 ++++++++++++------
2 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/bin/bootstrap_build.sh b/bin/bootstrap_build.sh
index 82f1f37..b94a026 100755
--- a/bin/bootstrap_build.sh
+++ b/bin/bootstrap_build.sh
@@ -31,7 +31,7 @@ set -euxo pipefail
# Install non-java dependencies:
sudo apt-get update
-sudo apt-get --yes install g++ gcc git libsasl2-dev libssl-dev make maven \
+sudo apt-get --yes install g++ gcc git libsasl2-dev libssl-dev make \
python-dev python-setuptools libffi-dev libkrb5-dev
@@ -45,4 +45,13 @@ fi
sudo apt-get --yes install openjdk-${JDK_VERSION}-jdk openjdk-${JDK_VERSION}-source
export JAVA_HOME=/usr/lib/jvm/java-${JDK_VERSION}-openjdk-amd64
+# Download Maven since the packaged version is pretty old.
+if [ ! -d /usr/local/apache-maven-3.5.4 ]; then
+ sudo wget -nv \
+ https://www-us.apache.org/dist/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz
+ sha512sum -c - <<< '2a803f578f341e164f6753e410413d16ab60fabe31dc491d1fe35c984a5cce696bc71f57757d4538fe7738be04065a216f3ebad4ef7e0ce1bb4c51bc36d6be86 apache-maven-3.5.4-bin.tar.gz'
+ sudo tar -C /usr/local -xzf apache-maven-3.5.4-bin.tar.gz
+ sudo ln -s /usr/local/apache-maven-3.5.4/bin/mvn /usr/local/bin
+fi
+
./buildall.sh -notests -so
diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index af43a26..d24aed4 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -189,7 +189,7 @@ echo ">>> Installing build tools"
ubuntu apt-get update
ubuntu apt-get --yes install ccache g++ gcc libffi-dev liblzo2-dev libkrb5-dev \
krb5-admin-server krb5-kdc krb5-user libsasl2-dev libsasl2-modules \
- libsasl2-modules-gssapi-mit libssl-dev make maven ninja-build ntp \
+ libsasl2-modules-gssapi-mit libssl-dev make ninja-build ntp \
ntpdate python-dev python-setuptools postgresql ssh wget vim-common psmisc \
lsof openjdk-8-jdk openjdk-8-source openjdk-8-dbg apt-utils git ant
@@ -235,17 +235,23 @@ redhat sudo yum install -y ccache
# Clean up yum caches
redhat sudo yum clean all
-# Download ant and mvn for centos
+# Download ant for centos
redhat sudo wget -nv \
- https://www-us.apache.org/dist/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz \
https://www-us.apache.org/dist/ant/binaries/apache-ant-1.9.13-bin.tar.gz
-redhat sha512sum -c - <<< '2a803f578f341e164f6753e410413d16ab60fabe31dc491d1fe35c984a5cce696bc71f57757d4538fe7738be04065a216f3ebad4ef7e0ce1bb4c51bc36d6be86 apache-maven-3.5.4-bin.tar.gz'
redhat sha512sum -c - <<< 'c8321aa223f70d7e64d3d0274263000cfffb46fbea61488534e26f9f0245d99e9872d0888e35cd3274416392a13f80c748c07750caaeffa5f9cae1220020715f apache-ant-1.9.13-bin.tar.gz'
-redhat sudo tar -C /usr/local -xzf apache-maven-3.5.4-bin.tar.gz
redhat sudo tar -C /usr/local -xzf apache-ant-1.9.13-bin.tar.gz
-redhat sudo ln -s /usr/local/apache-maven-3.5.4/bin/mvn /usr/local/bin
redhat sudo ln -s /usr/local/apache-ant-1.9.13/bin/ant /usr/local/bin
+# Download maven for all OSes, since the OS-packaged version can be
+# pretty old.
+if [ ! -d /usr/local/apache-maven-3.5.4 ]; then
+ sudo wget -nv \
+ https://www-us.apache.org/dist/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz
+ sha512sum -c - <<< '2a803f578f341e164f6753e410413d16ab60fabe31dc491d1fe35c984a5cce696bc71f57757d4538fe7738be04065a216f3ebad4ef7e0ce1bb4c51bc36d6be86 apache-maven-3.5.4-bin.tar.gz'
+ sudo tar -C /usr/local -xzf apache-maven-3.5.4-bin.tar.gz
+ sudo ln -s /usr/local/apache-maven-3.5.4/bin/mvn /usr/local/bin
+fi
+
if ! { service --status-all | grep -E '^ \[ \+ \] ssh$'; }
then
ubuntu sudo service ssh start
[impala] 03/03: IMPALA-8369 (part 2): Hive 3: switch to Tez-on-YARN
execution
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 17daa6efb9c3c5c6fbd0908f2176b99d8498a250
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Thu May 2 14:25:57 2019 -0700
IMPALA-8369 (part 2): Hive 3: switch to Tez-on-YARN execution
This switches away from Tez local mode to tez-on-YARN. After spending a
couple of days trying to debug issues with Tez local mode, it seemed
like it was just going to be too much of a lift.
This patch enables starting a Yarn RM and NM when
USE_CDP_HIVE is enabled. It also switches to a new yarn-site.xml with a
minimized set of configurations, generated by the new python templating.
In order for everything to work properly I also had to update the Hadoop
dependency to come from CDP instead of CDH when using CDP Hive.
Otherwise, the classpath of the launched Tez containers had conflicting
versions of various Hadoop classes which caused tasks to fail.
I verified that this fixes concurrent query execution by running queries
in parallel in two beeline sessions. With local mode, these queries
would periodically fail due to various races (HIVE-21682). I'm also able
to get farther along in data loading.
Change-Id: If96064f271582b2790a3cfb3d135f3834d46c41d
Reviewed-on: http://gerrit.cloudera.org:8080/13224
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Todd Lipcon <to...@apache.org>
---
bin/bootstrap_toolchain.py | 7 +-
bin/create-test-configuration.sh | 11 +-
bin/generate_xml_config.py | 8 +-
bin/impala-config.sh | 10 +-
bin/jenkins/critique-gerrit-review.py | 2 +-
fe/pom.xml | 26 ++-
.../apache/impala/analysis/CopyTestCaseStmt.java | 2 +-
fe/src/test/resources/hive-site.xml.py | 5 +-
shaded-deps/pom.xml | 1 -
testdata/cluster/admin | 16 ++
.../common/etc/hadoop/conf/capacity-scheduler.xml | 223 +++++++++++++++++++++
.../common/etc/hadoop/conf/yarn-site.xml.py | 97 +++++++++
.../common/etc/hadoop/conf/yarn-site.xml.tmpl | 154 --------------
13 files changed, 389 insertions(+), 173 deletions(-)
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 07a646d..34547fe 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -553,10 +553,10 @@ if __name__ == "__main__":
toolchain_host = os.environ["IMPALA_TOOLCHAIN_HOST"]
cdh_build_number = os.environ["CDH_BUILD_NUMBER"]
- cdh_components = map(Package, ["hadoop", "hbase", "sentry"])
+ cdh_components = map(Package, ["hbase", "sentry"])
use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
if not use_cdp_hive:
- cdh_components += [Package("hive")]
+ cdh_components += [Package("hive"), Package("hadoop")]
if use_cdh_kudu:
if not try_get_platform_release_label() or not try_get_platform_release_label().cdh:
@@ -580,12 +580,13 @@ if __name__ == "__main__":
cdp_components = [
CdpComponent("ranger-{0}-admin".format(os.environ.get("IMPALA_RANGER_VERSION"))),
]
- use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
if use_cdp_hive:
hive_version = os.environ.get("IMPALA_HIVE_VERSION")
cdp_components.append(CdpComponent("hive-{0}-source".format(hive_version),
pkg_directory="hive-{0}".format(hive_version))),
cdp_components.append(CdpComponent("apache-hive-{0}-bin".format(hive_version))),
+ cdp_components.append(CdpComponent("hadoop-{0}"
+ .format(os.environ.get("IMPALA_HADOOP_VERSION")))),
cdp_components.append(CdpComponent(
"tez-{0}-minimal".format(os.environ.get("IMPALA_TEZ_VERSION")),
makedir=True))
diff --git a/bin/create-test-configuration.sh b/bin/create-test-configuration.sh
index 208d4f8..8d08562 100755
--- a/bin/create-test-configuration.sh
+++ b/bin/create-test-configuration.sh
@@ -174,12 +174,12 @@ if [ $CREATE_RANGER_POLICY_DB -eq 1 ]; then
popd
fi
-echo "Linking core-site.xml from local cluster"
+echo "Linking common conf files from local cluster:"
CLUSTER_HADOOP_CONF_DIR=$(${CLUSTER_DIR}/admin get_hadoop_client_conf_dir)
-ln -s ${CLUSTER_HADOOP_CONF_DIR}/core-site.xml
-
-echo "Linking hdfs-site.xml from local cluster"
-ln -s ${CLUSTER_HADOOP_CONF_DIR}/hdfs-site.xml
+for file in core-site.xml hdfs-site.xml yarn-site.xml ; do
+ echo ... $file
+ ln -s ${CLUSTER_HADOOP_CONF_DIR}/$file
+done
if ${CLUSTER_DIR}/admin is_kerberized; then
# KERBEROS TODO: Without this, the yarn daemons can see these
@@ -190,7 +190,6 @@ if ${CLUSTER_DIR}/admin is_kerberized; then
# kerberos principals. Obviously this has to be sorted out before
# a kerberized cluster can load data.
echo "Linking yarn and mapred from local cluster"
- ln -s ${CLUSTER_HADOOP_CONF_DIR}/yarn-site.xml
ln -s ${CLUSTER_HADOOP_CONF_DIR}/mapred-site.xml
fi
diff --git a/bin/generate_xml_config.py b/bin/generate_xml_config.py
index a06da7e..18e3615 100755
--- a/bin/generate_xml_config.py
+++ b/bin/generate_xml_config.py
@@ -80,6 +80,8 @@ def dump_config(d, source_path, out):
print >>out, dedent(header)
for k, v in sorted(d.iteritems()):
try:
+ if isinstance(v, int):
+ v = str(v)
v = _substitute_env_vars(v)
except KeyError, e:
raise Exception("failed environment variable substitution for value {k}: {e}"
@@ -98,7 +100,11 @@ def main():
sys.exit(1)
_, in_path, out_path = sys.argv
- mod = imp.load_source('template', in_path)
+ try:
+ mod = imp.load_source('template', in_path)
+ except: # noqa
+ print >>sys.stderr, "Unable to load template: %s" % in_path
+ raise
conf = mod.__dict__.get('CONFIG')
if not isinstance(conf, dict):
raise Exception("module in '{path}' should define a dict named CONFIG"
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index cc8cfef..68de5e4 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -162,7 +162,8 @@ export IMPALA_TOOLCHAIN_HOST
export CDH_MAJOR_VERSION=6
export CDH_BUILD_NUMBER=1055188
export CDP_BUILD_NUMBER=1056671
-export IMPALA_HADOOP_VERSION=3.0.0-cdh6.x-SNAPSHOT
+export CDH_HADOOP_VERSION=3.0.0-cdh6.x-SNAPSHOT
+export CDP_HADOOP_VERSION=3.1.1.6.0.99.0-147
export IMPALA_HBASE_VERSION=2.1.0-cdh6.x-SNAPSHOT
export IMPALA_SENTRY_VERSION=2.1.0-cdh6.x-SNAPSHOT
export IMPALA_RANGER_VERSION=1.2.0.6.0.99.0-147
@@ -200,10 +201,14 @@ if $USE_CDP_HIVE; then
# the minicluster
export IMPALA_HIVE_VERSION=${CDP_HIVE_VERSION}
export IMPALA_TEZ_VERSION=0.9.1.6.0.99.0-147
+ export IMPALA_HADOOP_VERSION=${CDP_HADOOP_VERSION}
+ export HADOOP_HOME="$CDP_COMPONENTS_HOME/hadoop-${CDP_HADOOP_VERSION}/"
else
# CDH hive version is used to build and deploy in minicluster when USE_CDP_HIVE is
# false
export IMPALA_HIVE_VERSION=${CDH_HIVE_VERSION}
+ export IMPALA_HADOOP_VERSION=${CDH_HADOOP_VERSION}
+ export HADOOP_HOME="$CDH_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/"
fi
# Extract the first component of the hive version.
# Allow overriding of Hive source location in case we want to build Impala without
@@ -510,9 +515,6 @@ export IMPALA_COMMON_DIR="$IMPALA_HOME/common"
export PATH="$IMPALA_TOOLCHAIN/gdb-$IMPALA_GDB_VERSION/bin:$PATH"
export PATH="$IMPALA_HOME/bin:$IMPALA_TOOLCHAIN/cmake-$IMPALA_CMAKE_VERSION/bin/:$PATH"
-# Typically we build against a snapshot build of Hadoop that includes everything we need
-# for building Impala and running a minicluster.
-export HADOOP_HOME="$CDH_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/"
export HADOOP_CONF_DIR="$IMPALA_FE_DIR/src/test/resources"
# The include and lib paths are needed to pick up hdfs.h and libhdfs.*
# Allow overriding in case we want to point to a package/install with a different layout.
diff --git a/bin/jenkins/critique-gerrit-review.py b/bin/jenkins/critique-gerrit-review.py
index 5048a1d..c2bfdb7 100755
--- a/bin/jenkins/critique-gerrit-review.py
+++ b/bin/jenkins/critique-gerrit-review.py
@@ -69,7 +69,7 @@ EXCLUDE_FILE_PATTERNS = [
re.compile(r".*/catalog/BuiltinsDb.java"), # Many long strings.
re.compile(r".*/codegen/gen_ir_descriptions.py"), # Many long strings.
re.compile(r".*shell/ext-py/.*"), # Third-party code.
- re.compile(r".*/fe/src/test/resources/.*.py") # Long lines in config files.
+ re.compile(r".*/.*\.xml\.py") # Long lines in config template files.
]
diff --git a/fe/pom.xml b/fe/pom.xml
index 43701b4..ecd79d3 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -53,7 +53,11 @@ under the License.
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
@@ -178,6 +182,10 @@ under the License.
<groupId>org.apache.hive</groupId>
<artifactId>*</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+ </exclusion>
</exclusions>
</dependency>
@@ -993,6 +1001,22 @@ under the License.
<groupId>org.apache.hive</groupId>
<artifactId>hive-shims</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <!-- needed for JobConf, which HiveConf inherits from -->
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-core</artifactId>
+ <version>${hadoop.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
</exclusions>
</dependency>
</dependencies>
diff --git a/fe/src/main/java/org/apache/impala/analysis/CopyTestCaseStmt.java b/fe/src/main/java/org/apache/impala/analysis/CopyTestCaseStmt.java
index 5023963..d961f95 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CopyTestCaseStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CopyTestCaseStmt.java
@@ -17,7 +17,7 @@
package org.apache.impala.analysis;
-import avro.shaded.com.google.common.collect.Sets;
+import com.google.common.collect.Sets;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.fs.FSDataOutputStream;
diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index 18e0011..66e62a7 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -81,7 +81,10 @@ if hive_major_version >= 3:
CONFIG.update({
'hive.tez.container.size': '512',
'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
- 'tez.local.mode': 'true'})
+ # We run YARN with Tez on the classpath directly
+ 'tez.ignore.lib.uris': 'true',
+ 'tez.use.cluster.hadoop-libs': 'true',
+ })
else:
CONFIG.update({
# TODO(vihang) Disabled for HMS3.
diff --git a/shaded-deps/pom.xml b/shaded-deps/pom.xml
index 6aad3c5..579758e 100644
--- a/shaded-deps/pom.xml
+++ b/shaded-deps/pom.xml
@@ -51,7 +51,6 @@ the same dependencies
<artifactSet>
<includes>
<include>org.apache.hive:hive-exec</include>
- <include>org.apache.hadoop:hadoop-mapreduce-client</include>
</includes>
</artifactSet>
<relocations>
diff --git a/testdata/cluster/admin b/testdata/cluster/admin
index acc44a5..9eafd8c 100755
--- a/testdata/cluster/admin
+++ b/testdata/cluster/admin
@@ -34,6 +34,11 @@ setup_report_build_error
: ${IMPALA_KERBERIZE=}
: ${INCLUDE_YARN=}
+# For Hive 3, we require Yarn for Tez support.
+if [[ $USE_CDP_HIVE ]]; then
+ INCLUDE_YARN=1
+fi
+
while getopts vky OPT; do
case $OPT in
v) set -x;;
@@ -54,6 +59,7 @@ NODE_PREFIX=node-
COMMON_NODE_TEMPLATE="$DIR/node_templates/common"
NODE_TEMPLATE="$DIR/node_templates/cdh$CDH_MAJOR_VERSION"
TEMPLATE_SUFFIX=".tmpl"
+PY_TEMPLATE_SUFFIX=".xml.py"
# Each process should be marked with this so a "pkill -f" can be done to nuke everything.
export KILL_CLUSTER_MARKER=IBelongToTheMiniCluster
@@ -237,6 +243,9 @@ function create_cluster {
# Remove master role scripts from slave nodes
rm -f "$NODE_DIR/etc/init.d/"{hdfs-namenode,yarn-resourcemanager} \
"$NODE_DIR/etc/init.d/"{kms,kudu-master}
+ # Only run one YARN nodemanager (more memory-efficient to scale up a
+ # single NM than run several)
+ rm -f "$NODE_DIR/etc/init.d/yarn-nodemanager"
fi
for EMPTY_NODE_DIR in $EMPTY_NODE_DIRS; do
mkdir -p "$NODE_DIR/$EMPTY_NODE_DIR"
@@ -302,6 +311,13 @@ function create_cluster {
fi
rm "$TEMPLATE_PATH" "$ACTUAL_PATH.1"
done
+ # Substitute python-templated XML files.
+ # TODO(todd): move over all the XML templates to be Python-based.
+ for TEMPLATE_PATH in $(find "$NODE_DIR" -name "*$PY_TEMPLATE_SUFFIX"); do
+ ACTUAL_PATH="${TEMPLATE_PATH%$PY_TEMPLATE_SUFFIX}".xml
+ $IMPALA_HOME/bin/generate_xml_config.py $TEMPLATE_PATH $ACTUAL_PATH
+ rm $TEMPLATE_PATH
+ done
done
}
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/capacity-scheduler.xml b/testdata/cluster/node_templates/common/etc/hadoop/conf/capacity-scheduler.xml
new file mode 100644
index 0000000..80d4ed1
--- /dev/null
+++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/capacity-scheduler.xml
@@ -0,0 +1,223 @@
+<!--
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+
+ NOTE: this is the default capacity-scheduler.xml that ships with
+ YARN. No Impala-specific modifications have been made.
+-->
+<configuration>
+
+ <property>
+ <name>yarn.scheduler.capacity.maximum-applications</name>
+ <value>10000</value>
+ <description>
+ Maximum number of applications that can be pending and running.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
+ <value>0.1</value>
+ <description>
+ Maximum percent of resources in the cluster which can be used to run
+ application masters i.e. controls number of concurrent running
+ applications.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.resource-calculator</name>
+ <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
+ <description>
+ The ResourceCalculator implementation to be used to compare
+ Resources in the scheduler.
+ The default i.e. DefaultResourceCalculator only uses Memory while
+ DominantResourceCalculator uses dominant-resource to compare
+ multi-dimensional resources such as Memory, CPU etc.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.queues</name>
+ <value>default</value>
+ <description>
+ The queues at the this level (root is the root queue).
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.capacity</name>
+ <value>100</value>
+ <description>Default queue target capacity.</description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
+ <value>1</value>
+ <description>
+ Default queue user limit a percentage from 0.0 to 1.0.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
+ <value>100</value>
+ <description>
+ The maximum capacity of the default queue.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.state</name>
+ <value>RUNNING</value>
+ <description>
+ The state of the default queue. State can be one of RUNNING or STOPPED.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
+ <value>*</value>
+ <description>
+ The ACL of who can submit jobs to the default queue.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
+ <value>*</value>
+ <description>
+ The ACL of who can administer jobs on the default queue.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name>
+ <value>*</value>
+ <description>
+ The ACL of who can submit applications with configured priority.
+ For e.g, [user={name} group={name} max_priority={priority} default_priority={priority}]
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.maximum-application-lifetime
+ </name>
+ <value>-1</value>
+ <description>
+ Maximum lifetime of an application which is submitted to a queue
+ in seconds. Any value less than or equal to zero will be considered as
+ disabled.
+ This will be a hard time limit for all applications in this
+ queue. If positive value is configured then any application submitted
+ to this queue will be killed after exceeds the configured lifetime.
+ User can also specify lifetime per application basis in
+ application submission context. But user lifetime will be
+ overridden if it exceeds queue maximum lifetime. It is point-in-time
+ configuration.
+ Note : Configuring too low value will result in killing application
+ sooner. This feature is applicable only for leaf queue.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.root.default.default-application-lifetime
+ </name>
+ <value>-1</value>
+ <description>
+ Default lifetime of an application which is submitted to a queue
+ in seconds. Any value less than or equal to zero will be considered as
+ disabled.
+ If the user has not submitted application with lifetime value then this
+ value will be taken. It is point-in-time configuration.
+ Note : Default lifetime can't exceed maximum lifetime. This feature is
+ applicable only for leaf queue.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.node-locality-delay</name>
+ <value>40</value>
+ <description>
+ Number of missed scheduling opportunities after which the CapacityScheduler
+ attempts to schedule rack-local containers.
+ When setting this parameter, the size of the cluster should be taken into account.
+ We use 40 as the default value, which is approximately the number of nodes in one rack.
+ Note, if this value is -1, the locality constraint in the container request
+ will be ignored, which disables the delay scheduling.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
+ <value>-1</value>
+ <description>
+ Number of additional missed scheduling opportunities over the node-locality-delay
+ ones, after which the CapacityScheduler attempts to schedule off-switch containers,
+ instead of rack-local ones.
+ Example: with node-locality-delay=40 and rack-locality-delay=20, the scheduler will
+ attempt rack-local assignments after 40 missed opportunities, and off-switch assignments
+ after 40+20=60 missed opportunities.
+ When setting this parameter, the size of the cluster should be taken into account.
+ We use -1 as the default value, which disables this feature. In this case, the number
+ of missed opportunities for assigning off-switch containers is calculated based on
+ the number of containers and unique locations specified in the resource request,
+ as well as the size of the cluster.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.queue-mappings</name>
+ <value></value>
+ <description>
+ A list of mappings that will be used to assign jobs to queues
+ The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
+ Typically this list will be used to map users to queues,
+ for example, u:%user:%user maps all users to queues with the same name
+ as the user.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.queue-mappings-override.enable</name>
+ <value>false</value>
+ <description>
+ If a queue mapping is present, will it override the value specified
+ by the user? This can be used by administrators to place jobs in queues
+ that are different than the one specified by the user.
+ The default is false.
+ </description>
+ </property>
+
+ <property>
+ <name>yarn.scheduler.capacity.per-node-heartbeat.maximum-offswitch-assignments</name>
+ <value>1</value>
+ <description>
+ Controls the number of OFF_SWITCH assignments allowed
+ during a node's heartbeat. Increasing this value can improve
+ scheduling rate for OFF_SWITCH containers. Lower values reduce
+ "clumping" of applications on particular nodes. The default is 1.
+ Legal values are 1-MAX_INT. This config is refreshable.
+ </description>
+ </property>
+
+
+ <property>
+ <name>yarn.scheduler.capacity.application.fail-fast</name>
+ <value>false</value>
+ <description>
+ Whether RM should fail during recovery if previous applications'
+ queue is no longer valid.
+ </description>
+ </property>
+
+</configuration>
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py
new file mode 100644
index 0000000..305feb3
--- /dev/null
+++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+
+kerberize = os.environ.get('IMPALA_KERBERIZE') == '1'
+hive_major_version = int(os.environ['IMPALA_HIVE_VERSION'][0])
+
+
+def _get_system_ram_mb():
+ lines = file("/proc/meminfo").readlines()
+ memtotal_line = [l for l in lines if l.startswith('MemTotal')][0]
+ mem_kb = int(memtotal_line.split()[1])
+ return mem_kb / 1024
+
+
+def _get_yarn_nm_ram_mb():
+ sys_ram = _get_system_ram_mb()
+ # Fit into the following envelope:
+ # - need 4GB at a bare minimum
+ # - leave at least 24G for other services
+ # - don't need more than 48G
+ ret = min(max(sys_ram - 24 * 1024, 4096), 48 * 1024)
+ print >>sys.stderr, "Configuring Yarn NM to use {0}MB RAM".format(ret)
+ return ret
+
+
+CONFIG = {
+ # Host/port configs
+ 'yarn.resourcemanager.webapp.address': '${EXTERNAL_LISTEN_HOST}:${YARN_WEBUI_PORT}',
+ 'yarn.nodemanager.address': '${INTERNAL_LISTEN_HOST}:${NODEMANAGER_PORT}',
+ 'yarn.nodemanager.localizer.address': '${INTERNAL_LISTEN_HOST}:${NODEMANAGER_LOCALIZER_PORT}',
+ 'yarn.nodemanager.webapp.address': '${INTERNAL_LISTEN_HOST}:${NODEMANAGER_WEBUI_PORT}',
+
+ # Directories
+ 'yarn.nodemanager.local-dirs': '${NODE_DIR}/var/lib/hadoop-yarn/cache/${USER}/nm-local-dir',
+ 'yarn.nodemanager.log-dirs': '${NODE_DIR}/var/log/hadoop-yarn/containers',
+
+ # Enable the MR shuffle service, which is also used by Tez.
+ 'yarn.nodemanager.aux-services': 'mapreduce_shuffle',
+ 'yarn.nodemanager.aux-services.mapreduce_shuffle.class': 'org.apache.hadoop.mapred.ShuffleHandler',
+ # Disable vmem checking, since vmem is essentially free, and tasks
+ # fail with vmem limit errors otherwise.
+ 'yarn.nodemanager.vmem-check-enabled': 'false',
+
+ # Size the NM's memory from the RAM available on this machine (between 4GB
+ # and 48GB, leaving 24GB for other services where possible; see
+ # _get_yarn_nm_ram_mb() above). A larger NM speeds up data-loading.
+ 'yarn.nodemanager.resource.memory-mb': _get_yarn_nm_ram_mb()
+}
+
+app_classpath = [
+ # Default classpath as provided by Hadoop: these environment variables are not
+ # expanded by our config templating, but rather evaluated and expanded by
+ # YARN itself, in a context where the various _HOMEs have been defined.
+ '$HADOOP_CONF_DIR',
+ '$HADOOP_COMMON_HOME/share/hadoop/common/*',
+ '$HADOOP_COMMON_HOME/share/hadoop/common/lib/*',
+ '$HADOOP_HDFS_HOME/share/hadoop/hdfs/*',
+ '$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*',
+ '$HADOOP_YARN_HOME/share/hadoop/yarn/*',
+ '$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*',
+ # Append the LZO jar for LZO-compressed file support.
+ '${LZO_JAR_PATH}']
+
+# Hive 3 needs Tez on the classpath.
+if hive_major_version == 3:
+ app_classpath += [
+ '${TEZ_HOME}/*',
+ '${TEZ_HOME}/lib/*']
+
+CONFIG['yarn.application.classpath'] = ",".join(app_classpath)
+
+if kerberize:
+ CONFIG.update({
+ 'yarn.resourcemanager.keytab': '${KRB5_KTNAME}',
+ 'yarn.resourcemanager.principal': '${MINIKDC_PRINC_USER}',
+ 'yarn.nodemanager.keytab': '${KRB5_KTNAME}',
+ 'yarn.nodemanager.principal': '${MINIKDC_PRINC_USER}',
+ })
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.tmpl b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.tmpl
deleted file mode 100644
index 036a21c..0000000
--- a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.tmpl
+++ /dev/null
@@ -1,154 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-
-<!-- TODO: Remove any Llama-related configuration. Can this file be removed entirely? -->
-<configuration>
- <property>
- <name>yarn.resourcemanager.webapp.address</name>
- <value>${EXTERNAL_LISTEN_HOST}:${YARN_WEBUI_PORT}</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.address</name>
- <value>${INTERNAL_LISTEN_HOST}:${NODEMANAGER_PORT}</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.delete.debug-delay-sec</name>
- <value>600</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.resource.memory-mb</name>
- <value>16384</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.resource.cpu-vcores</name>
- <value>16</value>
- </property>
-
- <property>
- <name>yarn.resourcemanager.nodemanagers.heartbeat-interval-ms</name>
- <value>100</value>
- </property>
-
- <property>
- <name>yarn.scheduler.fair.continuous-scheduling-enabled</name>
- <value>true</value>
- </property>
-
- <property>
- <name>yarn.scheduler.fair.assignmultiple</name>
- <value>true</value>
- </property>
-
- <property>
- <name>yarn.resourcemanager.scheduler.class</name>
- <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.localizer.address</name>
- <value>${INTERNAL_LISTEN_HOST}:${NODEMANAGER_LOCALIZER_PORT}</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.webapp.address</name>
- <value>${INTERNAL_LISTEN_HOST}:${NODEMANAGER_WEBUI_PORT}</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.local-dirs</name>
- <value>${NODE_DIR}/data/yarn/local</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.log-dirs</name>
- <value>${NODE_DIR}/data/yarn/logs</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.aux-services</name>
- <value>mapreduce_shuffle</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
- <value>org.apache.hadoop.mapred.ShuffleHandler</value>
- </property>
-
- <property>
- <name>yarn.log-aggregation-enable</name>
- <value>true</value>
- </property>
-
- <property>
- <description>List of directories to store localized files in.</description>
- <name>yarn.nodemanager.local-dirs</name>
- <value>${NODE_DIR}/var/lib/hadoop-yarn/cache/${user.name}/nm-local-dir</value>
- </property>
-
- <property>
- <description>Where to store container logs.</description>
- <name>yarn.nodemanager.log-dirs</name>
- <value>${NODE_DIR}/var/log/hadoop-yarn/containers</value>
- </property>
-
- <property>
- <description>Where to aggregate logs to.</description>
- <name>yarn.nodemanager.remote-app-log-dir</name>
- <value>${NODE_DIR}/var/log/hadoop-yarn/apps</value>
- </property>
-
- <property>
- <description>Classpath for typical applications.</description>
- <name>yarn.application.classpath</name>
- <value>
- ${HADOOP_CONF_DIR},
- ${HADOOP_HOME}/share/hadoop/tools/lib/*,
- ${HADOOP_HOME}/share/hadoop/common/*,
- ${HADOOP_HOME}/share/hadoop/common/lib/*,
- ${HADOOP_HOME}/share/hadoop/hdfs/*,
- ${HADOOP_HOME}/share/hdfs/common/lib/*,
- ${HADOOP_HOME}/share/hadoop/mapreduce/*,
- ${HADOOP_HOME}/share/hadoop/mapreduce/lib/*,
- ${HADOOP_HOME}/share/hadoop/yarn/*,
- ${HADOOP_HOME}/share/hadoop/yarn/lib/*,
- ${LZO_JAR_PATH}
- </value>
- </property>
-
- <!-- BEGIN Kerberos settings -->
-
- <!-- KERBEROS TODO: Add these to yarn.application.classpath.
- ${IMPALA_FE_DIR}/target/*,${HADOOP_LZO}/build/*,
- ${IMPALA_FE_DIR}/target/dependency/* -->
-
- <!-- ResourceManager security configs -->
- <property>
- <name>yarn.resourcemanager.keytab</name>
- <value>${KRB5_KTNAME}</value>
- </property>
-
- <property>
- <name>yarn.resourcemanager.principal</name>
- <value>${MINIKDC_PRINC_USER}</value>
- <!-- Sort of horrible: instead of the yarn principle, we'll use ${USER}
- so that we don't have a problem with file system permissions. -->
- </property>
-
- <!-- NodeManager security configs -->
- <property>
- <name>yarn.nodemanager.keytab</name>
- <value>${KRB5_KTNAME}</value>
- </property>
-
- <property>
- <name>yarn.nodemanager.principal</name>
- <value>${MINIKDC_PRINC_USER}</value>
- <!-- Also sort of horrible as per above -->
- </property>
- <!-- END Kerberos settings -->
-
-</configuration>
[impala] 01/03: IMPALA-8121: part 2: use local catalog in containers
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit a2c5d953b0fa6d69aa34eccaa13fa9aacd31ad10
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Apr 25 11:13:16 2019 -0700
IMPALA-8121: part 2: use local catalog in containers
This enables "modern" catalog features including the
local catalog and HMS notification support in the
dockerised minicluster by default.
The flags can be overridden if needed.
Skip tests affected by these bugs:
* IMPALA-8486 (LibCache invalidations)
* IMPALA-8458 (alter column stats)
* IMPALA-7131 (data sources not supported)
* IMPALA-7538 (HDFS caching DDL not supported)
* IMPALA-8489 TestRecoverPartitions.test_post_invalidate fails with
IllegalStateException
* IMPALA-8459 (cannot drop Kudu table)
* IMPALA-7539 (insert permission checks)
Fix handling of table properties in _get_properties()
to avoid including properties from unrelated sections.
This caused problems because of additional properties
added by metastore event processing.
Rewrite test_partition_ddl_predicates() to change file formats rather
than use HDFS caching DDL.
Update the various test_kudu_col* tests to not expect staleness of
Kudu metadata for catalog V2.
Fix IMPALA-8464 so that testMetaDataGetColumnComments() allows the
table comment to be present, which is the new behaviour. Add a
new end-to-end test test_get_tables() that tests the precise
behaviour for different catalog versions so as to not lose
coverage.
Change-Id: I900d4b718cca98bcf86d36a2e64c0b6a424a5b7c
Reviewed-on: http://gerrit.cloudera.org:8080/13226
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
docker/catalogd/Dockerfile | 3 +-
docker/coord_exec/Dockerfile | 3 +-
docker/coordinator/Dockerfile | 2 +-
.../java/org/apache/impala/service/JdbcTest.java | 12 +++--
.../partition-ddl-predicates-hdfs-only.test | 12 ++---
tests/common/impala_connection.py | 1 -
tests/common/impala_test_suite.py | 20 +++++++-
tests/common/skip.py | 50 +++++++++++++++++++
tests/hs2/test_hs2.py | 57 ++++++++++++++++++++++
tests/metadata/test_ddl.py | 18 ++++---
tests/metadata/test_ddl_base.py | 14 ++++--
tests/metadata/test_hdfs_permissions.py | 4 +-
tests/metadata/test_hms_integration.py | 10 ++--
tests/metadata/test_metadata_query_statements.py | 4 +-
tests/metadata/test_recover_partitions.py | 3 +-
tests/query_test/test_hdfs_caching.py | 3 +-
tests/query_test/test_insert_behaviour.py | 8 ++-
tests/query_test/test_kudu.py | 36 ++++++++++----
tests/query_test/test_queries.py | 3 +-
tests/query_test/test_udfs.py | 6 ++-
20 files changed, 222 insertions(+), 47 deletions(-)
diff --git a/docker/catalogd/Dockerfile b/docker/catalogd/Dockerfile
index 8699fb3..05e555d 100644
--- a/docker/catalogd/Dockerfile
+++ b/docker/catalogd/Dockerfile
@@ -22,4 +22,5 @@ FROM impala_base
EXPOSE 25020
ENTRYPOINT ["/opt/impala/bin/daemon_entrypoint.sh", "/opt/impala/bin/catalogd",\
- "-abort_on_config_error=false", "-state_store_host=statestored"]
+ "-abort_on_config_error=false", "-state_store_host=statestored",\
+ "-catalog_topic_mode=minimal", "-hms_event_polling_interval_s=1"]
diff --git a/docker/coord_exec/Dockerfile b/docker/coord_exec/Dockerfile
index 6e6fa09..11356bc 100644
--- a/docker/coord_exec/Dockerfile
+++ b/docker/coord_exec/Dockerfile
@@ -27,4 +27,5 @@ EXPOSE 25000
ENTRYPOINT ["/opt/impala/bin/daemon_entrypoint.sh", "/opt/impala/bin/impalad",\
"-abort_on_config_error=false", "-state_store_host=statestored",\
- "-catalog_service_host=catalogd", "-mem_limit_includes_jvm=true"]
+ "-catalog_service_host=catalogd", "-mem_limit_includes_jvm=true",\
+ "-use_local_catalog=true"]
diff --git a/docker/coordinator/Dockerfile b/docker/coordinator/Dockerfile
index def6da3..cf86f2e 100644
--- a/docker/coordinator/Dockerfile
+++ b/docker/coordinator/Dockerfile
@@ -28,4 +28,4 @@ EXPOSE 25000
ENTRYPOINT ["/opt/impala/bin/daemon_entrypoint.sh", "/opt/impala/bin/impalad",\
"-abort_on_config_error=false", "-state_store_host=statestored",\
"-catalog_service_host=catalogd", "-is_executor=false", \
- "-mem_limit_includes_jvm=true"]
+ "-mem_limit_includes_jvm=true", "-use_local_catalog=true"]
diff --git a/fe/src/test/java/org/apache/impala/service/JdbcTest.java b/fe/src/test/java/org/apache/impala/service/JdbcTest.java
index 820373d..1749fbe 100644
--- a/fe/src/test/java/org/apache/impala/service/JdbcTest.java
+++ b/fe/src/test/java/org/apache/impala/service/JdbcTest.java
@@ -473,15 +473,19 @@ public class JdbcTest {
addTestTable("create table default.jdbc_column_comments_test (" +
"a int comment 'column comment') comment 'table comment'");
- // If a table is not yet loaded before getTables(), then the 'remarks' field
- // is left empty. getColumns() loads the table metadata, so later getTables()
- // calls will return 'remarks' correctly.
ResultSet rs = con_.getMetaData().getTables(
null, "default", "jdbc_column_comments_test", null);
assertTrue(rs.next());
assertEquals("Incorrect table name", "jdbc_column_comments_test",
rs.getString("TABLE_NAME"));
- assertEquals("Incorrect table comment", "", rs.getString("REMARKS"));
+
+ String remarks = rs.getString("REMARKS");
+ // IMPALA-7587: with catalog V2, if a table is not yet loaded before
+ // getTables(), then the 'remarks' field is left empty. getColumns()
+ // loads the table metadata, so later getTables() calls will return
+ // 'remarks' correctly.
+ assertTrue("Incorrect table comment: " + remarks,
+ remarks.equals("") || remarks.equals("table comment"));
rs = con_.getMetaData().getColumns(
null, "default", "jdbc_column_comments_test", null);
diff --git a/testdata/workloads/functional-query/queries/QueryTest/partition-ddl-predicates-hdfs-only.test b/testdata/workloads/functional-query/queries/QueryTest/partition-ddl-predicates-hdfs-only.test
index 5ca2a06..1a166b7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/partition-ddl-predicates-hdfs-only.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/partition-ddl-predicates-hdfs-only.test
@@ -16,23 +16,23 @@ alter table p1_hdfs add partition (j=NULL,k=NULL);
insert into p1_hdfs partition (j, k) values (100, 1, "a"), (200, 1, "b"), (300, 1, "c");
====
---- QUERY
-alter table p1_hdfs partition (j<2, k in ("b", "c")) set cached in 'testPool'
+alter table p1_hdfs partition (j<2, k in ("b", "c")) set fileformat parquet
---- RESULTS
-'Cached 2 partition(s).'
+'Updated 2 partition(s).'
---- TYPES
STRING
====
---- QUERY
-alter table p1_hdfs partition (j<2, j>0, k<>"d") set uncached
+alter table p1_hdfs partition (j<2, j>0, k<>"d") set fileformat avro
---- RESULTS
-'Uncached 2 partition(s).'
+'Updated 3 partition(s).'
---- TYPES
STRING
====
---- QUERY
-alter table p1_hdfs partition (j=3 or j=2, k like "%") set uncached
+alter table p1_hdfs partition (j=3 or j=2, k like "%") set fileformat parquet
---- RESULTS
-'Uncached 0 partition(s).'
+'Updated 3 partition(s).'
---- TYPES
STRING
====
diff --git a/tests/common/impala_connection.py b/tests/common/impala_connection.py
index 6f0c0fc..2f6f60c 100644
--- a/tests/common/impala_connection.py
+++ b/tests/common/impala_connection.py
@@ -262,7 +262,6 @@ class ImpylaHS2Connection(ImpalaConnection):
LOG.info("-- connecting to {0} with impyla".format(self.__host_port))
host, port = self.__host_port.split(":")
self.__impyla_conn = impyla.connect(host=host, port=int(port))
- LOG.info("Conn {0}".format(self.__impyla_conn))
# Get the default query options for the session before any modifications are made.
self.__cursor = self.__impyla_conn.cursor()
self.__cursor.execute("set all")
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index 0be8c95..6630b3f 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -919,7 +919,25 @@ class ImpalaTestSuite(BaseTestSuite):
except Exception:
time.sleep(0.2)
continue
- raise Exception("Table {0} didn't show up after {1}s", db_name, timeout_s)
+ raise Exception("DB {0} didn't show up after {1}s", db_name, timeout_s)
+
+ def wait_for_table_to_appear(self, db_name, table_name, timeout_s):
+ """Wait until the table with 'table_name' in 'db_name' is present in the
+ impalad's local catalog. Fail after timeout_s if the table doesn't appear."""
+ start_time = time.time()
+ while time.time() - start_time < timeout_s:
+ try:
+ # This will throw an exception if the table is not present.
+ self.client.execute("describe `{db_name}`.`{table_name}`".format(
+ db_name=db_name, table_name=table_name))
+ return
+ except Exception, ex:
+ print str(ex)
+ time.sleep(0.2)
+ continue
+ raise Exception("Table {0}.{1} didn't show up after {2}s", db_name, table_name,
+ timeout_s)
+
def assert_impalad_log_contains(self, level, line_regex, expected_count=1):
"""
diff --git a/tests/common/skip.py b/tests/common/skip.py
index 3720378..8c2bdfb 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -213,3 +213,53 @@ class SkipIfCatalogV2:
return pytest.mark.skipif(
IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
reason="Test is specific to old implementation of catalog.")
+
+ # TODO: IMPALA-8486: fix invalidation or update tests to reflect expected behaviour.
+ @classmethod
+ def lib_cache_invalidation_broken(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-8486: LibCache isn't invalidated by function DDL.")
+
+ # TODO: IMPALA-8458: fix bug or update tests to reflect expected behaviour.
+ @classmethod
+ def alter_column_stats_broken(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-8458: setting column stats without setting NDV is no-op.")
+
+ # TODO: IMPALA-7131: add support or update tests to reflect expected behaviour.
+ @classmethod
+ def data_sources_unsupported(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-7131: data sources not supported.")
+
+ # TODO: IMPALA-7538: add support or update tests to reflect expected behaviour.
+ @classmethod
+ def hdfs_caching_ddl_unsupported(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-7538: HDFS caching DDL not supported.")
+
+ # TODO: IMPALA-8489: fix this bug.
+ @classmethod
+ def impala_8489(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-8489: TestRecoverPartitions.test_post_invalidate "
+ "IllegalStateException.")
+
+ # TODO: IMPALA-8459: fix this bug.
+ @classmethod
+ def impala_8459(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-8459: some kudu DDL is broken for local catalog")
+
+ # TODO: IMPALA-7539: fix this bug.
+ @classmethod
+ def impala_7539(self):
+ return pytest.mark.skipif(
+ IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
+ reason="IMPALA-7539: support HDFS permission checks for LocalCatalog")
diff --git a/tests/hs2/test_hs2.py b/tests/hs2/test_hs2.py
index 16fc156..cec1d9e 100644
--- a/tests/hs2/test_hs2.py
+++ b/tests/hs2/test_hs2.py
@@ -24,6 +24,7 @@ import time
from urllib2 import urlopen
from ImpalaService import ImpalaHiveServer2Service
+from tests.common.environ import IMPALA_TEST_CLUSTER_PROPERTIES
from tests.common.skip import SkipIfDockerizedCluster
from tests.hs2.hs2_test_suite import HS2TestSuite, needs_session, operation_id_to_query_id
from TCLIService import TCLIService
@@ -402,6 +403,52 @@ class TestHS2(HS2TestSuite):
assert "Sql Statement: GET_SCHEMAS" in profile_page
assert "Query Type: DDL" in profile_page
+ @pytest.mark.execute_serially
+ @needs_session()
+ def test_get_tables(self):
+ """Basic test for the GetTables() HS2 method. Needs to execute serially because
+ the test depends on controlling whether a table is loaded or not and other
+ concurrent tests loading or invalidating tables could interfere with it."""
+ # TODO: unique_database would be better, but it doesn't work with @needs_session
+ # at the moment.
+ table = "__hs2_column_comments_test"
+ self.execute_query("drop table if exists {0}".format(table))
+ self.execute_query("""
+ create table {0} (a int comment 'column comment')
+ comment 'table comment'""".format(table))
+ try:
+ req = TCLIService.TGetTablesReq()
+ req.sessionHandle = self.session_handle
+ req.schemaName = "default"
+ req.tableName = table
+
+ # Execute the request twice, the first time with the table unloaded and the second
+ # with it loaded.
+ self.execute_query("invalidate metadata {0}".format(table))
+ for i in range(2):
+ get_tables_resp = self.hs2_client.GetTables(req)
+ TestHS2.check_response(get_tables_resp)
+
+ fetch_results_resp = self._fetch_results(get_tables_resp.operationHandle, 100)
+ results = fetch_results_resp.results
+ table_cat = results.columns[0].stringVal.values[0]
+ table_schema = results.columns[1].stringVal.values[0]
+ table_name = results.columns[2].stringVal.values[0]
+ table_type = results.columns[3].stringVal.values[0]
+ table_remarks = results.columns[4].stringVal.values[0]
+ assert table_cat == ''
+ assert table_schema == "default"
+ assert table_name == table
+ assert table_type == "TABLE"
+ if i == 0 and not IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster():
+ # IMPALA-7587: comments not returned for non-loaded tables with legacy catalog.
+ assert table_remarks == ""
+ else:
+ assert table_remarks == "table comment"
+ # Ensure the table is loaded for the second iteration.
+ self.execute_query("describe {0}".format(table))
+ finally:
+ self.execute_query("drop table {0}".format(table))
@needs_session(conf_overlay={"idle_session_timeout": "5"})
def test_get_operation_status_session_timeout(self):
@@ -559,3 +606,13 @@ class TestHS2(HS2TestSuite):
typed_col = getattr(results.columns[0], 'stringVal')
for colType in types:
assert typed_col.values.count(colType) == 1
+
+ def _fetch_results(self, operation_handle, max_rows):
+ """Fetch results from 'operation_handle' with up to 'max_rows' rows using
+ self.hs2_client, returning the TFetchResultsResp object."""
+ fetch_results_req = TCLIService.TFetchResultsReq()
+ fetch_results_req.operationHandle = operation_handle
+ fetch_results_req.maxRows = max_rows
+ fetch_results_resp = self.hs2_client.FetchResults(fetch_results_req)
+ TestHS2.check_response(fetch_results_resp)
+ return fetch_results_resp
diff --git a/tests/metadata/test_ddl.py b/tests/metadata/test_ddl.py
index 05d8c01..765b06a 100644
--- a/tests/metadata/test_ddl.py
+++ b/tests/metadata/test_ddl.py
@@ -25,7 +25,8 @@ from test_ddl_base import TestDdlBase
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import LOG
from tests.common.parametrize import UniqueDatabase
-from tests.common.skip import SkipIf, SkipIfABFS, SkipIfADLS, SkipIfKudu, SkipIfLocal
+from tests.common.skip import (SkipIf, SkipIfABFS, SkipIfADLS, SkipIfKudu, SkipIfLocal,
+ SkipIfCatalogV2)
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.util.filesystem_utils import (
WAREHOUSE,
@@ -420,12 +421,16 @@ class TestDdlStatements(TestDdlBase):
file_data='1984')
self.run_test_case('QueryTest/alter-table', vector, use_db=unique_database,
multiple_impalad=self._use_multiple_impalad(vector))
- # The following tests require HDFS caching which is supported only in the HDFS
- # filesystem.
- if IS_HDFS:
- self.run_test_case('QueryTest/alter-table-hdfs-caching', vector,
- use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector))
+ @SkipIf.not_hdfs
+ @SkipIfLocal.hdfs_client
+ @SkipIfCatalogV2.hdfs_caching_ddl_unsupported()
+ @UniqueDatabase.parametrize(sync_ddl=True, num_dbs=2)
+ def test_alter_table_hdfs_caching(self, vector, unique_database):
+ self.run_test_case('QueryTest/alter-table-hdfs-caching', vector,
+ use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector))
+
+ @SkipIfCatalogV2.alter_column_stats_broken()
@UniqueDatabase.parametrize(sync_ddl=True)
def test_alter_set_column_stats(self, vector, unique_database):
self.run_test_case('QueryTest/alter-table-set-column-stats', vector,
@@ -797,6 +802,7 @@ class TestLibCache(TestDdlBase):
# Run serially because this test inspects global impalad metrics.
# TODO: The metrics checks could be relaxed to enable running this test in
# parallel, but that might need a more general wait_for_metric_value().
+ @SkipIfCatalogV2.data_sources_unsupported()
@pytest.mark.execute_serially
def test_create_drop_data_src(self, vector, unique_database):
"""This will create, run, and drop the same data source repeatedly, exercising
diff --git a/tests/metadata/test_ddl_base.py b/tests/metadata/test_ddl_base.py
index 83399b0..63409ea 100644
--- a/tests/metadata/test_ddl_base.py
+++ b/tests/metadata/test_ddl_base.py
@@ -84,13 +84,17 @@ class TestDdlBase(ImpalaTestSuite):
match = False
properties = dict()
for row in result.data:
- if section_name in row:
- match = True
+ fields = row.split("\t")
+ if fields[0] != '':
+ # Start of new section.
+ if match:
+ # Finished processing matching section.
+ break
+ match = section_name in fields[0]
elif match:
- row = row.split('\t')
- if row[1] == 'NULL':
+ if fields[1] == 'NULL':
break
- properties[row[1].rstrip()] = row[2].rstrip()
+ properties[fields[1].rstrip()] = fields[2].rstrip()
return properties
def _get_property(self, property_name, name, is_db=False):
diff --git a/tests/metadata/test_hdfs_permissions.py b/tests/metadata/test_hdfs_permissions.py
index d495fc4..192920c 100644
--- a/tests/metadata/test_hdfs_permissions.py
+++ b/tests/metadata/test_hdfs_permissions.py
@@ -16,7 +16,8 @@
# under the License.
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfS3, SkipIfABFS, SkipIfADLS, SkipIfLocal
+from tests.common.skip import (SkipIfS3, SkipIfABFS, SkipIfADLS, SkipIfLocal,
+ SkipIfCatalogV2)
from tests.common.test_dimensions import (
create_single_exec_option_dimension,
create_uncompressed_text_dimension)
@@ -53,6 +54,7 @@ class TestHdfsPermissions(ImpalaTestSuite):
self.client.execute('drop table if exists %s' % TEST_TBL)
self.hdfs_client.delete_file_dir('test-warehouse/%s' % TEST_TBL, recursive=True)
+ @SkipIfCatalogV2.impala_7539()
def test_insert_into_read_only_table(self, vector):
permission = 444
if IS_ISILON:
diff --git a/tests/metadata/test_hms_integration.py b/tests/metadata/test_hms_integration.py
index 71a0879..c1a1734 100644
--- a/tests/metadata/test_hms_integration.py
+++ b/tests/metadata/test_hms_integration.py
@@ -84,9 +84,13 @@ class TestHmsIntegrationSanity(ImpalaTestSuite):
# Creating a table with the same name using 'IF NOT EXISTS' in Impala should
# not fail
self.client.execute("create table if not exists hms_sanity_db.test_tbl (a int)")
- # The table should not appear in the catalog unless invalidate metadata is
- # executed
- assert 'test_tbl' not in self.client.execute("show tables in hms_sanity_db").data
+ # The table should not appear in the catalog for catalog_v1 unless invalidate
+ # metadata is executed.
+ if IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster():
+ self.wait_for_table_to_appear("hms_sanity_db", "test_tbl", 10)
+ assert 'test_tbl' in self.client.execute("show tables in hms_sanity_db").data
+ else:
+ assert 'test_tbl' not in self.client.execute("show tables in hms_sanity_db").data
self.client.execute("invalidate metadata hms_sanity_db.test_tbl")
assert 'test_tbl' in self.client.execute("show tables in hms_sanity_db").data
diff --git a/tests/metadata/test_metadata_query_statements.py b/tests/metadata/test_metadata_query_statements.py
index 5633ba0..3921e87 100644
--- a/tests/metadata/test_metadata_query_statements.py
+++ b/tests/metadata/test_metadata_query_statements.py
@@ -23,7 +23,8 @@ import re
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.environ import IMPALA_TEST_CLUSTER_PROPERTIES
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfIsilon, SkipIfS3, SkipIfABFS, SkipIfADLS, SkipIfLocal
+from tests.common.skip import (SkipIfIsilon, SkipIfS3, SkipIfABFS, SkipIfADLS,
+ SkipIfLocal, SkipIfCatalogV2)
from tests.common.test_dimensions import ALL_NODES_ONLY
from tests.common.test_dimensions import create_exec_option_dimension
from tests.common.test_dimensions import create_uncompressed_text_dimension
@@ -134,6 +135,7 @@ class TestMetadataQueryStatements(ImpalaTestSuite):
compare=compare_describe_formatted)
@pytest.mark.execute_serially # due to data src setup/teardown
+ @SkipIfCatalogV2.data_sources_unsupported()
def test_show_data_sources(self, vector):
try:
self.__create_data_sources()
diff --git a/tests/metadata/test_recover_partitions.py b/tests/metadata/test_recover_partitions.py
index 9ba4164..36a23ea 100644
--- a/tests/metadata/test_recover_partitions.py
+++ b/tests/metadata/test_recover_partitions.py
@@ -19,7 +19,7 @@
import os
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfLocal, SkipIfS3
+from tests.common.skip import SkipIfLocal, SkipIfS3, SkipIfCatalogV2
from tests.common.test_dimensions import ALL_NODES_ONLY
from tests.common.test_dimensions import create_exec_option_dimension
from tests.util.filesystem_utils import WAREHOUSE, IS_S3
@@ -246,6 +246,7 @@ class TestRecoverPartitions(ImpalaTestSuite):
"duplicate partition key values." % FQ_TBL_NAME
@SkipIfLocal.hdfs_client
+ @SkipIfCatalogV2.impala_8489()
def test_post_invalidate(self, vector, unique_database):
"""Test that RECOVER PARTITIONS works correctly after invalidate."""
TBL_NAME = "test_recover_partitions"
diff --git a/tests/query_test/test_hdfs_caching.py b/tests/query_test/test_hdfs_caching.py
index d683383..cab08d8 100644
--- a/tests/query_test/test_hdfs_caching.py
+++ b/tests/query_test/test_hdfs_caching.py
@@ -26,7 +26,7 @@ from tests.common.environ import build_flavor_timeout, IS_DOCKERIZED_TEST_CLUSTE
from tests.common.impala_cluster import ImpalaCluster
from tests.common.impala_test_suite import ImpalaTestSuite, LOG
from tests.common.skip import (SkipIfS3, SkipIfABFS, SkipIfADLS, SkipIfIsilon,
- SkipIfLocal, SkipIfEC, SkipIfDockerizedCluster)
+ SkipIfLocal, SkipIfEC, SkipIfDockerizedCluster, SkipIfCatalogV2)
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.util.filesystem_utils import get_fs_path
from tests.util.shell_util import exec_process
@@ -179,6 +179,7 @@ class TestHdfsCachingFallbackPath(ImpalaTestSuite):
@SkipIfADLS.caching
@SkipIfIsilon.caching
@SkipIfLocal.caching
+@SkipIfCatalogV2.hdfs_caching_ddl_unsupported()
class TestHdfsCachingDdl(ImpalaTestSuite):
@classmethod
def get_workload(self):
diff --git a/tests/query_test/test_insert_behaviour.py b/tests/query_test/test_insert_behaviour.py
index 3bcb0c1..fc622b1 100644
--- a/tests/query_test/test_insert_behaviour.py
+++ b/tests/query_test/test_insert_behaviour.py
@@ -24,7 +24,7 @@ import re
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.parametrize import UniqueDatabase
from tests.common.skip import (SkipIfS3, SkipIfABFS, SkipIfADLS, SkipIfIsilon,
- SkipIfLocal, SkipIfDockerizedCluster)
+ SkipIfLocal, SkipIfDockerizedCluster, SkipIfCatalogV2)
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path, IS_S3
@SkipIfLocal.hdfs_client
@@ -198,6 +198,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfABFS.hdfs_acls
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
+ @SkipIfCatalogV2.impala_7539()
def test_insert_file_permissions(self, unique_database):
"""Test that INSERT correctly respects file permission (minimum ACLs)"""
table = "`{0}`.`insert_acl_permissions`".format(unique_database)
@@ -250,6 +251,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfABFS.hdfs_acls
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
+ @SkipIfCatalogV2.impala_7539()
def test_mixed_partition_permissions(self, unique_database):
"""
Test that INSERT and LOAD DATA into explicit partitions is allowed even
@@ -331,6 +333,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfABFS.hdfs_acls
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
+ @SkipIfCatalogV2.impala_7539()
def test_readonly_table_dir(self, unique_database):
"""
Test that, if a partitioned table has a read-only base directory,
@@ -364,6 +367,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
@SkipIfDockerizedCluster.insert_acls
+ @SkipIfCatalogV2.impala_7539()
def test_insert_acl_permissions(self, unique_database):
"""Test that INSERT correctly respects ACLs"""
table = "`{0}`.`insert_acl_permissions`".format(unique_database)
@@ -443,6 +447,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfABFS.hdfs_acls
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
+ @SkipIfCatalogV2.impala_7539()
def test_load_permissions(self, unique_database):
# We rely on test_insert_acl_permissions() to exhaustively check that ACL semantics
# are correct. Here we just validate that LOADs can't be done when we cannot read from
@@ -569,6 +574,7 @@ class TestInsertBehaviour(ImpalaTestSuite):
@SkipIfADLS.hdfs_acls
@SkipIfIsilon.hdfs_acls
@SkipIfDockerizedCluster.insert_acls
+ @SkipIfCatalogV2.impala_7539()
def test_multiple_group_acls(self, unique_database):
"""Test that INSERT correctly respects multiple group ACLs"""
table = "`{0}`.`insert_group_acl_permissions`".format(unique_database)
diff --git a/tests/query_test/test_kudu.py b/tests/query_test/test_kudu.py
index 776486c..216a41a 100644
--- a/tests/query_test/test_kudu.py
+++ b/tests/query_test/test_kudu.py
@@ -37,9 +37,10 @@ import time
from datetime import datetime
from pytz import utc
+from tests.common.environ import IMPALA_TEST_CLUSTER_PROPERTIES
from tests.common.kudu_test_suite import KuduTestSuite
from tests.common.impala_cluster import ImpalaCluster
-from tests.common.skip import SkipIfNotHdfsMinicluster, SkipIfKudu
+from tests.common.skip import SkipIfNotHdfsMinicluster, SkipIfKudu, SkipIfCatalogV2
from tests.common.test_dimensions import add_exec_option_dimension
from tests.verifiers.metric_verifier import MetricVerifier
@@ -185,11 +186,14 @@ class TestKuduOperations(KuduTestSuite):
session.apply(op)
session.flush()
- # Scanning should result in an error
+ # Scanning should result in an error with Catalog V1, since the metadata is cached.
try:
cursor.execute("SELECT * FROM %s.foo" % (unique_database))
- assert False
+ assert IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should fail with Catalog V1, which caches metadata"
except Exception as e:
+ assert not IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should succeed with Catalog V2, which does not cache metadata"
expected_error = "Column 's' is type INT but Impala expected STRING. The table "\
"metadata in Impala may be outdated and need to be refreshed."
assert expected_error in str(e)
@@ -229,8 +233,11 @@ class TestKuduOperations(KuduTestSuite):
# Scanning should result in an error
try:
cursor.execute("SELECT * FROM %s.foo" % (unique_database))
- assert False
+ assert IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should fail with Catalog V1, which caches metadata"
except Exception as e:
+ assert not IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should succeed with Catalog V2, which does not cache metadata"
expected_error = "Column 's' is nullable but Impala expected it to be "\
"not nullable. The table metadata in Impala may be outdated and need to be "\
"refreshed."
@@ -271,8 +278,11 @@ class TestKuduOperations(KuduTestSuite):
# Scanning should result in an error
try:
cursor.execute("SELECT * FROM %s.foo" % (unique_database))
- assert False
+ assert IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should fail with Catalog V1, which caches metadata"
except Exception as e:
+ assert not IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),\
+ "Should succeed with Catalog V2, which does not cache metadata"
expected_error = "Column 's' is not nullable but Impala expected it to be "\
"nullable. The table metadata in Impala may be outdated and need to be "\
"refreshed."
@@ -305,12 +315,16 @@ class TestKuduOperations(KuduTestSuite):
session.apply(op)
session.flush()
- # Only the first col is visible to Impala. Impala will not know about the missing
- # column, so '*' is expanded to known columns. This doesn't have a separate check
- # because the query can proceed and checking would need to fetch metadata from the
- # Kudu master, which is what REFRESH is for.
cursor.execute("SELECT * FROM %s.foo" % (unique_database))
- assert cursor.fetchall() == [(0, )]
+ if IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster():
+ # Changes in Kudu should be immediately visible to Impala with Catalog V2.
+ assert cursor.fetchall() == [(0, 0)]
+ else:
+ # Only the first col is visible to Impala. Impala will not know about the missing
+ # column, so '*' is expanded to known columns. This doesn't have a separate check
+ # because the query can proceed and checking would need to fetch metadata from the
+ # Kudu master, which is what REFRESH is for.
+ assert cursor.fetchall() == [(0, )]
# After a REFRESH both cols should be visible
cursor.execute("REFRESH %s.foo" % (unique_database))
@@ -1062,6 +1076,7 @@ class TestImpalaKuduIntegration(KuduTestSuite):
("c", "string", "", "false", "true", "", "AUTO_ENCODING",
"DEFAULT_COMPRESSION", "0")]
+ @SkipIfCatalogV2.impala_8459()
def test_delete_external_kudu_table(self, cursor, kudu_client):
"""Check that Impala can recover from the case where the underlying Kudu table of
an external table is dropped using the Kudu client.
@@ -1088,6 +1103,7 @@ class TestImpalaKuduIntegration(KuduTestSuite):
cursor.execute("SHOW TABLES")
assert (impala_table_name,) not in cursor.fetchall()
+ @SkipIfCatalogV2.impala_8459()
def test_delete_managed_kudu_table(self, cursor, kudu_client, unique_database):
"""Check that dropping a managed Kudu table works even if the underlying Kudu table
has been dropped externally."""
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index d156212..8e2ad17 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -22,7 +22,7 @@ import re
from copy import deepcopy
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfEC, SkipIfDockerizedCluster
+from tests.common.skip import SkipIfEC, SkipIfCatalogV2
from tests.common.test_dimensions import (
create_uncompressed_text_dimension, extend_exec_option_dimension,
create_beeswax_hs2_dimension, hs2_parquet_constraint)
@@ -170,6 +170,7 @@ class TestQueriesTextTables(ImpalaTestSuite):
vector.get_value('exec_option')['abort_on_error'] = 1
self.run_test_case('QueryTest/strict-mode-abort', vector)
+ @SkipIfCatalogV2.data_sources_unsupported()
def test_data_source_tables(self, vector):
self.run_test_case('QueryTest/data-source-tables', vector)
diff --git a/tests/query_test/test_udfs.py b/tests/query_test/test_udfs.py
index 771430f..5823ad3 100644
--- a/tests/query_test/test_udfs.py
+++ b/tests/query_test/test_udfs.py
@@ -24,7 +24,7 @@ from subprocess import call, check_call
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_cluster import ImpalaCluster
from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfLocal
+from tests.common.skip import SkipIfLocal, SkipIfCatalogV2
from tests.common.test_dimensions import (
create_exec_option_dimension,
create_exec_option_dimension_from_dict,
@@ -49,7 +49,7 @@ class TestUdfBase(ImpalaTestSuite):
for impalad in impala_cluster.impalads:
client = impalad.service.create_beeswax_client()
result = self.execute_query_expect_success(client, query, exec_options)
- assert result.data == expected
+ assert result.data == expected, impalad
def _load_functions(self, template, vector, database, location):
queries = template.format(database=database, location=location)
@@ -507,6 +507,7 @@ class TestUdfTargeted(TestUdfBase):
def test_libs_with_same_filenames(self, vector, unique_database):
self.run_test_case('QueryTest/libs_with_same_filenames', vector, use_db=unique_database)
+ @SkipIfCatalogV2.lib_cache_invalidation_broken()
def test_udf_update_via_drop(self, vector, unique_database):
"""Test updating the UDF binary without restarting Impala. Dropping
the function should remove the binary from the local cache."""
@@ -540,6 +541,7 @@ class TestUdfTargeted(TestUdfBase):
self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
self._run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
+ @SkipIfCatalogV2.lib_cache_invalidation_broken()
def test_udf_update_via_create(self, vector, unique_database):
"""Test updating the UDF binary without restarting Impala. Creating a new function
from the library should refresh the cache."""