You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/04/26 23:45:52 UTC
[impala] 03/03: Configure Hive 3's HS2 to execute queries using Tez
local mode
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 8e97a3b5f68e55bd68b25a6d7966c8eb0d57e6d0
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Wed Apr 3 21:02:59 2019 -0700
Configure Hive 3's HS2 to execute queries using Tez local mode
Hive 3 no longer supports MR execution, so this sets up the appropriate
configuration and classpath so that HS2 can run queries using Tez.
The bulk of this patch is toolchain changes to download Tez itself. The
Tez tarball is slightly odd in that it has no top-level directory, so
the patch reworks bootstrap_toolchain.py a bit so that it can create
the top-level directory for such a component itself before unpacking.
The remainder of the patch is some classpath setup and hive-site changes
when Hive 3 is enabled.
So far I have tested this manually by setting up a metastore and
impala-config with USE_CDP_HIVE=true, and then connecting to HS2 using:
hive beeline -u 'jdbc:hive2://localhost:11050'
I was able to insert and query data, and was able to verify that queries
like 'select count(*)' were executing via Tez local mode.
NOTE: this patch relies on a custom build of Tez, based on a private
branch. I've submitted a PR to Tez upstream for TEZ-1348
(https://github.com/apache/tez/pull/40), which is referenced in the
comments in bin/impala-config.sh. This hack will be removed once the PR
is accepted and makes its way into an official build.
Change-Id: I76e47fbd1d6ff5103d81a8de430d5465dba284cd
Reviewed-on: http://gerrit.cloudera.org:8080/12931
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
---
bin/bootstrap_toolchain.py | 55 ++++++++++++++++++++++++++++------
bin/impala-config.sh | 10 +++++++
fe/pom.xml | 12 ++++++++
fe/src/test/resources/hive-site.xml.py | 8 ++++-
testdata/bin/run-hive-server.sh | 15 ++++++++++
5 files changed, 90 insertions(+), 10 deletions(-)
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 7c44902..d51b5cf 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -103,6 +103,20 @@ class Package(object):
self.url = os.environ.get(url_env_var)
+class CdpComponent(object):
+ def __init__(self, basename, makedir=False):
+ """
+ basename: the name of the file to be downloaded, without its .tar.gz suffix
+ makedir: if false, it is assumed that the downloaded tarball will expand
+ into a directory with the same name as 'basename'. If True, we
+ assume that the tarball doesn't have any top-level directory,
+ and so we need to manually create a directory within which to
+ expand the tarball.
+ """
+ self.basename = basename
+ self.makedir = makedir
+
+
def try_get_platform_release_label():
"""Gets the right package label from the OS version. Returns an OsMapping with both
'toolchain' and 'cdh' labels. Return None if not found.
@@ -418,8 +432,12 @@ def download_cdh_components(toolchain_root, cdh_components, url_prefix):
def download_cdp_components(cdp_components, url_prefix):
- """Downloads and unpacks the CDP components for a given URL prefix into
- $CDP_COMPONENTS_HOME if not found."""
+ """
+ Downloads and unpacks the CDP components for a given URL prefix into
+ $CDP_COMPONENTS_HOME if not found.
+
+ cdp_components: list of CdpComponent instances
+ """
cdp_components_home = os.environ.get("CDP_COMPONENTS_HOME")
if not cdp_components_home:
logging.error("Impala environment not set up correctly, make sure "
@@ -430,12 +448,27 @@ def download_cdp_components(cdp_components, url_prefix):
if not os.path.exists(cdp_components_home):
os.makedirs(cdp_components_home)
- def download(component_name):
- pkg_directory = "{0}/{1}".format(cdp_components_home, component_name)
+ def download(component):
+ pkg_directory = "{0}/{1}".format(cdp_components_home, component.basename)
if os.path.isdir(pkg_directory): return
- file_name = "{0}.tar.gz".format(component_name)
+ file_name = "{0}.tar.gz".format(component.basename)
download_path = "{0}/{1}".format(url_prefix, file_name)
- wget_and_unpack_package(download_path, file_name, cdp_components_home, False)
+ dst = cdp_components_home
+ if component.makedir:
+ # Download and unpack in a temp directory, which we'll later move into place
+ dst = tempfile.mkdtemp(dir=cdp_components_home)
+ try:
+ wget_and_unpack_package(download_path, file_name, dst, False)
+ except: # noqa
+ # Clean up any partially-unpacked result.
+ if os.path.isdir(pkg_directory):
+ shutil.rmtree(pkg_directory)
+ # Clean up any temp directory if we made one
+ if component.makedir:
+ shutil.rmtree(dst)
+ raise
+ if component.makedir:
+ os.rename(dst, pkg_directory)
execute_many(download, cdp_components)
@@ -533,11 +566,15 @@ if __name__ == "__main__":
cdp_build_number = os.environ["CDP_BUILD_NUMBER"]
cdp_components = [
- "ranger-{0}-admin".format(os.environ.get("IMPALA_RANGER_VERSION")),
+ CdpComponent("ranger-{0}-admin".format(os.environ.get("IMPALA_RANGER_VERSION"))),
]
+ use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
if use_cdp_hive:
- cdp_components.append("apache-hive-{0}-bin"
- .format(os.environ.get("IMPALA_HIVE_VERSION")))
+ cdp_components.append(CdpComponent("apache-hive-{0}-bin"
+ .format(os.environ.get("IMPALA_HIVE_VERSION"))))
+ cdp_components.append(CdpComponent(
+ "tez-{0}-minimal".format(os.environ.get("IMPALA_TEZ_VERSION")),
+ makedir=True))
download_path_prefix = \
"https://{0}/build/cdp_components/{1}/tarballs".format(toolchain_host,
cdp_build_number)
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 87b5216..709cfcb 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -174,6 +174,16 @@ export KUDU_JAVA_VERSION=1.10.0-cdh6.x-SNAPSHOT
export USE_CDP_HIVE=${USE_CDP_HIVE-false}
if $USE_CDP_HIVE; then
export IMPALA_HIVE_VERSION=3.1.0.6.0.99.0-45
+ # Temporary version of Tez, patched with the fix for TEZ-1348:
+ # https://github.com/apache/tez/pull/40
+ # We'll switch to a non-"todd" version of Tez once that fix is integrated.
+ # For now, if you're bumping the CDP build number, you'll need to download
+ # this tarball from an earlier build and re-upload it to the new directory
+ # in the toolchain bucket.
+ #
+ # TODO(todd) switch to an official build.
+ export IMPALA_TEZ_VERSION=0.10.0-todd-6fcc41e5798b.1
+ export TEZ_HOME="$CDP_COMPONENTS_HOME/tez-${IMPALA_TEZ_VERSION}-minimal"
else
export IMPALA_HIVE_VERSION=2.1.1-cdh6.x-SNAPSHOT
fi
diff --git a/fe/pom.xml b/fe/pom.xml
index bac4555..920b7e9 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -313,6 +313,10 @@ under the License.
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.calcite.avatica</groupId>
+ <artifactId>avatica</artifactId>
+ </exclusion>
</exclusions>
</dependency>
@@ -349,6 +353,10 @@ under the License.
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.calcite.avatica</groupId>
+ <artifactId>avatica</artifactId>
+ </exclusion>
</exclusions>
</dependency>
@@ -408,6 +416,10 @@ under the License.
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.calcite.avatica</groupId>
+ <artifactId>avatica</artifactId>
+ </exclusion>
</exclusions>
</dependency>
diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index fb64374..9c0ca7a 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -76,7 +76,13 @@ if kerberize:
# hive.metastore.kerberos.keytab.file
# hive.metastore.kerberos.principal
-if hive_major_version < 3:
+# Enable Tez and ACID for Hive 3
+if hive_major_version >= 3:
+ CONFIG.update({
+ 'hive.tez.container.size': '512',
+ 'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
+ 'tez.local.mode': 'true'})
+else:
CONFIG.update({
# TODO(vihang) Disabled for HMS3.
'hive.metastore.event.listeners': 'org.apache.sentry.binding.metastore.SentrySyncHMSNotificationsPostEventListener',
diff --git a/testdata/bin/run-hive-server.sh b/testdata/bin/run-hive-server.sh
index 8a6a1ca..6bdaaee 100755
--- a/testdata/bin/run-hive-server.sh
+++ b/testdata/bin/run-hive-server.sh
@@ -90,6 +90,21 @@ HADOOP_CLIENT_OPTS="-Xmx2024m -Dhive.log.file=hive-metastore.log" hive \
${CLUSTER_BIN}/wait-for-metastore.py --transport=${METASTORE_TRANSPORT}
if [ ${ONLY_METASTORE} -eq 0 ]; then
+ # For Hive 3, we use Tez for execution. We have to add it to the HS2 classpath.
+ if $USE_CDP_HIVE; then
+ export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${TEZ_HOME}/*
+ # This is a little hacky, but Tez bundles a bunch of junk into lib/, such
+ # as extra copies of the hadoop libraries, etc, and we want to avoid conflicts.
+ # So, we'll be a bit choosy about what we add to the classpath here.
+ for jar in $TEZ_HOME/lib/* ; do
+ case $(basename $jar) in
+ commons-*|RoaringBitmap*)
+ export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$jar
+ ;;
+ esac
+ done
+ fi
+
# Starts a HiveServer2 instance on the port specified by the HIVE_SERVER2_THRIFT_PORT
# environment variable. HADOOP_HEAPSIZE should be set to at least 2048 to avoid OOM
# when loading ORC tables like widerow.