You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/04/26 23:45:52 UTC

[impala] 03/03: Configure Hive 3's HS2 to execute queries using Tez local mode

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8e97a3b5f68e55bd68b25a6d7966c8eb0d57e6d0
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Wed Apr 3 21:02:59 2019 -0700

    Configure Hive 3's HS2 to execute queries using Tez local mode
    
    Hive 3 no longer supports MR execution, so this sets up the appropriate
    configuration and classpath so that HS2 can run queries using Tez.
    
    The bulk of this patch consists of toolchain changes to download Tez
    itself. The Tez tarball is slightly odd in that it has no top-level
    directory, so the patch reworks bootstrap_toolchain a bit so that it can
    create the top-level directory itself for such a component.
    
    The remainder of the patch is some classpath setup and hive-site changes
    when Hive 3 is enabled.
    
    So far I tested this manually by setting up a metastore and
    impala-config with USE_CDP_HIVE=true, and then connecting to HS2 using
    
      hive beeline -u 'jdbc:hive2://localhost:11050'
    
    I was able to insert and query data, and was able to verify that queries
    like 'select count(*)' were executing via Tez local mode.
    
    NOTE: this patch relies on a custom build of Tez, based on a private
    branch. I've submitted a PR to Tez upstream
    (https://github.com/apache/tez/pull/40), which is also referenced in
    bin/impala-config.sh in this commit. I will remove this hack once the PR
    is accepted and makes its way into an official build.
    
    Change-Id: I76e47fbd1d6ff5103d81a8de430d5465dba284cd
    Reviewed-on: http://gerrit.cloudera.org:8080/12931
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/bootstrap_toolchain.py             | 55 ++++++++++++++++++++++++++++------
 bin/impala-config.sh                   | 10 +++++++
 fe/pom.xml                             | 12 ++++++++
 fe/src/test/resources/hive-site.xml.py |  8 ++++-
 testdata/bin/run-hive-server.sh        | 15 ++++++++++
 5 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 7c44902..d51b5cf 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -103,6 +103,20 @@ class Package(object):
       self.url = os.environ.get(url_env_var)
 
 
+class CdpComponent(object):
+  def __init__(self, basename, makedir=False):
+    """
+    basename: the name of the file to be downloaded, without its .tar.gz suffix
+    makedir: if False, it is assumed that the downloaded tarball will expand
+             into a directory with the same name as 'basename'. If True, we
+             assume that the tarball doesn't have any top-level directory,
+             and so we need to manually create a directory within which to
+             expand the tarball.
+    """
+    self.basename = basename
+    self.makedir = makedir
+
+
 def try_get_platform_release_label():
   """Gets the right package label from the OS version. Returns an OsMapping with both
      'toolchain' and 'cdh' labels. Return None if not found.
@@ -418,8 +432,12 @@ def download_cdh_components(toolchain_root, cdh_components, url_prefix):
 
 
 def download_cdp_components(cdp_components, url_prefix):
-  """Downloads and unpacks the CDP components for a given URL prefix into
-  $CDP_COMPONENTS_HOME if not found."""
+  """
+  Downloads and unpacks the CDP components for a given URL prefix into
+  $CDP_COMPONENTS_HOME if not found.
+
+  cdp_components: list of CdpComponent instances
+  """
   cdp_components_home = os.environ.get("CDP_COMPONENTS_HOME")
   if not cdp_components_home:
     logging.error("Impala environment not set up correctly, make sure "
@@ -430,12 +448,27 @@ def download_cdp_components(cdp_components, url_prefix):
   if not os.path.exists(cdp_components_home):
     os.makedirs(cdp_components_home)
 
-  def download(component_name):
-    pkg_directory = "{0}/{1}".format(cdp_components_home, component_name)
+  def download(component):
+    pkg_directory = "{0}/{1}".format(cdp_components_home, component.basename)
     if os.path.isdir(pkg_directory): return
-    file_name = "{0}.tar.gz".format(component_name)
+    file_name = "{0}.tar.gz".format(component.basename)
     download_path = "{0}/{1}".format(url_prefix, file_name)
-    wget_and_unpack_package(download_path, file_name, cdp_components_home, False)
+    dst = cdp_components_home
+    if component.makedir:
+      # Download and unpack in a temp directory, which we'll later move into place
+      dst = tempfile.mkdtemp(dir=cdp_components_home)
+    try:
+      wget_and_unpack_package(download_path, file_name, dst, False)
+    except:  # noqa
+      # Clean up any partially-unpacked result.
+      if os.path.isdir(pkg_directory):
+        shutil.rmtree(pkg_directory)
+      # Clean up any temp directory if we made one
+      if component.makedir:
+        shutil.rmtree(dst)
+      raise
+    if component.makedir:
+      os.rename(dst, pkg_directory)
 
   execute_many(download, cdp_components)
 
@@ -533,11 +566,15 @@ if __name__ == "__main__":
 
   cdp_build_number = os.environ["CDP_BUILD_NUMBER"]
   cdp_components = [
-    "ranger-{0}-admin".format(os.environ.get("IMPALA_RANGER_VERSION")),
+    CdpComponent("ranger-{0}-admin".format(os.environ.get("IMPALA_RANGER_VERSION"))),
   ]
+  use_cdp_hive = os.getenv("USE_CDP_HIVE") == "true"
   if use_cdp_hive:
-    cdp_components.append("apache-hive-{0}-bin"
-                          .format(os.environ.get("IMPALA_HIVE_VERSION")))
+    cdp_components.append(CdpComponent("apache-hive-{0}-bin"
+                          .format(os.environ.get("IMPALA_HIVE_VERSION"))))
+    cdp_components.append(CdpComponent(
+        "tez-{0}-minimal".format(os.environ.get("IMPALA_TEZ_VERSION")),
+        makedir=True))
   download_path_prefix = \
     "https://{0}/build/cdp_components/{1}/tarballs".format(toolchain_host,
                                                            cdp_build_number)
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 87b5216..709cfcb 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -174,6 +174,16 @@ export KUDU_JAVA_VERSION=1.10.0-cdh6.x-SNAPSHOT
 export USE_CDP_HIVE=${USE_CDP_HIVE-false}
 if $USE_CDP_HIVE; then
   export IMPALA_HIVE_VERSION=3.1.0.6.0.99.0-45
+  # Temporary version of Tez, patched with the fix for TEZ-1348:
+  # https://github.com/apache/tez/pull/40
+  # We'll switch to a non-"todd" version of Tez once that fix is integrated.
+  # For now, if you're bumping the CDP build number, you'll need to download
+  # this tarball from an earlier build and re-upload it to the new directory
+  # in the toolchain bucket.
+  #
+  # TODO(todd) switch to an official build.
+  export IMPALA_TEZ_VERSION=0.10.0-todd-6fcc41e5798b.1
+  export TEZ_HOME="$CDP_COMPONENTS_HOME/tez-${IMPALA_TEZ_VERSION}-minimal"
 else
   export IMPALA_HIVE_VERSION=2.1.1-cdh6.x-SNAPSHOT
 fi
diff --git a/fe/pom.xml b/fe/pom.xml
index bac4555..920b7e9 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -313,6 +313,10 @@ under the License.
           <groupId>net.minidev</groupId>
           <artifactId>json-smart</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.calcite.avatica</groupId>
+          <artifactId>avatica</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -349,6 +353,10 @@ under the License.
           <groupId>net.minidev</groupId>
           <artifactId>json-smart</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.calcite.avatica</groupId>
+          <artifactId>avatica</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -408,6 +416,10 @@ under the License.
           <groupId>net.minidev</groupId>
           <artifactId>json-smart</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.calcite.avatica</groupId>
+          <artifactId>avatica</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index fb64374..9c0ca7a 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -76,7 +76,13 @@ if kerberize:
   #   hive.metastore.kerberos.keytab.file
   #   hive.metastore.kerberos.principal
 
-if hive_major_version < 3:
+# Enable Tez and ACID for Hive 3
+if hive_major_version >= 3:
+  CONFIG.update({
+   'hive.tez.container.size': '512',
+   'hive.txn.manager': 'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager',
+   'tez.local.mode': 'true'})
+else:
   CONFIG.update({
    # TODO(vihang) Disabled for HMS3.
    'hive.metastore.event.listeners': 'org.apache.sentry.binding.metastore.SentrySyncHMSNotificationsPostEventListener',
diff --git a/testdata/bin/run-hive-server.sh b/testdata/bin/run-hive-server.sh
index 8a6a1ca..6bdaaee 100755
--- a/testdata/bin/run-hive-server.sh
+++ b/testdata/bin/run-hive-server.sh
@@ -90,6 +90,21 @@ HADOOP_CLIENT_OPTS="-Xmx2024m -Dhive.log.file=hive-metastore.log" hive \
 ${CLUSTER_BIN}/wait-for-metastore.py --transport=${METASTORE_TRANSPORT}
 
 if [ ${ONLY_METASTORE} -eq 0 ]; then
+  # For Hive 3, we use Tez for execution. We have to add it to the HS2 classpath.
+  if $USE_CDP_HIVE; then
+    export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${TEZ_HOME}/*
+    # This is a little hacky, but Tez bundles a bunch of junk into lib/, such
+    # as extra copies of the hadoop libraries, etc, and we want to avoid conflicts.
+    # So, we'll be a bit choosy about what we add to the classpath here.
+    for jar in $TEZ_HOME/lib/* ; do
+      case $(basename $jar) in
+        commons-*|RoaringBitmap*)
+          export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$jar
+          ;;
+      esac
+    done
+  fi
+
   # Starts a HiveServer2 instance on the port specified by the HIVE_SERVER2_THRIFT_PORT
   # environment variable. HADOOP_HEAPSIZE should be set to at least 2048 to avoid OOM
   # when loading ORC tables like widerow.