You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/10/24 22:10:56 UTC

[impala] branch master updated: IMPALA-9071: Handle translated external HDFS table in CTAS

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new b6b31e4  IMPALA-9071: Handle translated external HDFS table in CTAS
b6b31e4 is described below

commit b6b31e4cc415d0ba22d30b17f5dd039ba23700a6
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Thu Oct 24 11:37:25 2019 +0800

    IMPALA-9071: Handle translated external HDFS table in CTAS
    
    After upgrading Hive-3 to a version containing HIVE-22158, it's not
    allowed for managed tables to be non transactional. Creating non ACID
    tables will result in creating an external table with table property
    'external.table.purge' set to true.
    
    In Hive-3, the default location of external HDFS tables will be located
    in 'metastore.warehouse.external.dir' if it's set. This property is
    added by HIVE-19837 in Hive 2.7, but hasn't been added to Hive in cdh6
    yet.
    
    In CTAS statement, we create a temporary HMS Table for the analysis on
    the Insert part. The table path is created assuming it's a managed
    table, and the Insert part will use this path for insertion. However, in
    Hive-3, the created table is translated to an external table. It's not
    the same as we passed to the HMS API. The created table is located in
    'metastore.warehouse.external.dir', while the table path we assumed is
    in 'metastore.warehouse.dir'. This introduces bugs when these two
    properties are different. CTAS statement will create table in one place
    and insert data in another place.
    
    This patch adds a new method in MetastoreShim to wrap the difference for
    getting the default table path for non transactional tables between
    Hive-2 and Hive-3.
    
    Changes in the infra:
     - To support customizing hive configuration, add an env var,
       CUSTOM_CLASSPATH in bin/set-classpath.sh to be put in front of
       existing CLASSPATH. The customized hive-site.xml should be put inside
       CUSTOM_CLASSPATH.
     - Change hive-site.xml.py to generate a hive-site.xml with non default
       'metastore.warehouse.external.dir'
     - Add an option, --env_vars, in bin/start-impala-cluster.py to pass
       down CUSTOM_CLASSPATH.
    
    Tests:
     - Add a custom cluster test to start Hive with
       metastore.warehouse.external.dir being set to non default value. Run
       it locally using CDP components with HIVE-22158. xfail the test until
       we bump CDP_BUILD_NUMBER to 1507246.
     - Run CORE tests using CDH components
    
    Change-Id: I460a57dc877ef68ad7dd0864a33b1599b1e9a8d9
    Reviewed-on: http://gerrit.cloudera.org:8080/14527
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
---
 bin/create-test-configuration.sh                   |  5 ++
 bin/set-classpath.sh                               |  3 +
 bin/start-impala-cluster.py                        |  6 ++
 .../org/apache/impala/compat/MetastoreShim.java    | 16 ++++++
 .../org/apache/impala/compat/MetastoreShim.java    | 22 +++++++-
 .../impala/analysis/CreateTableAsSelectStmt.java   | 11 +---
 fe/src/test/resources/hive-site.xml.py             |  6 ++
 tests/common/custom_cluster_test_suite.py          | 33 ++++++++++-
 tests/custom_cluster/test_custom_hive_configs.py   | 66 ++++++++++++++++++++++
 9 files changed, 156 insertions(+), 12 deletions(-)

diff --git a/bin/create-test-configuration.sh b/bin/create-test-configuration.sh
index 2bcabe1..d1ec5b1 100755
--- a/bin/create-test-configuration.sh
+++ b/bin/create-test-configuration.sh
@@ -140,6 +140,11 @@ rm -f authz-provider.ini
 # if needed
 
 $IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site.xml
+export HIVE_VARIANT=changed_external_dir
+$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_ext.xml
+mkdir -p hive-site-ext
+rm -f hive-site-ext/hive-site.xml
+ln -s "${CONFIG_DIR}/hive-site_ext.xml" hive-site-ext/hive-site.xml
 
 generate_config hive-log4j2.properties.template hive-log4j2.properties
 
diff --git a/bin/set-classpath.sh b/bin/set-classpath.sh
index 73e8449..5ee9874 100644
--- a/bin/set-classpath.sh
+++ b/bin/set-classpath.sh
@@ -47,4 +47,7 @@ fi
 
 CLASSPATH=$(cat "$IMPALA_HOME"/fe/target/build-classpath.txt):"$CLASSPATH"
 
+: ${CUSTOM_CLASSPATH=}
+CLASSPATH="$CUSTOM_CLASSPATH:$CLASSPATH"
+
 export CLASSPATH
diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py
index 75e8b93..bfbd50a 100755
--- a/bin/start-impala-cluster.py
+++ b/bin/start-impala-cluster.py
@@ -100,6 +100,8 @@ parser.add_option("--log_level", type="int", dest="log_level", default=1,
                    help="Set the impalad backend logging level")
 parser.add_option("--jvm_args", dest="jvm_args", default="",
                   help="Additional arguments to pass to the JVM(s) during startup.")
+parser.add_option("--env_vars", dest="env_vars", default="",
+                  help="Additional environment variables for Impala to run with")
 parser.add_option("--kudu_master_hosts", default=KUDU_MASTER_HOSTS,
                   help="The host name or address of the Kudu master. Multiple masters "
                       "can be specified using a comma separated list.")
@@ -163,6 +165,10 @@ def check_process_exists(binary, attempts=1):
 def run_daemon_with_options(daemon_binary, args, output_file, jvm_debug_port=None):
   """Wrapper around run_daemon() with options determined from command-line options."""
   env_vars = {"JAVA_TOOL_OPTIONS": build_java_tool_options(jvm_debug_port)}
+  if options.env_vars is not None:
+    for kv in options.env_vars.split():
+      k, v = kv.split('=')
+      env_vars[k] = v
   run_daemon(daemon_binary, args, build_type=options.build_type, env_vars=env_vars,
       output_file=output_file)
 
diff --git a/fe/src/compat-hive-2/java/org/apache/impala/compat/MetastoreShim.java b/fe/src/compat-hive-2/java/org/apache/impala/compat/MetastoreShim.java
index 83c95df..667d883 100644
--- a/fe/src/compat-hive-2/java/org/apache/impala/compat/MetastoreShim.java
+++ b/fe/src/compat-hive-2/java/org/apache/impala/compat/MetastoreShim.java
@@ -26,6 +26,7 @@ import com.google.common.collect.ImmutableMap;
 import java.util.EnumSet;
 import java.util.List;
 
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.common.ValidWriteIdList;
@@ -37,6 +38,7 @@ import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.Warehouse;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.InvalidInputException;
 import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
 import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
@@ -449,4 +451,18 @@ public class MetastoreShim {
   public static long getMajorVersion() {
     return 2;
   }
+
+  /**
+   * Return the default table path.
+   *
+   * Hive-3 doesn't allow managed table to be non transactional after HIVE-22158.
+   * Creating a non transactional managed table will finally result in an external table
+   * with table property "external.table.purge" set to true. As the table type become
+   * EXTERNAL, the location will be under "metastore.warehouse.external.dir" (HIVE-19837,
+   * introduces in hive-2.7, not in hive-2.1.x-cdh6.x yet).
+   */
+  public static String getNonAcidTablePath(Database db, String tableName)
+      throws MetaException {
+    return new Path(db.getLocationUri(), tableName).toString();
+  }
 }
diff --git a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
index da33946..abdf321 100644
--- a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
+++ b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
 import org.apache.hadoop.hive.common.ValidTxnList;
 import org.apache.hadoop.hive.common.ValidTxnWriteIdList;
 import org.apache.hadoop.hive.common.ValidWriteIdList;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
 import org.apache.hadoop.hive.metastore.IMetaStoreClient;
 import org.apache.hadoop.hive.metastore.LockRequestBuilder;
@@ -47,6 +48,7 @@ import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.Warehouse;
 import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.InvalidInputException;
 import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
@@ -67,7 +69,6 @@ import org.apache.hadoop.hive.metastore.api.TxnAbortedException;
 import org.apache.hadoop.hive.metastore.conf.MetastoreConf;
 import org.apache.hadoop.hive.metastore.messaging.AlterTableMessage;
 import org.apache.hadoop.hive.metastore.messaging.EventMessage;
-import org.apache.hadoop.hive.metastore.messaging.InsertMessage;
 import org.apache.hadoop.hive.metastore.messaging.MessageBuilder;
 import org.apache.hadoop.hive.metastore.messaging.MessageDeserializer;
 import org.apache.hadoop.hive.metastore.messaging.MessageEncoder;
@@ -84,8 +85,6 @@ import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.ImpalaRuntimeException;
 import org.apache.impala.common.Pair;
 import org.apache.impala.common.TransactionException;
-import org.apache.impala.common.TransactionKeepalive;
-import org.apache.impala.compat.HiveMetadataFormatUtils;
 import org.apache.impala.service.BackendConfig;
 import org.apache.impala.service.Frontend;
 import org.apache.impala.service.MetadataOp;
@@ -932,4 +931,21 @@ public class MetastoreShim {
      }
   }
 
+  /**
+   * Return the default table path.
+   *
+   * Hive-3 doesn't allow managed table to be non transactional after HIVE-22158.
+   * Creating a non transactional managed table will finally result in an external table
+   * with table property "external.table.purge" set to true. As the table type become
+   * EXTERNAL, the location will be under "metastore.warehouse.external.dir" (HIVE-19837,
+   * introduces in hive-2.7, not in hive-2.1.x-cdh6.x yet).
+   */
+  public static String getNonAcidTablePath(Database db, String tableName)
+      throws MetaException {
+    Warehouse wh = new Warehouse(new HiveConf());
+    // Non ACID managed tables are all translated to external tables by HMS's default
+    // transformer (HIVE-22158).
+    // TODO(IMPALA-9088): deal with customized transformer in HMS.
+    return wh.getDefaultTablePath(db, tableName, /*isExternal*/ true).toString();
+  }
 }
diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableAsSelectStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableAsSelectStmt.java
index 35ff006..e0a632f 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableAsSelectStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableAsSelectStmt.java
@@ -22,8 +22,6 @@ import java.util.Collections;
 import java.util.EnumSet;
 import java.util.List;
 
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.impala.authorization.Privilege;
 import org.apache.impala.catalog.FeDb;
 import org.apache.impala.catalog.FeFsTable;
@@ -33,6 +31,7 @@ import org.apache.impala.catalog.HdfsFileFormat;
 import org.apache.impala.catalog.KuduTable;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.common.AnalysisException;
+import org.apache.impala.compat.MetastoreShim;
 import org.apache.impala.rewrite.ExprRewriter;
 import org.apache.impala.service.CatalogOpExecutor;
 import org.apache.impala.thrift.THdfsFileFormat;
@@ -215,7 +214,8 @@ public class CreateTableAsSelectStmt extends StatementBase {
       // Set a valid location of this table using the same rules as the metastore, unless
       // the user specified a path.
       if (msTbl.getSd().getLocation() == null || msTbl.getSd().getLocation().isEmpty()) {
-        msTbl.getSd().setLocation(getPathForNewTable(db, msTbl));
+        msTbl.getSd().setLocation(MetastoreShim.getNonAcidTablePath(
+            db.getMetaStoreDb(), msTbl.getTableName().toLowerCase()));
       }
 
       FeTable tmpTable = null;
@@ -238,11 +238,6 @@ public class CreateTableAsSelectStmt extends StatementBase {
     insertStmt_.analyze(analyzer);
   }
 
-  private static String getPathForNewTable(FeDb db, Table msTbl) {
-    String dbLocation = db.getMetaStoreDb().getLocationUri();
-    return new Path(dbLocation, msTbl.getTableName().toLowerCase()).toString();
-  }
-
   @Override
   public List<Expr> getResultExprs() { return insertStmt_.getResultExprs(); }
 
diff --git a/fe/src/test/resources/hive-site.xml.py b/fe/src/test/resources/hive-site.xml.py
index d8acccd..e9f902f 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -21,6 +21,7 @@ import os
 
 hive_major_version = int(os.environ['IMPALA_HIVE_VERSION'][0])
 kerberize = os.environ.get('IMPALA_KERBERIZE') == '1'
+variant = os.environ.get('HIVE_VARIANT')
 
 CONFIG = {
   'dfs.replication': '3'
@@ -59,6 +60,11 @@ CONFIG.update({
   'hive.support.concurrency': 'true',
 })
 
+if variant == 'changed_external_dir':
+  CONFIG.update({
+    'hive.metastore.warehouse.external.dir': '${WAREHOUSE_LOCATION_PREFIX}/test-warehouse-external',
+  })
+
 # HBase-related configs.
 # Impala processes need to connect to zookeeper on INTERNAL_LISTEN_HOST for HBase.
 CONFIG.update({
diff --git a/tests/common/custom_cluster_test_suite.py b/tests/common/custom_cluster_test_suite.py
index edae090..ab2c627 100644
--- a/tests/common/custom_cluster_test_suite.py
+++ b/tests/common/custom_cluster_test_suite.py
@@ -43,6 +43,7 @@ KUDU_ARGS = 'kudu_args'
 START_ARGS = 'start_args'
 SENTRY_CONFIG = 'sentry_config'
 SENTRY_LOG_DIR = 'sentry_log_dir'
+HIVE_CONF_DIR = 'hive_conf_dir'
 CLUSTER_SIZE = "cluster_size"
 # Default query options passed to the impala daemon command line. Handled separately from
 # other impala daemon arguments to allow merging multiple defaults into a single list.
@@ -103,7 +104,7 @@ class CustomClusterTestSuite(ImpalaTestSuite):
   @staticmethod
   def with_args(impalad_args=None, statestored_args=None, catalogd_args=None,
       start_args=None, sentry_config=None, default_query_options=None,
-      impala_log_dir=None, sentry_log_dir=None, cluster_size=None,
+      impala_log_dir=None, sentry_log_dir=None, hive_conf_dir=None, cluster_size=None,
       num_exclusive_coordinators=None, kudu_args=None, statestored_timeout_s=None,
       impalad_timeout_s=None):
     """Records arguments to be passed to a cluster by adding them to the decorated
@@ -120,6 +121,8 @@ class CustomClusterTestSuite(ImpalaTestSuite):
         func.func_dict[SENTRY_CONFIG] = sentry_config
       if sentry_log_dir is not None:
         func.func_dict[SENTRY_LOG_DIR] = sentry_log_dir
+      if hive_conf_dir is not None:
+        func.func_dict[HIVE_CONF_DIR] = hive_conf_dir
       if kudu_args is not None:
         func.func_dict[KUDU_ARGS] = kudu_args
       if default_query_options is not None:
@@ -145,6 +148,14 @@ class CustomClusterTestSuite(ImpalaTestSuite):
     if START_ARGS in method.func_dict:
       cluster_args.extend(method.func_dict[START_ARGS])
 
+    if HIVE_CONF_DIR in method.func_dict:
+      self._start_hive_service(method.func_dict[HIVE_CONF_DIR])
+      # Should let Impala adopt the same hive-site.xml. The only way is to add it in the
+      # beginning of the CLASSPATH. Because there's already a hive-site.xml in the
+      # default CLASSPATH (see bin/set-classpath.sh).
+      cluster_args.append(
+        '--env_vars=CUSTOM_CLASSPATH=%s ' % method.func_dict[HIVE_CONF_DIR])
+
     if KUDU_ARGS in method.func_dict:
       self._restart_kudu_service(method.func_dict[KUDU_ARGS])
 
@@ -180,6 +191,8 @@ class CustomClusterTestSuite(ImpalaTestSuite):
     super(CustomClusterTestSuite, self).setup_class()
 
   def teardown_method(self, method):
+    if HIVE_CONF_DIR in method.func_dict:
+      self._start_hive_service(None)  # Restart Hive Service using default configs
     super(CustomClusterTestSuite, self).teardown_class()
 
   @classmethod
@@ -224,6 +237,24 @@ class CustomClusterTestSuite(ImpalaTestSuite):
                           close_fds=True)
 
   @classmethod
+  def _start_hive_service(cls, hive_conf_dir):
+    hive_env = dict(os.environ)
+    if hive_conf_dir is not None:
+      hive_env['HIVE_CONF_DIR'] = hive_conf_dir
+    call = subprocess.Popen(
+      ['/bin/bash', '-c', os.path.join(IMPALA_HOME, 'testdata/bin/run-hive-server.sh')],
+      env=hive_env)
+    call.wait()
+    if call.returncode != 0:
+      raise RuntimeError("Unable to start Hive")
+
+  @classmethod
+  def _stop_hive_service(cls):
+    subprocess.check_call([os.path.join(IMPALA_HOME,
+                                        "testdata/bin/kill-hive-server.sh")],
+                          close_fds=True)
+
+  @classmethod
   def _start_impala_cluster(cls,
                             options,
                             impala_log_dir=os.getenv('LOG_DIR', "/tmp/"),
diff --git a/tests/custom_cluster/test_custom_hive_configs.py b/tests/custom_cluster/test_custom_hive_configs.py
new file mode 100644
index 0000000..74186bd
--- /dev/null
+++ b/tests/custom_cluster/test_custom_hive_configs.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+import pytest
+from os import getenv
+
+HIVE_SITE_EXT_DIR = getenv('IMPALA_HOME') + '/fe/src/test/resources/hive-site-ext'
+
+
+class TestCustomHiveConfigs(CustomClusterTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def setup_class(cls):
+    super(TestCustomHiveConfigs, cls).setup_class()
+
+  # TODO: Remove the xfail marker after bumping CDP_BUILD_NUMBER to contain HIVE-22158
+  @pytest.mark.xfail(run=True, reason="May fail on Hive3 versions without HIVE-22158")
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(hive_conf_dir=HIVE_SITE_EXT_DIR)
+  def test_ctas_read_write_consistence(self, unique_database):
+    """
+    IMPALA-9071: Check that CTAS inserts data to the correct directory when
+    'metastore.warehouse.external.dir' is different from 'metastore.warehouse.dir'
+    in Hive.
+    """
+    self.execute_query_expect_success(
+        self.client, 'create table %s.ctas_tbl as select 1, 2, "name"' %
+                     unique_database)
+    res = self.execute_query_expect_success(
+        self.client, 'select * from %s.ctas_tbl' % unique_database)
+    assert '1\t2\tname' == res.get_data()
+
+    self.execute_query_expect_success(
+        self.client, 'create external table %s.ctas_ext_tbl as select 1, 2, "name"' %
+                     unique_database)
+    # Set "external.table.purge"="true" so we can clean files of the external table
+    # finally.
+    self.execute_query_expect_success(
+        self.client, 'alter table %s.ctas_ext_tbl set tblproperties'
+                     '("external.table.purge"="true")' % unique_database)
+    res = self.execute_query_expect_success(
+        self.client, 'select * from %s.ctas_ext_tbl' % unique_database)
+    assert '1\t2\tname' == res.get_data()
+
+    # Explicitly drop the database with CASCADE to clean files of the external table
+    self.execute_query_expect_success(
+        self.client, 'drop database if exists cascade' + unique_database)