You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/15 17:08:49 UTC

[impala] 01/06: IMPALA-8369: Fixing some core tests in Hive environment

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 9dd8d8241a6f3b20d4625560416498dc02498945
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed May 8 20:46:38 2019 +0200

    IMPALA-8369: Fixing some core tests in Hive environment
    
    Fixes:
    impala_test_suite.py:
      DROP PARTITIONS in the SETUP section of test files did
      not work with Hive 3, because 'max_parts' argument of
      hive_client.get_partition_names() was 0, while it should
      be -1 to return all partitions. The issue broke several
      'insert' tests.
      Hive 2 used to return all partitions with argument 0 too
      but Hive 3 changed this to be more consistent, see HIVE-18567.
    load_nested.py:
      query/test_mt_dop.py:test_parquet_filtering and several planner
      tests were broken because Hive 3 generates different number of
      files for tpch_nested_parquet.customer than Hive 2. The fix is to
      split the loading of this table to two inserts on Hive 3 in order
      to produce an extra file.
    
    Change-Id: I45d9b9312c6c77f436ab020ae68c15f3c7c737de
    Reviewed-on: http://gerrit.cloudera.org:8080/13283
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
---
 testdata/bin/load_nested.py       | 50 ++++++++++++++++++++++++++++-----------
 tests/common/environ.py           |  2 ++
 tests/common/impala_test_suite.py |  2 +-
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py
index be0dc13..d0f9066 100755
--- a/testdata/bin/load_nested.py
+++ b/testdata/bin/load_nested.py
@@ -24,6 +24,7 @@ import logging
 import os
 
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+from tests.common.environ import HIVE_MAJOR_VERSION
 import tests.comparison.cli_options as cli_options
 
 
@@ -292,27 +293,48 @@ def load():
   # Hive is used to convert the data into parquet/orc and drop all the temp tables.
   # The Hive SET values are necessary to prevent Impala remote reads of parquet files.
   # These values are taken from http://blog.cloudera.com/blog/2014/12/the-impala-cookbook.
-  with cluster.hive.cursor(db_name=target_db) as hive:
-    LOG.info("Converting temp tables")
-    for stmt in """
-        SET mapred.min.split.size=1073741824;
-        SET parquet.block.size=10737418240;
-        SET dfs.block.size=1073741824;
-
+  create_final_tables_sql = """
+      SET mapred.min.split.size=1073741824;
+      SET parquet.block.size=10737418240;
+      SET dfs.block.size=1073741824;
+
+      CREATE TABLE region
+      STORED AS {file_format}
+      TBLPROPERTIES('{compression_key}'='{compression_value}')
+      AS SELECT * FROM tmp_region;
+
+      CREATE TABLE supplier
+      STORED AS {file_format}
+      TBLPROPERTIES('{compression_key}'='{compression_value}')
+      AS SELECT * FROM tmp_supplier;"""
+
+  # A simple CTAS for tpch_nested_parquet.customer would create 3 files with Hive3 vs
+  # 4 files with Hive2. This difference would break several tests, and it seemed
+  # easier to hack the loading of the table than to add Hive version specific behavior
+  # for each affected test. A small part of the table is inserted in a separate statement
+  # to generate the +1 file (needs hive.merge.tezfiles to avoid creating +3 files).
+  # TODO: find a less hacky way to ensure a fix number of files
+  if HIVE_MAJOR_VERSION >= 3 and file_format == "parquet":
+    create_final_tables_sql += """
         CREATE TABLE customer
         STORED AS {file_format}
         TBLPROPERTIES('{compression_key}'='{compression_value}')
-        AS SELECT * FROM tmp_customer;
+        AS SELECT * FROM tmp_customer
+        WHERE c_custkey >= 10;
 
-        CREATE TABLE region
+        INSERT INTO customer
+        SELECT * FROM tmp_customer
+        WHERE c_custkey < 10;"""
+  else:
+    create_final_tables_sql += """
+        CREATE TABLE customer
         STORED AS {file_format}
         TBLPROPERTIES('{compression_key}'='{compression_value}')
-        AS SELECT * FROM tmp_region;
+        AS SELECT * FROM tmp_customer;"""
 
-        CREATE TABLE supplier
-        STORED AS {file_format}
-        TBLPROPERTIES('{compression_key}'='{compression_value}')
-        AS SELECT * FROM tmp_supplier;""".format(**sql_params).split(";"):
+  with cluster.hive.cursor(db_name=target_db) as hive:
+    LOG.info("Converting temp tables")
+    for stmt in create_final_tables_sql.format(**sql_params).split(";"):
       if not stmt.strip():
         continue
       LOG.info("Executing: {0}".format(stmt))
diff --git a/tests/common/environ.py b/tests/common/environ.py
index 5b92755..30805e7 100644
--- a/tests/common/environ.py
+++ b/tests/common/environ.py
@@ -57,6 +57,8 @@ if docker_network_search_result is not None:
   docker_network = docker_network_search_result.groups()[0]
 IS_DOCKERIZED_TEST_CLUSTER = docker_network is not None
 
+HIVE_MAJOR_VERSION = int(os.environ.get("IMPALA_HIVE_MAJOR_VERSION"))
+
 # Resolve any symlinks in the path.
 impalad_basedir = \
     os.path.realpath(os.path.join(IMPALA_HOME, 'be/build', build_type_dir)).rstrip('/')
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index 6630b3f..98e7d71 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -735,7 +735,7 @@ class ImpalaTestSuite(BaseTestSuite):
 
   def __drop_partitions(self, db_name, table_name):
     """Drops all partitions in the given table"""
-    for partition in self.hive_client.get_partition_names(db_name, table_name, 0):
+    for partition in self.hive_client.get_partition_names(db_name, table_name, -1):
       assert self.hive_client.drop_partition_by_name(db_name, table_name, \
           partition, True), 'Could not drop partition: %s' % partition