You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/05/15 17:08:49 UTC
[impala] 01/06: IMPALA-8369: Fixing some core tests in Hive
environment
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 9dd8d8241a6f3b20d4625560416498dc02498945
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed May 8 20:46:38 2019 +0200
IMPALA-8369: Fixing some core tests in Hive environment
Fixes:
impala_test_suite.py:
DROP PARTITIONS in the SETUP section of test files did
not work with Hive 3, because 'max_parts' argument of
hive_client.get_partition_names() was 0, while it should
be -1 to return all partitions. The issue broke several
'insert' tests.
Hive 2 used to return all partitions with argument 0 too
but Hive 3 changed this to be more consistent, see HIVE-18567.
load_nested.py:
query/test_mt_dop.py:test_parquet_filtering and several planner
tests were broken because Hive 3 generates different number of
files for tpch_nested_parquet.customer than Hive 2. The fix is to
split the loading of this table to two inserts on Hive 3 in order
to produce an extra file.
Change-Id: I45d9b9312c6c77f436ab020ae68c15f3c7c737de
Reviewed-on: http://gerrit.cloudera.org:8080/13283
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
---
testdata/bin/load_nested.py | 50 ++++++++++++++++++++++++++++-----------
tests/common/environ.py | 2 ++
tests/common/impala_test_suite.py | 2 +-
3 files changed, 39 insertions(+), 15 deletions(-)
diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py
index be0dc13..d0f9066 100755
--- a/testdata/bin/load_nested.py
+++ b/testdata/bin/load_nested.py
@@ -24,6 +24,7 @@ import logging
import os
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+from tests.common.environ import HIVE_MAJOR_VERSION
import tests.comparison.cli_options as cli_options
@@ -292,27 +293,48 @@ def load():
# Hive is used to convert the data into parquet/orc and drop all the temp tables.
# The Hive SET values are necessary to prevent Impala remote reads of parquet files.
# These values are taken from http://blog.cloudera.com/blog/2014/12/the-impala-cookbook.
- with cluster.hive.cursor(db_name=target_db) as hive:
- LOG.info("Converting temp tables")
- for stmt in """
- SET mapred.min.split.size=1073741824;
- SET parquet.block.size=10737418240;
- SET dfs.block.size=1073741824;
-
+ create_final_tables_sql = """
+ SET mapred.min.split.size=1073741824;
+ SET parquet.block.size=10737418240;
+ SET dfs.block.size=1073741824;
+
+ CREATE TABLE region
+ STORED AS {file_format}
+ TBLPROPERTIES('{compression_key}'='{compression_value}')
+ AS SELECT * FROM tmp_region;
+
+ CREATE TABLE supplier
+ STORED AS {file_format}
+ TBLPROPERTIES('{compression_key}'='{compression_value}')
+ AS SELECT * FROM tmp_supplier;"""
+
+ # A simple CTAS for tpch_nested_parquet.customer would create 3 files with Hive3 vs
+ # 4 files with Hive2. This difference would break several tests, and it seemed
+ # easier to hack the loading of the table than to add Hive version specific behavior
+ # for each affected test. A small part of the table is inserted in a separate statement
+ # to generate the +1 file (needs hive.merge.tezfiles to avoid creating +3 files).
+  # TODO: find a less hacky way to ensure a fixed number of files
+ if HIVE_MAJOR_VERSION >= 3 and file_format == "parquet":
+ create_final_tables_sql += """
CREATE TABLE customer
STORED AS {file_format}
TBLPROPERTIES('{compression_key}'='{compression_value}')
- AS SELECT * FROM tmp_customer;
+ AS SELECT * FROM tmp_customer
+ WHERE c_custkey >= 10;
- CREATE TABLE region
+ INSERT INTO customer
+ SELECT * FROM tmp_customer
+ WHERE c_custkey < 10;"""
+ else:
+ create_final_tables_sql += """
+ CREATE TABLE customer
STORED AS {file_format}
TBLPROPERTIES('{compression_key}'='{compression_value}')
- AS SELECT * FROM tmp_region;
+ AS SELECT * FROM tmp_customer;"""
- CREATE TABLE supplier
- STORED AS {file_format}
- TBLPROPERTIES('{compression_key}'='{compression_value}')
- AS SELECT * FROM tmp_supplier;""".format(**sql_params).split(";"):
+ with cluster.hive.cursor(db_name=target_db) as hive:
+ LOG.info("Converting temp tables")
+ for stmt in create_final_tables_sql.format(**sql_params).split(";"):
if not stmt.strip():
continue
LOG.info("Executing: {0}".format(stmt))
diff --git a/tests/common/environ.py b/tests/common/environ.py
index 5b92755..30805e7 100644
--- a/tests/common/environ.py
+++ b/tests/common/environ.py
@@ -57,6 +57,8 @@ if docker_network_search_result is not None:
docker_network = docker_network_search_result.groups()[0]
IS_DOCKERIZED_TEST_CLUSTER = docker_network is not None
+HIVE_MAJOR_VERSION = int(os.environ.get("IMPALA_HIVE_MAJOR_VERSION"))
+
# Resolve any symlinks in the path.
impalad_basedir = \
os.path.realpath(os.path.join(IMPALA_HOME, 'be/build', build_type_dir)).rstrip('/')
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index 6630b3f..98e7d71 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -735,7 +735,7 @@ class ImpalaTestSuite(BaseTestSuite):
def __drop_partitions(self, db_name, table_name):
"""Drops all partitions in the given table"""
- for partition in self.hive_client.get_partition_names(db_name, table_name, 0):
+ for partition in self.hive_client.get_partition_names(db_name, table_name, -1):
assert self.hive_client.drop_partition_by_name(db_name, table_name, \
partition, True), 'Could not drop partition: %s' % partition