You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by lv...@apache.org on 2017/05/09 15:55:56 UTC
[02/13] incubator-impala git commit: IMPALA-5287: Test skip.header.line.count on gzip
IMPALA-5287: Test skip.header.line.count on gzip
This change fixed IMPALA-4873 by adding the capability to supply a dict
'test_file_vars' to run_test_case(). Keys in this dict will be replaced
with their values inside test queries before they are executed.
Change-Id: Ie3f3c29a42501cfb2751f7ad0af166eb88f63b70
Reviewed-on: http://gerrit.cloudera.org:8080/6817
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12f3ecce
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12f3ecce
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12f3ecce
Branch: refs/heads/master
Commit: 12f3ecceabc5a7cdf401956376ebcd483d0c2376
Parents: fd62a7f
Author: Lars Volker <lv...@cloudera.com>
Authored: Sat May 6 22:17:05 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue May 9 01:36:46 2017 +0000
----------------------------------------------------------------------
testdata/bin/generate-schema-statements.py | 3 +-
testdata/data/README | 19 ++++++--
testdata/data/table_with_header.gz | Bin 0 -> 64 bytes
testdata/data/table_with_header_2.gz | Bin 0 -> 82 bytes
.../functional/functional_schema_template.sql | 4 ++
.../datasets/functional/schema_constraints.csv | 4 ++
.../QueryTest/hdfs-text-scan-with-header.test | 48 ++++++++++---------
tests/common/impala_test_suite.py | 14 +++++-
tests/query_test/test_scanners.py | 22 +++++----
9 files changed, 75 insertions(+), 39 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index a214822..fdb9c64 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -359,7 +359,8 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for
insert_statement = insert.format(db_name=db_name,
db_suffix=db_suffix,
table_name=table_name,
- hdfs_location=hdfs_path)
+ hdfs_location=hdfs_path,
+ impala_home = os.getenv("IMPALA_HOME"))
# Kudu tables are managed and don't support OVERWRITE, so we replace OVERWRITE
# with INTO to make this a regular INSERT.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 465d80b..23b0586 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -68,24 +68,23 @@ first rowgroup column metadata for 'int_array' incorrectly states there are 50 v
(instead of 100), and the second rowgroup column metadata for 'id' incorrectly states
there are 11 values (instead of 10). The third rowgroup has the correct metadata.
-data-bzip2.bz2
+data-bzip2.bz2:
Generated with bzip2, contains single bzip2 stream
Contains 1 column, uncompressed data size < 8M
-large_bzip2.bz2
+large_bzip2.bz2:
Generated with bzip2, contains single bzip2 stream
Contains 1 column, uncompressed data size > 8M
-data-pbzip2.bz2
+data-pbzip2.bz2:
Generated with pbzip2, contains multiple bzip2 streams
Contains 1 column, uncompressed data size < 8M
-large_pbzip2.bz2
+large_pbzip2.bz2:
Generated with pbzip2, contains multiple bzip2 streams
Contains 1 column, uncompressed data size > 8M
out_of_range_timestamp.parquet:
------------
Generated with a hacked version of Impala parquet writer.
Contains a single timestamp column with 4 values, 2 of which are out of range
and should be read as NULL by Impala:
@@ -93,3 +92,13 @@ and should be read as NULL by Impala:
1400-01-01 00:00:00
9999-12-31 00:00:00
10000-01-01 00:00:00 (invalid - date too large)
+
+table_with_header.csv:
+Created with a text editor, contains a header line before the data rows.
+
+table_with_header_2.csv:
+Created with a text editor, contains two header lines before the data rows.
+
+table_with_header.gz, table_with_header_2.gz:
+Generated by gzip'ing table_with_header.csv and table_with_header_2.csv.
+
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header.gz
----------------------------------------------------------------------
diff --git a/testdata/data/table_with_header.gz b/testdata/data/table_with_header.gz
new file mode 100644
index 0000000..a7c86df
Binary files /dev/null and b/testdata/data/table_with_header.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header_2.gz
----------------------------------------------------------------------
diff --git a/testdata/data/table_with_header_2.gz b/testdata/data/table_with_header_2.gz
new file mode 100644
index 0000000..d8600fd
Binary files /dev/null and b/testdata/data/table_with_header_2.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 1aacf43..e7b8a07 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -2088,6 +2088,8 @@ delimited fields terminated by ',' escaped by '\\'
ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='1');
---- LOAD
LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
====
---- DATASET
functional
@@ -2102,6 +2104,8 @@ delimited fields terminated by ',' escaped by '\\'
ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='2');
---- LOAD
LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
====
---- DATASET
functional
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index d6d1111..bb3487f 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -189,6 +189,10 @@ table_name:nullescapedtable, constraint:only, table_format:kudu/none/none
table_name:table_with_header, constraint:restrict_to, table_format:text/none/none
table_name:table_with_header_2, constraint:restrict_to, table_format:text/none/none
table_name:table_with_header_insert, constraint:restrict_to, table_format:text/none/none
+# We also test that skipping header lines works on compressed tables (IMPALA-5287)
+table_name:table_with_header, constraint:restrict_to, table_format:text/gzip/block
+table_name:table_with_header_2, constraint:restrict_to, table_format:text/gzip/block
+table_name:table_with_header_insert, constraint:restrict_to, table_format:text/gzip/block
# Inserting into parquet tables should not be affected by the 'skip.header.line.count'
# property, so we test parquet format as well.
table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
index 4aab121..d5f92f7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
@@ -1,7 +1,7 @@
====
---- QUERY
set max_scan_range_length=0;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
---- RESULTS
1,2
3,4
@@ -11,7 +11,7 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=0;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
---- RESULTS
3
---- TYPES
@@ -19,7 +19,7 @@ BIGINT
====
---- QUERY
set max_scan_range_length=2;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
---- RESULTS
1,2
3,4
@@ -29,7 +29,7 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=2;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
---- RESULTS
3
---- TYPES
@@ -37,7 +37,7 @@ BIGINT
====
---- QUERY
set max_scan_range_length=30;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
---- RESULTS
1,2
3,4
@@ -47,7 +47,7 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=30;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
---- RESULTS
3
---- TYPES
@@ -55,7 +55,7 @@ BIGINT
====
---- QUERY
set max_scan_range_length=0;
-select c1, c2 from functional.table_with_header_2
+select c1, c2 from table_with_header_2
---- RESULTS
1,2
3,4
@@ -65,13 +65,15 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=0;
-select count(*) from functional.table_with_header_2
+select count(*) from table_with_header_2
---- RESULTS
3
---- TYPES
BIGINT
====
---- QUERY
+# This test is only supported on uncompressed tables, since we always only issue one
+# single scan range for a compressed file.
set max_scan_range_length=2;
set abort_on_error=1;
select c1, c2 from functional.table_with_header_2
@@ -81,6 +83,8 @@ increasing max_scan_range_length to a value larger than the size of the file's h
INT,DOUBLE
====
---- QUERY
+# This test is only supported on uncompressed tables, since we always only issue one
+# single scan range for a compressed file.
set max_scan_range_length=2;
set abort_on_error=0;
select c1, c2 from functional.table_with_header_2
@@ -91,7 +95,7 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=30;
-select c1, c2 from functional.table_with_header_2
+select c1, c2 from table_with_header_2
---- RESULTS
1,2
3,4
@@ -101,24 +105,24 @@ INT,DOUBLE
====
---- QUERY
set max_scan_range_length=30;
-select count(*) from functional.table_with_header_2
+select count(*) from table_with_header_2
---- RESULTS
3
---- TYPES
BIGINT
====
---- QUERY
-drop table if exists mixed;
-create table mixed (kf smallint) partitioned by (year smallint) stored as textfile;
-alter table mixed add partition (year=2012);
-alter table mixed add partition (year=2013);
-alter table mixed partition (year=2013) set fileformat parquet;
-insert into mixed partition (year=2012) values (1),(2),(3);
-insert into mixed partition (year=2013) values (4),(5),(6);
-alter table mixed set tblproperties("skip.header.line.count"="1");
-alter table mixed set fileformat parquet;
-alter table mixed set tblproperties("skip.header.line.count"="2");
-select * from mixed;
+drop table if exists $UNIQUE_DB.mixed;
+create table $UNIQUE_DB.mixed (kf smallint) partitioned by (year smallint) stored as textfile;
+alter table $UNIQUE_DB.mixed add partition (year=2012);
+alter table $UNIQUE_DB.mixed add partition (year=2013);
+alter table $UNIQUE_DB.mixed partition (year=2013) set fileformat parquet;
+insert into $UNIQUE_DB.mixed partition (year=2012) values (1),(2),(3);
+insert into $UNIQUE_DB.mixed partition (year=2013) values (4),(5),(6);
+alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="1");
+alter table $UNIQUE_DB.mixed set fileformat parquet;
+alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="2");
+select * from $UNIQUE_DB.mixed;
---- RESULTS
3,2012
4,2013
@@ -128,5 +132,5 @@ select * from mixed;
SMALLINT,SMALLINT
====
---- QUERY
-drop table mixed;
+drop table $UNIQUE_DB.mixed;
====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index af0ed1d..f7602af 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -274,7 +274,7 @@ class ImpalaTestSuite(BaseTestSuite):
def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
- encoding=None):
+ encoding=None, test_file_vars=None):
"""
Runs the queries in the specified test based on the vector values
@@ -285,6 +285,9 @@ class ImpalaTestSuite(BaseTestSuite):
Additionally, the encoding for all test data can be specified using the 'encoding'
parameter. This is useful when data is ingested in a different encoding (ex.
latin). If not set, the default system encoding will be used.
+ If a dict 'test_file_vars' is provided, then all keys will be replaced with their
+ values in queries before they are executed. Callers need to avoid using reserved key
+ names, see 'reserved_keywords' below.
"""
table_format_info = vector.get_value('table_format')
exec_options = vector.get_value('exec_option')
@@ -336,6 +339,15 @@ class ImpalaTestSuite(BaseTestSuite):
.replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or str()))
if use_db: query = query.replace('$DATABASE', use_db)
+ reserved_keywords = ["$DATABASE", "$FILESYSTEM_PREFIX", "$GROUP_NAME",
+ "$IMPALA_HOME", "$NAMENODE", "$QUERY", "$SECONDARY_FILESYSTEM"]
+
+ if test_file_vars:
+ for key, value in test_file_vars.iteritems():
+ if key in reserved_keywords:
+ raise RuntimeError("Key {0} is reserved".format(key))
+ query = query.replace(key, value)
+
if 'QUERY_NAME' in test_section:
LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index b0e2e80..5dbe02a 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -122,7 +122,7 @@ class TestUnmatchedSchema(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
# Avro has a more advanced schema evolution process which is covered in more depth
# in the test_avro_schema_evolution test suite.
- cls.ImpalaTestMatrix.add_constraint(\
+ cls.ImpalaTestMatrix.add_constraint(
lambda v: v.get_value('table_format').file_format != 'avro')
def _create_test_table(self, vector):
@@ -574,8 +574,8 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
super(TestTextScanRangeLengths, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))
- cls.ImpalaTestMatrix.add_constraint(lambda v:\
- v.get_value('table_format').file_format == 'text' and\
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none')
def test_text_scanner(self, vector):
@@ -605,8 +605,8 @@ class TestTextSplitDelimiters(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestTextSplitDelimiters, cls).add_test_dimensions()
- cls.ImpalaTestMatrix.add_constraint(lambda v:\
- v.get_value('table_format').file_format == 'text' and\
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none')
def test_text_split_delimiters(self, vector, unique_database):
@@ -682,11 +682,13 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestTextScanRangeLengths, cls).add_test_dimensions()
- cls.ImpalaTestMatrix.add_constraint(
- lambda v: v.get_value('table_format').file_format == 'text')
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'text' and
+ v.get_value('table_format').compression_codec in ['none', 'gzip'])
def test_text_scanner_with_header(self, vector, unique_database):
- self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector, unique_database)
+ self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector,
+ test_file_vars={'$UNIQUE_DB': unique_database})
# Missing Coverage: No coverage for truncated files errors or scans.
@@ -708,8 +710,8 @@ class TestScanTruncatedFiles(ImpalaTestSuite):
# strategy.
# TODO: Test other file formats
if cls.exploration_strategy() == 'exhaustive':
- cls.ImpalaTestMatrix.add_constraint(lambda v:\
- v.get_value('table_format').file_format == 'text' and\
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none')
else:
cls.ImpalaTestMatrix.add_constraint(lambda v: False)