You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by lv...@apache.org on 2017/05/09 15:55:56 UTC

[02/13] incubator-impala git commit: IMPALA-5287: Test skip.header.line.count on gzip

IMPALA-5287: Test skip.header.line.count on gzip

This change fixes IMPALA-4873 by adding the capability to supply a dict
'test_file_vars' to run_test_case(). Keys in this dict will be replaced
with their values inside test queries before they are executed.

Change-Id: Ie3f3c29a42501cfb2751f7ad0af166eb88f63b70
Reviewed-on: http://gerrit.cloudera.org:8080/6817
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12f3ecce
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12f3ecce
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12f3ecce

Branch: refs/heads/master
Commit: 12f3ecceabc5a7cdf401956376ebcd483d0c2376
Parents: fd62a7f
Author: Lars Volker <lv...@cloudera.com>
Authored: Sat May 6 22:17:05 2017 +0200
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue May 9 01:36:46 2017 +0000

----------------------------------------------------------------------
 testdata/bin/generate-schema-statements.py      |   3 +-
 testdata/data/README                            |  19 ++++++--
 testdata/data/table_with_header.gz              | Bin 0 -> 64 bytes
 testdata/data/table_with_header_2.gz            | Bin 0 -> 82 bytes
 .../functional/functional_schema_template.sql   |   4 ++
 .../datasets/functional/schema_constraints.csv  |   4 ++
 .../QueryTest/hdfs-text-scan-with-header.test   |  48 ++++++++++---------
 tests/common/impala_test_suite.py               |  14 +++++-
 tests/query_test/test_scanners.py               |  22 +++++----
 9 files changed, 75 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index a214822..fdb9c64 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -359,7 +359,8 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for
   insert_statement = insert.format(db_name=db_name,
                                    db_suffix=db_suffix,
                                    table_name=table_name,
-                                   hdfs_location=hdfs_path)
+                                   hdfs_location=hdfs_path,
+                                   impala_home = os.getenv("IMPALA_HOME"))
 
   # Kudu tables are managed and don't support OVERWRITE, so we replace OVERWRITE
   # with INTO to make this a regular INSERT.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 465d80b..23b0586 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -68,24 +68,23 @@ first rowgroup column metadata for 'int_array' incorrectly states there are 50 v
 (instead of 100), and the second rowgroup column metadata for 'id' incorrectly states
 there are 11 values (instead of 10). The third rowgroup has the correct metadata.
 
-data-bzip2.bz2
+data-bzip2.bz2:
 Generated with bzip2, contains single bzip2 stream
 Contains 1 column, uncompressed data size < 8M
 
-large_bzip2.bz2
+large_bzip2.bz2:
 Generated with bzip2, contains single bzip2 stream
 Contains 1 column, uncompressed data size > 8M
 
-data-pbzip2.bz2
+data-pbzip2.bz2:
 Generated with pbzip2, contains multiple bzip2 streams
 Contains 1 column, uncompressed data size < 8M
 
-large_pbzip2.bz2
+large_pbzip2.bz2:
 Generated with pbzip2, contains multiple bzip2 streams
 Contains 1 column, uncompressed data size > 8M
 
 out_of_range_timestamp.parquet:
------------
 Generated with a hacked version of Impala parquet writer.
 Contains a single timestamp column with 4 values, 2 of which are out of range
 and should be read as NULL by Impala:
@@ -93,3 +92,13 @@ and should be read as NULL by Impala:
    1400-01-01 00:00:00
    9999-12-31 00:00:00
   10000-01-01 00:00:00 (invalid - date too large)
+
+table_with_header.csv:
+Created with a text editor, contains a header line before the data rows.
+
+table_with_header_2.csv:
+Created with a text editor, contains two header lines before the data rows.
+
+table_with_header.gz, table_with_header_2.gz:
+Generated by gzip'ing table_with_header.csv and table_with_header_2.csv.
+

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header.gz
----------------------------------------------------------------------
diff --git a/testdata/data/table_with_header.gz b/testdata/data/table_with_header.gz
new file mode 100644
index 0000000..a7c86df
Binary files /dev/null and b/testdata/data/table_with_header.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/data/table_with_header_2.gz
----------------------------------------------------------------------
diff --git a/testdata/data/table_with_header_2.gz b/testdata/data/table_with_header_2.gz
new file mode 100644
index 0000000..d8600fd
Binary files /dev/null and b/testdata/data/table_with_header_2.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 1aacf43..e7b8a07 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -2088,6 +2088,8 @@ delimited fields terminated by ','  escaped by '\\'
 ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='1');
 ---- LOAD
 LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional
@@ -2102,6 +2104,8 @@ delimited fields terminated by ','  escaped by '\\'
 ALTER TABLE {table_name} SET TBLPROPERTIES('skip.header.line.count'='2');
 ---- LOAD
 LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/table_with_header_2.gz' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
 ---- DATASET
 functional

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index d6d1111..bb3487f 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -189,6 +189,10 @@ table_name:nullescapedtable, constraint:only, table_format:kudu/none/none
 table_name:table_with_header, constraint:restrict_to, table_format:text/none/none
 table_name:table_with_header_2, constraint:restrict_to, table_format:text/none/none
 table_name:table_with_header_insert, constraint:restrict_to, table_format:text/none/none
+# We also test that skipping header lines works on compressed tables (IMPALA-5287)
+table_name:table_with_header, constraint:restrict_to, table_format:text/gzip/block
+table_name:table_with_header_2, constraint:restrict_to, table_format:text/gzip/block
+table_name:table_with_header_insert, constraint:restrict_to, table_format:text/gzip/block
 # Inserting into parquet tables should not be affected by the 'skip.header.line.count'
 # property, so we test parquet format as well.
 table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
index 4aab121..d5f92f7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan-with-header.test
@@ -1,7 +1,7 @@
 ====
 ---- QUERY
 set max_scan_range_length=0;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
 ---- RESULTS
 1,2
 3,4
@@ -11,7 +11,7 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=0;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
 ---- RESULTS
 3
 ---- TYPES
@@ -19,7 +19,7 @@ BIGINT
 ====
 ---- QUERY
 set max_scan_range_length=2;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
 ---- RESULTS
 1,2
 3,4
@@ -29,7 +29,7 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=2;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
 ---- RESULTS
 3
 ---- TYPES
@@ -37,7 +37,7 @@ BIGINT
 ====
 ---- QUERY
 set max_scan_range_length=30;
-select c1, c2 from functional.table_with_header
+select c1, c2 from table_with_header
 ---- RESULTS
 1,2
 3,4
@@ -47,7 +47,7 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=30;
-select count(*) from functional.table_with_header
+select count(*) from table_with_header
 ---- RESULTS
 3
 ---- TYPES
@@ -55,7 +55,7 @@ BIGINT
 ====
 ---- QUERY
 set max_scan_range_length=0;
-select c1, c2 from functional.table_with_header_2
+select c1, c2 from table_with_header_2
 ---- RESULTS
 1,2
 3,4
@@ -65,13 +65,15 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=0;
-select count(*) from functional.table_with_header_2
+select count(*) from table_with_header_2
 ---- RESULTS
 3
 ---- TYPES
 BIGINT
 ====
 ---- QUERY
+# This test is only supported on uncompressed tables, since we always issue exactly
+# one scan range for a compressed file.
 set max_scan_range_length=2;
 set abort_on_error=1;
 select c1, c2 from functional.table_with_header_2
@@ -81,6 +83,8 @@ increasing max_scan_range_length to a value larger than the size of the file's h
 INT,DOUBLE
 ====
 ---- QUERY
+# This test is only supported on uncompressed tables, since we always issue exactly
+# one scan range for a compressed file.
 set max_scan_range_length=2;
 set abort_on_error=0;
 select c1, c2 from functional.table_with_header_2
@@ -91,7 +95,7 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=30;
-select c1, c2 from functional.table_with_header_2
+select c1, c2 from table_with_header_2
 ---- RESULTS
 1,2
 3,4
@@ -101,24 +105,24 @@ INT,DOUBLE
 ====
 ---- QUERY
 set max_scan_range_length=30;
-select count(*) from functional.table_with_header_2
+select count(*) from table_with_header_2
 ---- RESULTS
 3
 ---- TYPES
 BIGINT
 ====
 ---- QUERY
-drop table if exists mixed;
-create table mixed (kf smallint) partitioned by (year smallint) stored as textfile;
-alter table mixed add partition (year=2012);
-alter table mixed add partition (year=2013);
-alter table mixed partition (year=2013) set fileformat parquet;
-insert into mixed partition (year=2012) values (1),(2),(3);
-insert into mixed partition (year=2013) values (4),(5),(6);
-alter table mixed set tblproperties("skip.header.line.count"="1");
-alter table mixed set fileformat parquet;
-alter table mixed set tblproperties("skip.header.line.count"="2");
-select * from mixed;
+drop table if exists $UNIQUE_DB.mixed;
+create table $UNIQUE_DB.mixed (kf smallint) partitioned by (year smallint) stored as textfile;
+alter table $UNIQUE_DB.mixed add partition (year=2012);
+alter table $UNIQUE_DB.mixed add partition (year=2013);
+alter table $UNIQUE_DB.mixed partition (year=2013) set fileformat parquet;
+insert into $UNIQUE_DB.mixed partition (year=2012) values (1),(2),(3);
+insert into $UNIQUE_DB.mixed partition (year=2013) values (4),(5),(6);
+alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="1");
+alter table $UNIQUE_DB.mixed set fileformat parquet;
+alter table $UNIQUE_DB.mixed set tblproperties("skip.header.line.count"="2");
+select * from $UNIQUE_DB.mixed;
 ---- RESULTS
 3,2012
 4,2013
@@ -128,5 +132,5 @@ select * from mixed;
 SMALLINT,SMALLINT
 ====
 ---- QUERY
-drop table mixed;
+drop table $UNIQUE_DB.mixed;
 ====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index af0ed1d..f7602af 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -274,7 +274,7 @@ class ImpalaTestSuite(BaseTestSuite):
 
 
   def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
-      encoding=None):
+      encoding=None, test_file_vars=None):
     """
     Runs the queries in the specified test based on the vector values
 
@@ -285,6 +285,9 @@ class ImpalaTestSuite(BaseTestSuite):
     Additionally, the encoding for all test data can be specified using the 'encoding'
     parameter. This is useful when data is ingested in a different encoding (ex.
     latin). If not set, the default system encoding will be used.
+    If a dict 'test_file_vars' is provided, then all keys will be replaced with their
+    values in queries before they are executed. Callers need to avoid using reserved key
+    names, see 'reserved_keywords' below.
     """
     table_format_info = vector.get_value('table_format')
     exec_options = vector.get_value('exec_option')
@@ -336,6 +339,15 @@ class ImpalaTestSuite(BaseTestSuite):
           .replace('$SECONDARY_FILESYSTEM', os.getenv("SECONDARY_FILESYSTEM") or str()))
       if use_db: query = query.replace('$DATABASE', use_db)
 
+      reserved_keywords = ["$DATABASE", "$FILESYSTEM_PREFIX", "$GROUP_NAME",
+                           "$IMPALA_HOME", "$NAMENODE", "$QUERY", "$SECONDARY_FILESYSTEM"]
+
+      if test_file_vars:
+        for key, value in test_file_vars.iteritems():
+          if key in reserved_keywords:
+            raise RuntimeError("Key {0} is reserved".format(key))
+          query = query.replace(key, value)
+
       if 'QUERY_NAME' in test_section:
         LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12f3ecce/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index b0e2e80..5dbe02a 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -122,7 +122,7 @@ class TestUnmatchedSchema(ImpalaTestSuite):
     cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
     # Avro has a more advanced schema evolution process which is covered in more depth
     # in the test_avro_schema_evolution test suite.
-    cls.ImpalaTestMatrix.add_constraint(\
+    cls.ImpalaTestMatrix.add_constraint(
         lambda v: v.get_value('table_format').file_format != 'avro')
 
   def _create_test_table(self, vector):
@@ -574,8 +574,8 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
     super(TestTextScanRangeLengths, cls).add_test_dimensions()
     cls.ImpalaTestMatrix.add_dimension(
         ImpalaTestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))
-    cls.ImpalaTestMatrix.add_constraint(lambda v:\
-        v.get_value('table_format').file_format == 'text' and\
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'text' and
         v.get_value('table_format').compression_codec == 'none')
 
   def test_text_scanner(self, vector):
@@ -605,8 +605,8 @@ class TestTextSplitDelimiters(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestTextSplitDelimiters, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_constraint(lambda v:\
-        v.get_value('table_format').file_format == 'text' and\
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'text' and
         v.get_value('table_format').compression_codec == 'none')
 
   def test_text_split_delimiters(self, vector, unique_database):
@@ -682,11 +682,13 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestTextScanRangeLengths, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_constraint(
-      lambda v: v.get_value('table_format').file_format == 'text')
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'text' and
+        v.get_value('table_format').compression_codec in ['none', 'gzip'])
 
   def test_text_scanner_with_header(self, vector, unique_database):
-    self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector, unique_database)
+    self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector,
+                       test_file_vars={'$UNIQUE_DB': unique_database})
 
 
 # Missing Coverage: No coverage for truncated files errors or scans.
@@ -708,8 +710,8 @@ class TestScanTruncatedFiles(ImpalaTestSuite):
     # strategy.
     # TODO: Test other file formats
     if cls.exploration_strategy() == 'exhaustive':
-      cls.ImpalaTestMatrix.add_constraint(lambda v:\
-          v.get_value('table_format').file_format == 'text' and\
+      cls.ImpalaTestMatrix.add_constraint(lambda v:
+          v.get_value('table_format').file_format == 'text' and
           v.get_value('table_format').compression_codec == 'none')
     else:
       cls.ImpalaTestMatrix.add_constraint(lambda v: False)