You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/02/01 01:42:54 UTC
[impala] branch master updated: IMPALA-6772: Enable
test_scanners_fuzz for ORC
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 443da21 IMPALA-6772: Enable test_scanners_fuzz for ORC
443da21 is described below
commit 443da2172cc10b8b732606d1d4465d4386022f19
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sat Jan 25 05:06:35 2020 -0800
IMPALA-6772: Enable test_scanners_fuzz for ORC
Add test coverage for randomly corrupt ORC files by adding orc in tests
of test_scanners_fuzz.py. Also add two additional queries for nested
types.
Tests:
- Ran test_scanners_fuzz.py 780 rounds (took 43h).
- Ran test_scanners_fuzz.py for orc/def/block 1081 rounds (took 24h).
Change-Id: I3233e5d9f555029d954b5ddd5858ea194afc06bf
Reviewed-on: http://gerrit.cloudera.org:8080/15062
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../functional/functional_schema_template.sql | 23 ++++++
.../datasets/functional/schema_constraints.csv | 5 ++
tests/query_test/test_scanners_fuzz.py | 85 ++++++++++++++--------
3 files changed, 81 insertions(+), 32 deletions(-)
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 959e143..3ffed1d 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -2280,6 +2280,29 @@ SELECT * from functional.{table_name};
---- DATASET
functional
---- BASE_TABLE_NAME
+uncomp_src_alltypes
+---- CREATE_HIVE
+CREATE TABLE {db_name}{db_suffix}.{table_name} LIKE functional.alltypes STORED AS ORC;
+---- DEPENDENT_LOAD_HIVE
+SET orc.compress=NONE;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (year, month)
+SELECT id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
+FROM functional.alltypes;
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+uncomp_src_decimal_tbl
+---- CREATE_HIVE
+CREATE TABLE {db_name}{db_suffix}.{table_name} LIKE functional.decimal_tbl STORED AS ORC;
+---- DEPENDENT_LOAD_HIVE
+SET orc.compress=NONE;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (d6)
+SELECT d1, d2, d3, d4, d5, d6 FROM functional.decimal_tbl;
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
testescape_16_lf
---- CREATE
CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index b5599a1..018d104 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -276,3 +276,8 @@ table_name:bucketed_ext_table, constraint:exclude, table_format:kudu/none/none
table_name:bucketed_table, constraint:exclude, table_format:hbase/none/none
table_name:bucketed_table, constraint:exclude, table_format:kudu/none/none
table_name:bucketed_table, constraint:exclude, table_format:text/lzo/block
+
+# The uncompressed ORC tables are mainly used in test_scanners_fuzz.py to avoid creating
+# them each time when running the test. Developers may run this test many times locally.
+table_name:uncomp_src_alltypes, constraint:restrict_to, table_format:orc/def/block
+table_name:uncomp_src_decimal_tbl, constraint:restrict_to, table_format:orc/def/block
diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py
index 1a505ca..943473b 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -68,10 +68,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
'mem_limit' : cls.MEM_LIMITS,
'debug_action' : cls.DEBUG_ACTION_VALUES}))
# TODO: enable for more table formats once they consistently pass the fuzz test.
- # TODO(IMPALA-6772): enable for ORC formats once a new version after release-1.4.3
- # of ORC library is released.
cls.ImpalaTestMatrix.add_constraint(lambda v:
- v.get_value('table_format').file_format in ('avro', 'parquet') or
+ v.get_value('table_format').file_format in ('avro', 'parquet', 'orc') or
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec in ('none', 'lzo')))
@@ -106,18 +104,33 @@ class TestScannersFuzzing(ImpalaTestSuite):
table_name = "complextypestbl"
src_db = QueryTestSectionReader.get_db_name(table_format)
- if table_format.file_format != 'parquet': pytest.skip()
- self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10)
-
- def test_fuzz_uncompressed_parquet(self, vector, unique_database):
- """Parquet tables in default schema are compressed, so in order
+ if table_format.file_format not in ['parquet', 'orc']: pytest.skip()
+ # Additional queries to scan the nested values.
+ custom_queries = [
+ "select count(*) from ("
+ " select distinct t.id, a.pos, a.item, aa.pos, aa.item, m.key, m.value,"
+ " ma.key, ma.value, t.nested_struct.* "
+ " from complextypestbl t, t.int_array a, t.int_array_array.item aa, "
+ " t.int_map m, t.int_map_array.item ma) q",
+
+ "select count(*) from ("
+ " select t.id, t.nested_struct.a, b.pos, b.item, i.e, i.f, m.key,"
+ " arr.pos, arr.item "
+ " from complextypestbl t, t.nested_struct.b, t.nested_struct.c.d.item i,"
+ " t.nested_struct.g m, m.value.h.i arr) q",
+ ]
+ self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10,
+ custom_queries=custom_queries)
+
+ def test_fuzz_uncompressed_parquet_orc(self, vector, unique_database):
+ """Parquet/ORC tables in default schema are compressed, so in order
to do the fuzz_test on an uncompressed parquet table, this test
clones from an existing parquet table into a new table with
- no compression.
+ no compression. These uncompressed ORC tables are generated by
+ data loading in advance, so we don't need to generate them here.
"""
table_format = vector.get_value('table_format')
- if vector.get_value('table_format').compression_codec != 'none': pytest.skip()
- if table_format.file_format != 'parquet': pytest.skip()
+ if table_format.file_format not in ['parquet', 'orc']: pytest.skip()
"""Even when the compression_codec is none, the default compression type is snappy
so compression codec is changed explicitly to be none.
@@ -126,18 +139,23 @@ class TestScannersFuzzing(ImpalaTestSuite):
tbl_list = ["alltypes", "decimal_tbl"]
for orig_tbl_name in tbl_list:
- src_table_name = "parquet_uncomp_src_" + orig_tbl_name
- fuzz_table_name = "parquet_uncomp_dst_" + orig_tbl_name
- fq_tbl_name = unique_database + "." + src_table_name
- create_tbl = ("create table {0} stored as parquet as select * from"
- " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name))
- self.execute_query(create_tbl)
- self.run_fuzz_test(vector, unique_database, src_table_name, unique_database,
- fuzz_table_name, 10)
+ src_table_name = "uncomp_src_" + orig_tbl_name
+ fuzz_table_name = "uncomp_dst_" + orig_tbl_name
+ if table_format.file_format == 'parquet':
+ fq_tbl_name = unique_database + "." + src_table_name
+ create_tbl = ("create table {0} stored as parquet as select * from"
+ " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name))
+ self.execute_query(create_tbl)
+ self.run_fuzz_test(vector, unique_database, src_table_name, unique_database,
+ fuzz_table_name, 10)
+ else:
+ self.run_fuzz_test(vector, "functional_orc_def", src_table_name, unique_database,
+ fuzz_table_name, 10)
# TODO: add test coverage for additional data types like char and varchar
- def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1):
+ def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1,
+ custom_queries=None):
""" Do some basic fuzz testing: create a copy of an existing table with randomly
corrupted files and make sure that we don't crash or behave in an unexpected way.
'unique_database' is used for the table, so it will be cleaned up automatically.
@@ -190,6 +208,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
'select count(*) from (select distinct * from {0}.{1}) q'.format(
fuzz_db, fuzz_table),
'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)]
+ if custom_queries is not None:
+ queries = queries + [s.format(fuzz_db, fuzz_table) for s in custom_queries]
for query, batch_size, disable_codegen in \
itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
@@ -210,11 +230,9 @@ class TestScannersFuzzing(ImpalaTestSuite):
# E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
# (IMPALA-4013).
table_format = vector.get_value('table_format')
- if table_format.file_format != 'parquet' \
- and not (table_format.file_format == 'text' and \
- table_format.compression_codec != 'none') \
- and not table_format.file_format == 'rc' \
- and not table_format.file_format == 'seq':
+ if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
+ and not (table_format.file_format == 'text' and
+ table_format.compression_codec != 'none'):
raise
def walk_and_corrupt_table_data(self, tmp_table_dir, num_copies, rng):
@@ -270,10 +288,13 @@ class TestScannersFuzzing(ImpalaTestSuite):
path, flip_offset, data[flip_offset], flip_val))
data[flip_offset] = flip_val
- if rng.random() < 0.4:
- truncation = rng.randint(0, len(data))
- LOG.info("corrupt file: Truncate {0} to {1}".format(path, truncation))
- data = data[:truncation]
-
- with open(path, "wb") as f:
- f.write(data)
+ if rng.random() < 0.4: # delete random part of the file
+ beg = rng.randint(0, len(data) - 1)
+ end = rng.randint(beg, len(data))
+ LOG.info("corrupt file: Remove range [{0}, {1}) in {2}".format(beg, end, path))
+ with open(path, "wb") as f:
+ f.write(data[:beg])
+ f.write(data[end:])
+ else:
+ with open(path, "wb") as f:
+ f.write(data)