You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/02/01 01:42:54 UTC

[impala] branch master updated: IMPALA-6772: Enable test_scanners_fuzz for ORC

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 443da21  IMPALA-6772: Enable test_scanners_fuzz for ORC
443da21 is described below

commit 443da2172cc10b8b732606d1d4465d4386022f19
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sat Jan 25 05:06:35 2020 -0800

    IMPALA-6772: Enable test_scanners_fuzz for ORC
    
    Add test coverage for randomly corrupted ORC files by adding ORC to the
    file formats exercised by test_scanners_fuzz.py. Also add two additional
    queries that scan nested types.
    
    Tests:
     - Ran test_scanners_fuzz.py 780 rounds (took 43h).
     - Ran test_scanners_fuzz.py for orc/def/block 1081 rounds (took 24h).
    
    Change-Id: I3233e5d9f555029d954b5ddd5858ea194afc06bf
    Reviewed-on: http://gerrit.cloudera.org:8080/15062
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../functional/functional_schema_template.sql      | 23 ++++++
 .../datasets/functional/schema_constraints.csv     |  5 ++
 tests/query_test/test_scanners_fuzz.py             | 85 ++++++++++++++--------
 3 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 959e143..3ffed1d 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -2280,6 +2280,29 @@ SELECT * from functional.{table_name};
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+uncomp_src_alltypes
+---- CREATE_HIVE
+CREATE TABLE {db_name}{db_suffix}.{table_name} LIKE functional.alltypes STORED AS ORC;
+---- DEPENDENT_LOAD_HIVE
+SET orc.compress=NONE;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (year, month)
+SELECT id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
+FROM functional.alltypes;
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+uncomp_src_decimal_tbl
+---- CREATE_HIVE
+CREATE TABLE {db_name}{db_suffix}.{table_name} LIKE functional.decimal_tbl STORED AS ORC;
+---- DEPENDENT_LOAD_HIVE
+SET orc.compress=NONE;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (d6)
+SELECT d1, d2, d3, d4, d5, d6 FROM functional.decimal_tbl;
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 testescape_16_lf
 ---- CREATE
 CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index b5599a1..018d104 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -276,3 +276,8 @@ table_name:bucketed_ext_table, constraint:exclude, table_format:kudu/none/none
 table_name:bucketed_table, constraint:exclude, table_format:hbase/none/none
 table_name:bucketed_table, constraint:exclude, table_format:kudu/none/none
 table_name:bucketed_table, constraint:exclude, table_format:text/lzo/block
+
+# The uncompressed ORC tables are mainly used in test_scanners_fuzz.py to avoid creating
+# them each time when running the test. Developers may run this test many times locally.
+table_name:uncomp_src_alltypes, constraint:restrict_to, table_format:orc/def/block
+table_name:uncomp_src_decimal_tbl, constraint:restrict_to, table_format:orc/def/block
diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py
index 1a505ca..943473b 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -68,10 +68,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
           'mem_limit' : cls.MEM_LIMITS,
           'debug_action' : cls.DEBUG_ACTION_VALUES}))
     # TODO: enable for more table formats once they consistently pass the fuzz test.
-    # TODO(IMPALA-6772): enable for ORC formats once a new version after release-1.4.3
-    # of ORC library is released.
     cls.ImpalaTestMatrix.add_constraint(lambda v:
-        v.get_value('table_format').file_format in ('avro', 'parquet') or
+        v.get_value('table_format').file_format in ('avro', 'parquet', 'orc') or
         (v.get_value('table_format').file_format == 'text' and
           v.get_value('table_format').compression_codec in ('none', 'lzo')))
 
@@ -106,18 +104,33 @@ class TestScannersFuzzing(ImpalaTestSuite):
     table_name = "complextypestbl"
     src_db = QueryTestSectionReader.get_db_name(table_format)
 
-    if table_format.file_format != 'parquet': pytest.skip()
-    self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10)
-
-  def test_fuzz_uncompressed_parquet(self, vector, unique_database):
-    """Parquet tables in default schema are compressed, so in order
+    if table_format.file_format not in ['parquet', 'orc']: pytest.skip()
+    # Additional queries to scan the nested values.
+    custom_queries = [
+      "select count(*) from ("
+      "  select distinct t.id, a.pos, a.item, aa.pos, aa.item, m.key, m.value,"
+      "    ma.key, ma.value, t.nested_struct.* "
+      "  from complextypestbl t, t.int_array a, t.int_array_array.item aa, "
+      "    t.int_map m, t.int_map_array.item ma) q",
+
+      "select count(*) from ("
+      "  select t.id, t.nested_struct.a, b.pos, b.item, i.e, i.f, m.key,"
+      "    arr.pos, arr.item "
+      "  from complextypestbl t, t.nested_struct.b, t.nested_struct.c.d.item i,"
+      "    t.nested_struct.g m, m.value.h.i arr) q",
+    ]
+    self.run_fuzz_test(vector, src_db, table_name, unique_database, table_name, 10,
+                       custom_queries=custom_queries)
+
+  def test_fuzz_uncompressed_parquet_orc(self, vector, unique_database):
+    """Parquet/ORC tables in default schema are compressed, so in order
        to do the fuzz_test on an uncompressed parquet table, this test
        clones from an existing parquet table into a new table with
-       no compression.
+       no compression. The uncompressed ORC tables are generated by
+       data loading in advance, so we don't need to generate them here.
     """
     table_format = vector.get_value('table_format')
-    if vector.get_value('table_format').compression_codec != 'none': pytest.skip()
-    if table_format.file_format != 'parquet': pytest.skip()
+    if table_format.file_format not in ['parquet', 'orc']: pytest.skip()
 
     """Even when the compression_codec is none, the default compression type is snappy
        so compression codec is changed explicitly to be none.
@@ -126,18 +139,23 @@ class TestScannersFuzzing(ImpalaTestSuite):
 
     tbl_list = ["alltypes", "decimal_tbl"]
     for orig_tbl_name in tbl_list:
-      src_table_name = "parquet_uncomp_src_" + orig_tbl_name
-      fuzz_table_name = "parquet_uncomp_dst_" + orig_tbl_name
-      fq_tbl_name = unique_database + "." + src_table_name
-      create_tbl = ("create table {0} stored as parquet as select * from"
-          " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name))
-      self.execute_query(create_tbl)
-      self.run_fuzz_test(vector, unique_database, src_table_name, unique_database,
-          fuzz_table_name, 10)
+      src_table_name = "uncomp_src_" + orig_tbl_name
+      fuzz_table_name = "uncomp_dst_" + orig_tbl_name
+      if table_format.file_format == 'parquet':
+        fq_tbl_name = unique_database + "." + src_table_name
+        create_tbl = ("create table {0} stored as parquet as select * from"
+            " functional_parquet.{1}".format(fq_tbl_name, orig_tbl_name))
+        self.execute_query(create_tbl)
+        self.run_fuzz_test(vector, unique_database, src_table_name, unique_database,
+                           fuzz_table_name, 10)
+      else:
+        self.run_fuzz_test(vector, "functional_orc_def", src_table_name, unique_database,
+                           fuzz_table_name, 10)
 
   # TODO: add test coverage for additional data types like char and varchar
 
-  def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1):
+  def run_fuzz_test(self, vector, src_db, src_table, fuzz_db, fuzz_table, num_copies=1,
+                    custom_queries=None):
     """ Do some basic fuzz testing: create a copy of an existing table with randomly
     corrupted files and make sure that we don't crash or behave in an unexpected way.
     'unique_database' is used for the table, so it will be cleaned up automatically.
@@ -190,6 +208,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
         'select count(*) from (select distinct * from {0}.{1}) q'.format(
             fuzz_db, fuzz_table),
         'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)]
+    if custom_queries is not None:
+      queries = queries + [s.format(fuzz_db, fuzz_table) for s in custom_queries]
 
     for query, batch_size, disable_codegen in \
         itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
@@ -210,11 +230,9 @@ class TestScannersFuzzing(ImpalaTestSuite):
         # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
         # (IMPALA-4013).
         table_format = vector.get_value('table_format')
-        if table_format.file_format != 'parquet' \
-            and not (table_format.file_format == 'text' and \
-            table_format.compression_codec != 'none') \
-            and not table_format.file_format == 'rc' \
-            and not table_format.file_format == 'seq':
+        if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
+            and not (table_format.file_format == 'text' and
+            table_format.compression_codec != 'none'):
           raise
 
   def walk_and_corrupt_table_data(self, tmp_table_dir, num_copies, rng):
@@ -270,10 +288,13 @@ class TestScannersFuzzing(ImpalaTestSuite):
           path, flip_offset, data[flip_offset], flip_val))
       data[flip_offset] = flip_val
 
-    if rng.random() < 0.4:
-      truncation = rng.randint(0, len(data))
-      LOG.info("corrupt file: Truncate {0} to {1}".format(path, truncation))
-      data = data[:truncation]
-
-    with open(path, "wb") as f:
-      f.write(data)
+    if rng.random() < 0.4:  # delete random part of the file
+      beg = rng.randint(0, len(data) - 1)
+      end = rng.randint(beg, len(data))
+      LOG.info("corrupt file: Remove range [{0}, {1}) in {2}".format(beg, end, path))
+      with open(path, "wb") as f:
+        f.write(data[:beg])
+        f.write(data[end:])
+    else:
+      with open(path, "wb") as f:
+        f.write(data)