You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/12/15 23:01:11 UTC
[45/50] [abbrv] incubator-impala git commit: IMPALA-4659: fuzz test
fixes
IMPALA-4659: fuzz test fixes
* Apply a 512m mem_limit to all fuzz tests. This limits aggregate memory
consumption to ~5GB per daemon (assuming 10 concurrent tests).
* Refactor the exec option handling to use the exec_option dimension.
This avoids executing the test multiple times redundantly.
* Remove the xfails to reduce noise, since there is no immediate plan to
fix the product bugs. Instead just pass the tests.
Testing:
Ran in a loop for ~1h to flush out flakiness.
Change-Id: Ie1942ceef252ec3e6171a0a54722b66a7d9abbd7
Reviewed-on: http://gerrit.cloudera.org:8080/5502
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/246acba0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/246acba0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/246acba0
Branch: refs/heads/hadoop-next
Commit: 246acba0b37f61e708c17ea55f2300ef96908fd1
Parents: fc4ee65
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Dec 14 11:04:20 2016 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Dec 15 01:31:28 2016 +0000
----------------------------------------------------------------------
tests/query_test/test_scanners_fuzz.py | 66 ++++++++++++++++++-----------
1 file changed, 41 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/246acba0/tests/query_test/test_scanners_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py
index 76cc62a..81d20a4 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from copy import copy
+import itertools
import os
import pytest
import random
@@ -22,12 +24,25 @@ import shutil
import tempfile
import time
from subprocess import check_call
+from tests.common.test_dimensions import create_exec_option_dimension_from_dict
from tests.common.impala_test_suite import ImpalaTestSuite, LOG
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
# Random fuzz testing of HDFS scanners. Existing tables for any HDFS file format
# are corrupted in random ways to flush out bugs with handling of corrupted data.
class TestScannersFuzzing(ImpalaTestSuite):
+ # Use abort_on_error = False to ensure we scan all the files.
+ ABORT_ON_ERROR_VALUES = [False]
+
+ # Only run on all nodes - num_nodes=1 would not provide additional coverage.
+ NUM_NODES_VALUES = [0]
+
+ # Limit memory to avoid causing other concurrent tests to fail.
+ MEM_LIMITS = ['512m']
+
+ # Test the codegen and non-codegen paths.
+ DISABLE_CODEGEN_VALUES = [True, False]
+
# Test a range of batch sizes to exercise different corner cases.
BATCH_SIZES = [0, 1, 16, 10000]
@@ -38,6 +53,11 @@ class TestScannersFuzzing(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestScannersFuzzing, cls).add_test_dimensions()
+ cls.TestMatrix.add_dimension(
+ create_exec_option_dimension_from_dict({
+ 'abort_on_error' : cls.ABORT_ON_ERROR_VALUES,
+ 'num_nodes' : cls.NUM_NODES_VALUES,
+ 'mem_limit' : cls.MEM_LIMITS}))
# TODO: enable for more table formats once they consistently pass the fuzz test.
cls.TestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ('avro', 'parquet') or
@@ -120,36 +140,32 @@ class TestScannersFuzzing(ImpalaTestSuite):
# Execute a query that tries to read all the columns and rows in the file.
# Also execute a count(*) that materializes no columns, since different code
# paths are exercised.
- # Use abort_on_error=0 to ensure we scan all the files.
queries = [
'select count(*) from (select distinct * from {0}.{1}) q'.format(
unique_database, table),
'select count(*) from {0}.{1} q'.format(unique_database, table)]
- xfail_msgs = []
- for query in queries:
- for batch_size in self.BATCH_SIZES:
- query_options = {'abort_on_error': '0', 'batch_size': batch_size}
- try:
- result = self.execute_query(query, query_options = query_options)
- LOG.info('\n'.join(result.log))
- except Exception as e:
- if 'memory limit exceeded' in str(e).lower():
- # Memory limit error should fail query.
- continue
- msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
- LOG.error(msg)
- # Parquet and compressed text can fail the query for some parse errors.
- # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
- # (IMPALA-4013).
- if table_format.file_format == 'parquet' or \
- (table_format.file_format == 'text' and
- table_format.compression_codec != 'none'):
- xfail_msgs.append(msg)
- else:
- raise
- if len(xfail_msgs) != 0:
- pytest.xfail('\n'.join(xfail_msgs))
+ for query, batch_size, disable_codegen in \
+ itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
+ query_options = copy(vector.get_value('exec_option'))
+ query_options['batch_size'] = batch_size
+ query_options['disable_codegen'] = disable_codegen
+ try:
+ result = self.execute_query(query, query_options = query_options)
+ LOG.info('\n'.join(result.log))
+ except Exception as e:
+ if 'memory limit exceeded' in str(e).lower():
+ # Memory limit error should fail query.
+ continue
+ msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
+ LOG.error(msg)
+ # Parquet and compressed text can fail the query for some parse errors.
+ # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
+ # (IMPALA-4013).
+ if table_format.file_format != 'parquet' \
+ and not (table_format.file_format == 'text' and
+ table_format.compression_codec != 'none'):
+ raise
def walk_and_corrupt_table_data(self, tmp_table_dir, num_copies, rng):
""" Walks a local copy of a HDFS table directory. Returns a list of partitions, each