You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/04/11 16:17:10 UTC
[impala] 01/02: IMPALA-11227: FE OOM in TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 6358db7587cc2218e4523dae16ac5e362edf37ef
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Wed Apr 6 16:12:33 2022 +0200
IMPALA-11227: FE OOM in TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props
The huge VALUES clause of the INSERT statement in
TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props
could cause an OutOfMemory error in the Impala frontend (FE).
Some tests insert values into a parquet table using a single SQL statement
with a huge VALUES clause (more than 40,000 elements), and the size of the
statement string sometimes causes an out-of-memory (OOM) error.
After this change, we create these parquet tables with a CTAS from an
existing table, avoiding any long SQL statements.
Change-Id: I923cc9ba4b6829a2f15e93365f2849b89248598b
Reviewed-on: http://gerrit.cloudera.org:8080/18387
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
tests/query_test/test_parquet_bloom_filter.py | 28 +++++++++++++--------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/tests/query_test/test_parquet_bloom_filter.py b/tests/query_test/test_parquet_bloom_filter.py
index 998cfd34f..eda93999b 100644
--- a/tests/query_test/test_parquet_bloom_filter.py
+++ b/tests/query_test/test_parquet_bloom_filter.py
@@ -161,9 +161,9 @@ class TestParquetBloomFilter(ImpalaTestSuite):
# Query an element that is and one that is not present in the table and check whether
# we correctly do not skip and skip the row group, respectively.
self._query_element_check_profile(vector, str(unique_database), tbl_name, column_name,
- 0, ['NumBloomFilteredRowGroups: 0 (0)'], ['NumBloomFilteredRowGroups: 1 (1)'])
+ 2, ['NumBloomFilteredRowGroups: 0 (0)'], ['NumBloomFilteredRowGroups: 1 (1)'])
self._query_element_check_profile(vector, str(unique_database), tbl_name, column_name,
- 1, ['NumBloomFilteredRowGroups: 1 (1)'], ['NumBloomFilteredRowGroups: 0 (0)'])
+ 3, ['NumBloomFilteredRowGroups: 1 (1)'], ['NumBloomFilteredRowGroups: 0 (0)'])
def test_fallback_from_dict_if_no_bloom_tbl_props(self, vector, unique_database,
tmpdir):
@@ -196,6 +196,7 @@ class TestParquetBloomFilter(ImpalaTestSuite):
result_in_table = self.execute_query(query_stmt.format(col_name=col_name,
db=db_name, tbl=tbl_name, value=element),
vector.get_value('exec_option'))
+
for s in strings_in_profile:
assert s in result_in_table.runtime_profile
for s in strings_not_in_profile:
@@ -208,24 +209,21 @@ class TestParquetBloomFilter(ImpalaTestSuite):
fpp = 0.05
bitset_size = self._optimal_bitset_size(ndv, fpp)
- # We create a table with a single BIGINT column, optionally with table properties for
- # Bloom filtering.
- create_stmt = 'create table {db}.{tbl} ({col_name} BIGINT) stored as parquet'
- if bloom_tbl_prop:
- create_stmt += ' TBLPROPERTIES("parquet.bloom.filter.columns"="{col_name}:{size}")'
- create_stmt = create_stmt.format(
- db=db_name, tbl=tbl_name, col_name=column_name, size=bitset_size)
+ bloom_tbl_props = \
+ 'TBLPROPERTIES("parquet.bloom.filter.columns"="{col_name}:{size}")'.format(
+ col_name=column_name, size=bitset_size)
- # We only insert even numbers so an odd number should be filtered out based on the
- # Bloom filter.
- values = ['({})'.format(i * 2) for i in range(ndv)]
- insert_stmt = 'insert into {db}.{tbl} values {values}'.format(
- db=db_name, tbl=tbl_name, values=','.join(values))
+ # Create a parquet table containing only even numbers so an odd number should be
+ # filtered out based on the Bloom filter (if there is one).
+ create_stmt_template = 'create table {db}.{tbl} stored as parquet {tbl_props} \
+ as (select (row_number() over (order by o_orderkey)) * 2 as {col} \
+ from tpch_parquet.orders limit {ndv})'
+ create_stmt = create_stmt_template.format(db=db_name, tbl=tbl_name,
+ tbl_props=bloom_tbl_props if bloom_tbl_prop else "", col=column_name, ndv=ndv)
vector.get_value('exec_option')['num_nodes'] = 1
vector.get_value('exec_option')['parquet_bloom_filter_write'] = 'IF_NO_DICT'
self.execute_query(create_stmt, vector.get_value('exec_option'))
- self.execute_query(insert_stmt, vector.get_value('exec_option'))
def _optimal_bitset_size(self, ndv, fpp):
""" Based on ParquetBloomFilter::OptimalByteSize() in