You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/04/11 16:17:10 UTC

[impala] 01/02: IMPALA-11227: FE OOM in TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 6358db7587cc2218e4523dae16ac5e362edf37ef
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Wed Apr 6 16:12:33 2022 +0200

    IMPALA-11227: FE OOM in TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props
    
    The huge values clause of the insert SQL statement in
    TestParquetBloomFilter.test_fallback_from_dict_if_no_bloom_tbl_props
    could cause an OutOfMemory error in the frontend (FE).
    
    Some tests use a single SQL statement with a huge values clause (more
    than 40,000 elements) to insert values into a parquet table, and the
    size of that statement string sometimes causes an OOM error in the FE.
    
    After this change, we create these parquet tables with a CTAS (CREATE
    TABLE AS SELECT) from an existing table, avoiding any long SQL statements.
    
    Change-Id: I923cc9ba4b6829a2f15e93365f2849b89248598b
    Reviewed-on: http://gerrit.cloudera.org:8080/18387
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/query_test/test_parquet_bloom_filter.py | 28 +++++++++++++--------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/tests/query_test/test_parquet_bloom_filter.py b/tests/query_test/test_parquet_bloom_filter.py
index 998cfd34f..eda93999b 100644
--- a/tests/query_test/test_parquet_bloom_filter.py
+++ b/tests/query_test/test_parquet_bloom_filter.py
@@ -161,9 +161,9 @@ class TestParquetBloomFilter(ImpalaTestSuite):
     # Query an element that is and one that is not present in the table and check whether
     # we correctly do not skip and skip the row group, respectively.
     self._query_element_check_profile(vector, str(unique_database), tbl_name, column_name,
-        0, ['NumBloomFilteredRowGroups: 0 (0)'], ['NumBloomFilteredRowGroups: 1 (1)'])
+        2, ['NumBloomFilteredRowGroups: 0 (0)'], ['NumBloomFilteredRowGroups: 1 (1)'])
     self._query_element_check_profile(vector, str(unique_database), tbl_name, column_name,
-        1, ['NumBloomFilteredRowGroups: 1 (1)'], ['NumBloomFilteredRowGroups: 0 (0)'])
+        3, ['NumBloomFilteredRowGroups: 1 (1)'], ['NumBloomFilteredRowGroups: 0 (0)'])
 
   def test_fallback_from_dict_if_no_bloom_tbl_props(self, vector, unique_database,
       tmpdir):
@@ -196,6 +196,7 @@ class TestParquetBloomFilter(ImpalaTestSuite):
     result_in_table = self.execute_query(query_stmt.format(col_name=col_name,
         db=db_name, tbl=tbl_name, value=element),
         vector.get_value('exec_option'))
+
     for s in strings_in_profile:
       assert s in result_in_table.runtime_profile
     for s in strings_not_in_profile:
@@ -208,24 +209,21 @@ class TestParquetBloomFilter(ImpalaTestSuite):
     fpp = 0.05
     bitset_size = self._optimal_bitset_size(ndv, fpp)
 
-    # We create a table with a single BIGINT column, optionally with table properties for
-    # Bloom filtering.
-    create_stmt = 'create table {db}.{tbl} ({col_name} BIGINT) stored as parquet'
-    if bloom_tbl_prop:
-      create_stmt += ' TBLPROPERTIES("parquet.bloom.filter.columns"="{col_name}:{size}")'
-    create_stmt = create_stmt.format(
-        db=db_name, tbl=tbl_name, col_name=column_name, size=bitset_size)
+    bloom_tbl_props = \
+        'TBLPROPERTIES("parquet.bloom.filter.columns"="{col_name}:{size}")'.format(
+            col_name=column_name, size=bitset_size)
 
-    # We only insert even numbers so an odd number should be filtered out based on the
-    # Bloom filter.
-    values = ['({})'.format(i * 2) for i in range(ndv)]
-    insert_stmt = 'insert into {db}.{tbl} values {values}'.format(
-        db=db_name, tbl=tbl_name, values=','.join(values))
+    # Create a parquet table containing only even numbers so an odd number should be
+    # filtered out based on the Bloom filter (if there is one).
+    create_stmt_template = 'create table {db}.{tbl} stored as parquet {tbl_props} \
+        as (select (row_number() over (order by o_orderkey)) * 2 as {col} \
+        from tpch_parquet.orders limit {ndv})'
+    create_stmt = create_stmt_template.format(db=db_name, tbl=tbl_name,
+        tbl_props=bloom_tbl_props if bloom_tbl_prop else "", col=column_name, ndv=ndv)
 
     vector.get_value('exec_option')['num_nodes'] = 1
     vector.get_value('exec_option')['parquet_bloom_filter_write'] = 'IF_NO_DICT'
     self.execute_query(create_stmt, vector.get_value('exec_option'))
-    self.execute_query(insert_stmt, vector.get_value('exec_option'))
 
   def _optimal_bitset_size(self, ndv, fpp):
     """ Based on ParquetBloomFilter::OptimalByteSize() in