You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by wz...@apache.org on 2022/05/15 22:27:58 UTC

[impala] 02/03: IMPALA-11283: Push-down IS_NULL and NOT_NULL predicates to iceberg

This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7a7934ffba70a1448be081b288cfc37e0c846d93
Author: LPL <li...@sensorsdata.cn>
AuthorDate: Sat May 7 16:29:21 2022 +0800

    IMPALA-11283: Push-down IS_NULL and NOT_NULL predicates to iceberg
    
    This patch implements pushing the IS_NULL and NOT_NULL predicates down
    to Iceberg.
    
    Testing:
      - Added end-to-end test
    
    Change-Id: I9c3608af67b552bebc55dcc5526f61f5439967bf
    Reviewed-on: http://gerrit.cloudera.org:8080/18504
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../org/apache/impala/planner/IcebergScanNode.java |  19 +
 .../iceberg-is-null-predicate-push-down.test       | 403 +++++++++++++++++++++
 tests/query_test/test_iceberg.py                   |   5 +
 3 files changed, 427 insertions(+)

diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
index 1035cd367..f3d521d13 100644
--- a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
@@ -36,6 +36,7 @@ import org.apache.impala.analysis.BoolLiteral;
 import org.apache.impala.analysis.DateLiteral;
 import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.InPredicate;
+import org.apache.impala.analysis.IsNullPredicate;
 import org.apache.impala.analysis.LiteralExpr;
 import org.apache.impala.analysis.MultiAggregateInfo;
 import org.apache.impala.analysis.NumericLiteral;
@@ -216,6 +217,8 @@ public class IcebergScanNode extends HdfsScanNode {
       convertIcebergPredicate(analyzer, (BinaryPredicate) expr);
     } else if (expr instanceof InPredicate) {
       convertIcebergPredicate(analyzer, (InPredicate) expr);
+    } else if (expr instanceof IsNullPredicate) {
+      convertIcebergPredicate((IsNullPredicate) expr);
     }
   }
 
@@ -272,6 +275,22 @@ public class IcebergScanNode extends HdfsScanNode {
     icebergPredicates_.add(Expressions.in(col.getName(), values));
   }
 
+  private void convertIcebergPredicate(IsNullPredicate predicate) {
+    // Do not convert if there is an implicit cast
+    if (!(predicate.getChild(0) instanceof SlotRef)) return;
+    SlotRef ref = (SlotRef) predicate.getChild(0);
+
+    // If predicate contains map/struct, this column would be null
+    Column col = ref.getDesc().getColumn();
+    if (col == null) return;
+
+    if (predicate.isNotNull()) {
+      icebergPredicates_.add(Expressions.notNull(col.getName()));
+    } else{
+      icebergPredicates_.add(Expressions.isNull(col.getName()));
+    }
+  }
+
   private Object getIcebergValue(Analyzer analyzer, SlotRef ref, LiteralExpr literal)
       throws ImpalaException {
     IcebergColumn iceCol = (IcebergColumn) ref.getDesc().getColumn();
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test
new file mode 100644
index 000000000..e9b01cfbf
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test
@@ -0,0 +1,403 @@
+====
+---- QUERY
+# Test BIGINT/DOUBLE/STRING/TIMESTAMP/DATE/DECIMAL IsNullPredicate push-down
+create table ice_is_null_pred_pd (
+  col_i INT,
+  col_bi BIGINT,
+  col_db DOUBLE,
+  col_str STRING,
+  col_ts TIMESTAMP,
+  col_dt DATE,
+  col_dc1 DECIMAL(9, 3),
+  col_dc2 DECIMAL(18, 3),
+  col_dc3 DECIMAL(38, 3)
+) partitioned by spec (col_i) stored as iceberg tblproperties ('write.format.default' = 'parquet');
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+insert into
+  ice_is_null_pred_pd
+values
+  (0, NULL, 2.71820, '_1700-01-01 00:00:00', '1400-01-01 00:00:00', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+  (1, 12345678901, NULL, 'A1700-01-01 00:00:00', '1400-01-01 01:01:01', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+  (2, 12345678902, 2.71822, NULL, '1400-01-01 02:02:02', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+  (3, 12345678903, 2.71823, '1700-01-01 00:00:00', '1500-01-01 00:00:00', DATE'1500-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+  (4, 12345678904, 2.71824, '1700-01-01 00:00:00', NULL, DATE'1500-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+  (5, 12345678905, 2.71825, '1700-01-01 00:00:00', '1500-01-01 02:02:02', NULL, 123.450, 1234567890.120, 1234567890123456789.010),
+  (6, 12345678906, 2.71826, '1700-01-01 00:00:00', '1600-01-01 00:00:00', DATE'1600-01-01', NULL, 1234567890.120, 1234567890123456789.010),
+  (7, 12345678907, 2.71827, '1700-01-01 00:00:00', '1600-01-01 01:01:01', DATE'1600-01-01', 123.450, NULL, 1234567890123456789.010),
+  (8, 12345678908, 2.71828, '1700-01-01 00:00:00', '1600-01-01 02:02:02', DATE'1600-01-01', 123.450, 1234567890.120, NULL);
+select count(1) from ice_is_null_pred_pd;
+---- RESULTS
+9
+---- TYPES
+BIGINT
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 0
+aggregation(SUM, NumFileMetadataRead): 9
+====
+---- QUERY
+show files in ice_is_null_pred_pd;
+---- RESULTS
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=0/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=1/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=2/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=3/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=4/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=5/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=6/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=7/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=8/.*.0.parq','.*',''
+---- TYPES
+STRING, STRING, STRING
+====
+---- QUERY
+# Start testing predicate push-down for bigint column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_bi is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_bi as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_bi as TIMESTAMP) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_bi + 1 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_bi is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for double column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_db is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_db as decimal(6,5)) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_db is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for string column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_str is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  col_i
+from
+  ice_is_null_pred_pd
+where
+  cast(col_str as TIMESTAMP) is null
+order by 1;
+---- RESULTS
+0
+1
+2
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_str is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for timestamp column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_ts is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_ts as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_ts is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for date column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dt is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_dt as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dt is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(9,3) column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc1 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_dc1 as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc1 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(18,3) column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc2 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_dc2 as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc2 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(38,3) column
+# The IS_NULL predicate matches 1 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc3 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  cast(col_dc3 as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+  count(1)
+from
+  ice_is_null_pred_pd
+where
+  col_dc3 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
\ No newline at end of file
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 3a64810d5..727540449 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -617,3 +617,8 @@ class TestIcebergTable(ImpalaTestSuite):
   def test_in_predicate_push_down(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-in-predicate-push-down', vector,
                        use_db=unique_database)
+
+  @pytest.mark.execute_serially
+  def test_is_null_predicate_push_down(self, vector, unique_database):
+    self.run_test_case('QueryTest/iceberg-is-null-predicate-push-down', vector,
+                       use_db=unique_database)