You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by wz...@apache.org on 2022/05/15 22:27:58 UTC
[impala] 02/03: IMPALA-11283: Push-down IS_NULL and NOT_NULL predicates to iceberg
This is an automated email from the ASF dual-hosted git repository.
wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 7a7934ffba70a1448be081b288cfc37e0c846d93
Author: LPL <li...@sensorsdata.cn>
AuthorDate: Sat May 7 16:29:21 2022 +0800
IMPALA-11283: Push-down IS_NULL and NOT_NULL predicates to iceberg
This patch implements pushing the IS_NULL and NOT_NULL predicates down
to Iceberg.
Testing:
- Added end-to-end test
Change-Id: I9c3608af67b552bebc55dcc5526f61f5439967bf
Reviewed-on: http://gerrit.cloudera.org:8080/18504
Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../org/apache/impala/planner/IcebergScanNode.java | 19 +
.../iceberg-is-null-predicate-push-down.test | 403 +++++++++++++++++++++
tests/query_test/test_iceberg.py | 5 +
3 files changed, 427 insertions(+)
diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
index 1035cd367..f3d521d13 100644
--- a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
@@ -36,6 +36,7 @@ import org.apache.impala.analysis.BoolLiteral;
import org.apache.impala.analysis.DateLiteral;
import org.apache.impala.analysis.Expr;
import org.apache.impala.analysis.InPredicate;
+import org.apache.impala.analysis.IsNullPredicate;
import org.apache.impala.analysis.LiteralExpr;
import org.apache.impala.analysis.MultiAggregateInfo;
import org.apache.impala.analysis.NumericLiteral;
@@ -216,6 +217,8 @@ public class IcebergScanNode extends HdfsScanNode {
convertIcebergPredicate(analyzer, (BinaryPredicate) expr);
} else if (expr instanceof InPredicate) {
convertIcebergPredicate(analyzer, (InPredicate) expr);
+ } else if (expr instanceof IsNullPredicate) {
+ convertIcebergPredicate((IsNullPredicate) expr);
}
}
@@ -272,6 +275,22 @@ public class IcebergScanNode extends HdfsScanNode {
icebergPredicates_.add(Expressions.in(col.getName(), values));
}
+ private void convertIcebergPredicate(IsNullPredicate predicate) {
+ // Do not convert if there is an implicit cast
+ if (!(predicate.getChild(0) instanceof SlotRef)) return;
+ SlotRef ref = (SlotRef) predicate.getChild(0);
+
+ // If predicate contains map/struct, this column would be null
+ Column col = ref.getDesc().getColumn();
+ if (col == null) return;
+
+ if (predicate.isNotNull()) {
+ icebergPredicates_.add(Expressions.notNull(col.getName()));
+ } else{
+ icebergPredicates_.add(Expressions.isNull(col.getName()));
+ }
+ }
+
private Object getIcebergValue(Analyzer analyzer, SlotRef ref, LiteralExpr literal)
throws ImpalaException {
IcebergColumn iceCol = (IcebergColumn) ref.getDesc().getColumn();
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test
new file mode 100644
index 000000000..e9b01cfbf
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-is-null-predicate-push-down.test
@@ -0,0 +1,403 @@
+====
+---- QUERY
+# Test BIGINT/DOUBLE/STRING/TIMESTAMP/DATE/DECIMAL IsNullPredicate push-down
+create table ice_is_null_pred_pd (
+ col_i INT,
+ col_bi BIGINT,
+ col_db DOUBLE,
+ col_str STRING,
+ col_ts TIMESTAMP,
+ col_dt DATE,
+ col_dc1 DECIMAL(9, 3),
+ col_dc2 DECIMAL(18, 3),
+ col_dc3 DECIMAL(38, 3)
+) partitioned by spec (col_i) stored as iceberg tblproperties ('write.format.default' = 'parquet');
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+insert into
+ ice_is_null_pred_pd
+values
+ (0, NULL, 2.71820, '_1700-01-01 00:00:00', '1400-01-01 00:00:00', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+ (1, 12345678901, NULL, 'A1700-01-01 00:00:00', '1400-01-01 01:01:01', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+ (2, 12345678902, 2.71822, NULL, '1400-01-01 02:02:02', DATE'1400-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+ (3, 12345678903, 2.71823, '1700-01-01 00:00:00', '1500-01-01 00:00:00', DATE'1500-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+ (4, 12345678904, 2.71824, '1700-01-01 00:00:00', NULL, DATE'1500-01-01', 123.450, 1234567890.120, 1234567890123456789.010),
+ (5, 12345678905, 2.71825, '1700-01-01 00:00:00', '1500-01-01 02:02:02', NULL, 123.450, 1234567890.120, 1234567890123456789.010),
+ (6, 12345678906, 2.71826, '1700-01-01 00:00:00', '1600-01-01 00:00:00', DATE'1600-01-01', NULL, 1234567890.120, 1234567890123456789.010),
+ (7, 12345678907, 2.71827, '1700-01-01 00:00:00', '1600-01-01 01:01:01', DATE'1600-01-01', 123.450, NULL, 1234567890123456789.010),
+ (8, 12345678908, 2.71828, '1700-01-01 00:00:00', '1600-01-01 02:02:02', DATE'1600-01-01', 123.450, 1234567890.120, NULL);
+select count(1) from ice_is_null_pred_pd;
+---- RESULTS
+9
+---- TYPES
+BIGINT
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 0
+aggregation(SUM, NumFileMetadataRead): 9
+====
+---- QUERY
+show files in ice_is_null_pred_pd;
+---- RESULTS
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=0/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=1/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=2/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=3/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=4/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=5/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=6/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=7/.*.0.parq','.*',''
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_is_null_pred_pd/data/col_i=8/.*.0.parq','.*',''
+---- TYPES
+STRING, STRING, STRING
+====
+---- QUERY
+# Start testing predicate push-down for bigint column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_bi is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_bi as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_bi as TIMESTAMP) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_bi + 1 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_bi is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for double column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_db is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_db as decimal(6,5)) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_db is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for string column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_str is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ col_i
+from
+ ice_is_null_pred_pd
+where
+ cast(col_str as TIMESTAMP) is null
+order by 1;
+---- RESULTS
+0
+1
+2
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_str is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for timestamp column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_ts is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_ts as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_ts is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for date column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dt is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_dt as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dt is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(9,3) column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc1 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_dc1 as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc1 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(18,3) column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc2 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_dc2 as double) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc2 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
+---- QUERY
+# Start testing predicate push-down for decimal(38,3) column
+# The IS_NULL predicate matches 1 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc3 is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+====
+---- QUERY
+# The IS_NULL predicate will not push down
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ cast(col_dc3 as string) is null;
+---- RESULTS
+1
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 9
+====
+---- QUERY
+# The NOT_NULL predicate matches 8 row group
+select
+ count(1)
+from
+ ice_is_null_pred_pd
+where
+ col_dc3 is not null;
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 8
+====
\ No newline at end of file
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 3a64810d5..727540449 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -617,3 +617,8 @@ class TestIcebergTable(ImpalaTestSuite):
def test_in_predicate_push_down(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-in-predicate-push-down', vector,
use_db=unique_database)
+
+ @pytest.mark.execute_serially
+ def test_is_null_predicate_push_down(self, vector, unique_database):
+ self.run_test_case('QueryTest/iceberg-is-null-predicate-push-down', vector,
+ use_db=unique_database)