You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2022/06/10 13:15:11 UTC

[impala] branch master updated: IMPALA-11346: Migrated partitioned Iceberg tables might return ERROR when WHERE condition is used on partition column

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a1536bd1 IMPALA-11346: Migrated partitioned Iceberg tables might return ERROR when WHERE condition is used on partition column
1a1536bd1 is described below

commit 1a1536bd1d0162a168877a6f33dd75f9544a82f3
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Thu Jun 9 17:21:28 2022 +0200

    IMPALA-11346: Migrated partitioned Iceberg tables might return ERROR when WHERE condition is used on partition column
    
    Identity-partitioned columns are not necessarily stored in the data
    files. E.g. when we migrate a legacy partitioned table to Iceberg
    without rewriting the data files, the partition columns won't be
    present in the files.
    
    The Parquet scanner does a few optimizations to eliminate row groups,
    e.g. filtering based on stats, bloom filters, etc. When a column that
    has a predicate on it is not present in the data file, it is assumed
    that the whole row group doesn't pass the filtering criteria.
    
    But for Iceberg, some files might contain the partition columns while
    other files don't, so we need to prepare the scanners to handle
    such cases.
    
    The ORC scanner doesn't have that many optimizations, so it didn't
    run into this issue.
    
    Testing:
     * e2e tests
    
    Change-Id: Ie706317888981f634d792fb570f3eab1ec11a4f4
    Reviewed-on: http://gerrit.cloudera.org:8080/18605
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Reviewed-by: Tamas Mate <tm...@apache.org>
    Reviewed-by: <li...@sensorsdata.cn>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/file-metadata-utils.cc                 |   1 -
 be/src/exec/parquet/hdfs-parquet-scanner.cc        |   4 +
 .../queries/QueryTest/iceberg-migrated-tables.test | 138 +++++++++++++++++++++
 3 files changed, 142 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/file-metadata-utils.cc b/be/src/exec/file-metadata-utils.cc
index 467b522da..2e21bce33 100644
--- a/be/src/exec/file-metadata-utils.cc
+++ b/be/src/exec/file-metadata-utils.cc
@@ -139,7 +139,6 @@ void FileMetadataUtils::AddIcebergColumns(MemPool* mem_pool, Tuple** template_tu
 bool FileMetadataUtils::IsValuePartitionCol(const SlotDescriptor* slot_desc) {
   DCHECK(context_ != nullptr);
   DCHECK(file_desc_ != nullptr);
-  if (slot_desc->parent() != scan_node_->tuple_desc()) return false;
   if (slot_desc->col_pos() < scan_node_->num_partition_keys() &&
       !slot_desc->IsVirtual()) {
     return true;
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 8718331a9..a32294c73 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -566,6 +566,7 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
     RETURN_IF_ERROR(ResolveSchemaForStatFiltering(slot_desc, &missing_field, &node));
 
     if (missing_field) {
+      if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
       // We are selecting a column that is not in the file. We would set its slot to NULL
       // during the scan, so any predicate would evaluate to false. Return early. NULL
       // comparisons cannot happen here, since predicates with NULL literals are filtered
@@ -707,6 +708,7 @@ Status HdfsParquetScanner::EvaluateOverlapForRowGroup(
     RETURN_IF_ERROR(ResolveSchemaForStatFiltering(slot_desc, &missing_field, &node));
 
     if (missing_field) {
+      if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
       // We are selecting a column that is not in the file. We would set its slot to NULL
       // during the scan, so any predicate would evaluate to false. Return early. NULL
       // comparisons cannot happen here, since predicates with NULL literals are filtered
@@ -1981,6 +1983,8 @@ Status HdfsParquetScanner::CreateColIdx2EqConjunctMap() {
       }
 
       if (missing_field) {
+        if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
+
         return Status(Substitute(
             "Unable to find SchemaNode for path '$0' in the schema of file '$1'.",
             PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path()), filename()));
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test
index d1549c529..ba278fcd4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test
@@ -11,6 +11,54 @@ select * from functional_parquet.iceberg_alltypes_part
 INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
 ====
 ---- QUERY
+# Queries with WHERE clauses
+select * from functional_parquet.iceberg_alltypes_part
+where i = 1;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part
+where i = 3;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part
+where p_int = 1;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part
+where p_int = 2;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part
+where p_bool = true;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part
+where p_bool = false;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
 # Read only the partition columns.
 select p_bool, p_int, p_bigint, p_float,
        p_double, p_decimal, p_date, p_string
@@ -33,6 +81,54 @@ select * from functional_parquet.iceberg_alltypes_part_orc
 INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
 ====
 ---- QUERY
+# Queries with WHERE clauses
+select * from functional_parquet.iceberg_alltypes_part_orc
+where i = 1;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part_orc
+where i = 3;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part_orc
+where p_int = 1;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part_orc
+where p_int = 2;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part_orc
+where p_bool = true;
+---- RESULTS
+1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
+select * from functional_parquet.iceberg_alltypes_part_orc
+where p_bool = false;
+---- RESULTS
+---- TYPES
+INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
+====
+---- QUERY
 # Read only the partition columns.
 select p_bool, p_int, p_bigint, p_float,
        p_double, p_decimal, p_date, p_string
@@ -58,6 +154,23 @@ select * from functional_parquet.iceberg_legacy_partition_schema_evolution
 BIGINT, DOUBLE, DECIMAL, INT
 ====
 ---- QUERY
+select * from functional_parquet.iceberg_legacy_partition_schema_evolution
+where p_int_long = 1;
+---- RESULTS
+1,1.100000023841858,2.718,2
+1,1.100000023841858,3.141,1
+---- TYPES
+BIGINT, DOUBLE, DECIMAL, INT
+====
+---- QUERY
+select * from functional_parquet.iceberg_legacy_partition_schema_evolution
+where p_dec_dec = 2.718;
+---- RESULTS
+1,1.100000023841858,2.718,2
+---- TYPES
+BIGINT, DOUBLE, DECIMAL, INT
+====
+---- QUERY
 # Read only the partition columns.
 select p_int_long, p_float_double, p_dec_dec
 from functional_parquet.iceberg_legacy_partition_schema_evolution;
@@ -82,6 +195,23 @@ select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
 BIGINT, DOUBLE, DECIMAL, INT
 ====
 ---- QUERY
+select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
+where p_int_long = 1;
+---- RESULTS
+1,1.100000023841858,2.718,2
+1,1.100000023841858,3.141,1
+---- TYPES
+BIGINT, DOUBLE, DECIMAL, INT
+====
+---- QUERY
+select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
+where p_dec_dec = 2.718;
+---- RESULTS
+1,1.100000023841858,2.718,2
+---- TYPES
+BIGINT, DOUBLE, DECIMAL, INT
+====
+---- QUERY
 # Read only the partition columns.
 select p_int_long, p_float_double, p_dec_dec
 from functional_parquet.iceberg_legacy_partition_schema_evolution_orc;
@@ -106,3 +236,11 @@ select * from only_part_cols;
 ---- TYPES
 INT, STRING
 ====
+---- QUERY
+select * from only_part_cols
+where i = 2 and s = 's'
+---- RESULTS
+2,'s'
+---- TYPES
+INT, STRING
+====