You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by la...@apache.org on 2024/01/14 12:40:53 UTC
(impala) 01/02: IMPALA-12706: Fix nested struct querying for Iceberg metadata tables

This is an automated email from the ASF dual-hosted git repository.

laszlog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 74617537b5c805327349ef0ac5c79b84dc1e786d
Author: Tamas Mate <tm...@apache.org>
AuthorDate: Thu Jan 11 14:26:54 2024 +0100

    IMPALA-12706: Fix nested struct querying for Iceberg metadata tables
    
    This commit fixes a DCHECK failure when querying a struct inside a
    struct. The previous field accessor creation logic was trying to find
    the ColumnDescriptor for a struct inside a struct and hit a DCHECK
    because there are no ColumnDescriptors for struct fields. The logic
    has been reworked to only use ColumnDescriptors for top level columns.
    
    Testing:
     - Added E2E test to cover this case
    
    Change-Id: Iadd029a4edc500bd8d8fca3f958903c2dbe09e8e
    Reviewed-on: http://gerrit.cloudera.org:8080/20883
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../iceberg-metadata/iceberg-metadata-scan-node.cc | 33 ++++++++++------------
 .../iceberg-metadata/iceberg-metadata-scan-node.h  |  8 +++---
 .../queries/QueryTest/iceberg-metadata-tables.test | 10 +++++++
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.cc b/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.cc
index d779992fb..5210b7cd1 100644
--- a/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.cc
+++ b/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.cc
@@ -91,30 +91,27 @@ Status IcebergMetadataScanNode::CreateFieldAccessors() {
   JNIEnv* env = JniUtil::GetJNIEnv();
   if (env == nullptr) return Status("Failed to get/create JVM");
   for (SlotDescriptor* slot_desc: tuple_desc_->slots()) {
-    if (slot_desc->type().IsStructType()) {
-      // Get the top level struct's field id from the ColumnDescriptor then recursively
-      // get the field ids for struct fields
-      int field_id = tuple_desc_->table_desc()->GetColumnDesc(slot_desc).field_id();
-      RETURN_IF_ERROR(AddAccessorForFieldId(env, field_id, slot_desc->id()));
-      RETURN_IF_ERROR(CreateFieldAccessors(env, slot_desc));
-    } else if (slot_desc->col_path().size() > 1) {
-      DCHECK(!slot_desc->type().IsComplexType());
-      // Slot that is child of a struct without tuple, can occur when a struct member is
-      // in the select list. ColumnType has a tree structure, and this loop finds the
-      // STRUCT node that stores the primitive type. Because, that struct node has the
-      // field id list of its childs.
+    int field_id = -1;
+    if (slot_desc->col_path().size() == 1) {
+      // Top level slots have ColumnDescriptors that store the field ids.
+      field_id = tuple_desc_->table_desc()->GetColumnDesc(slot_desc).field_id();
+    } else {
+      // Non top level slots are fields of a nested type. This code path is to handle
+      // slots that does not have their nested type's tuple available.
+      // This loop finds the struct ColumnType node that stores the slot as it has the
+      // field id list of its children.
       int root_type_index = slot_desc->col_path()[0];
       ColumnType* current_type = &const_cast<ColumnType&>(
           tuple_desc_->table_desc()->col_descs()[root_type_index].type());
       for (int i = 1; i < slot_desc->col_path().size() - 1; ++i) {
         current_type = &current_type->children[slot_desc->col_path()[i]];
       }
-      int field_id = current_type->field_ids[slot_desc->col_path().back()];
-      RETURN_IF_ERROR(AddAccessorForFieldId(env, field_id, slot_desc->id()));
-    } else {
-      // For primitives in the top level tuple, use the ColumnDescriptor
-      int field_id = tuple_desc_->table_desc()->GetColumnDesc(slot_desc).field_id();
-      RETURN_IF_ERROR(AddAccessorForFieldId(env, field_id, slot_desc->id()));
+      field_id = current_type->field_ids[slot_desc->col_path().back()];
+    }
+    DCHECK_NE(field_id, -1);
+    RETURN_IF_ERROR(AddAccessorForFieldId(env, field_id, slot_desc->id()));
+    if (slot_desc->type().IsStructType()) {
+      RETURN_IF_ERROR(CreateFieldAccessors(env, slot_desc));
     }
   }
   return Status::OK();
diff --git a/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.h b/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.h
index 64f1d3aa5..ae15efbd4 100644
--- a/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.h
+++ b/be/src/exec/iceberg-metadata/iceberg-metadata-scan-node.h
@@ -121,10 +121,10 @@ class IcebergMetadataScanNode : public ScanNode {
   Status GetCatalogTable(jobject* jtable);
 
   /// Populates the jaccessors_ map by creating the accessors for the columns in the JVM.
-  /// To create a field accessor for a column the Iceberg field id is needed. For
-  /// primitive type columns that are not a field of a struct, this can be found in the
-  /// ColumnDescriptor. However, ColumnDescriptors are not available for struct fields,
-  /// in this case the SlotDescriptor can be used.
+  /// To create a field accessor for a column the Iceberg field id is needed. For columns
+  /// that are not a field of a struct, this can be found in the ColumnDescriptor.
+  /// However, ColumnDescriptors are not available for struct fields, in this case the
+  /// ColumnType of the SlotDescriptor can be used.
   Status CreateFieldAccessors();
 
   /// Recursive part of the Accessor collection, when there is a struct in the tuple.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test
index 0f58dae99..a559e13b6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test
@@ -470,6 +470,16 @@ select readable_metrics from functional_parquet.iceberg_query_metadata.entries;
 STRING
 ====
 ---- QUERY
+select readable_metrics.i from functional_parquet.iceberg_query_metadata.entries;
+---- RESULTS
+'{"column_size":47,"value_count":1,"null_value_count":0,"nan_value_count":null,"lower_bound":3,"upper_bound":3}'
+'{"column_size":47,"value_count":1,"null_value_count":0,"nan_value_count":null,"lower_bound":2,"upper_bound":2}'
+'{"column_size":47,"value_count":1,"null_value_count":0,"nan_value_count":null,"lower_bound":1,"upper_bound":1}'
+'{"column_size":null,"value_count":null,"null_value_count":null,"nan_value_count":null,"lower_bound":null,"upper_bound":null}'
+---- TYPES
+STRING
+====
+---- QUERY
 select snapshot_id, readable_metrics from functional_parquet.iceberg_query_metadata.entries;
 ---- RESULTS
 row_regex:[1-9]\d*|0,'{"i":{"column_size":47,"value_count":1,"null_value_count":0,"nan_value_count":null,"lower_bound":3,"upper_bound":3}}'