Posted to commits@impala.apache.org by st...@apache.org on 2021/07/14 01:49:14 UTC

[impala] 03/03: IMPALA-10703: Fix crash on reading ACID table while printing SchemaPath of tuple/slots.

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 77283d87d8ed4747aa8d2f5caa9f2d4cf751e3ce
Author: Amogh Margoor <am...@cloudera.com>
AuthorDate: Wed Jul 7 10:52:35 2021 -0700

    IMPALA-10703: Fix crash on reading ACID table while printing SchemaPath of tuple/slots.
    
    While reading an ACID ORC file, the SchemaPath of a TupleDescriptor
    or SlotDescriptor is converted to a fully qualified path via
    PrintPath on a few code paths. PrintPath, however, needs the
    non-canonical table path. For a non-ACID table this is the same as
    the SchemaPath of the tuple/slot, but for ACID tables it differs
    because the file schema and the table schema are not the same.
    E.g., the ACID table foo(id int) looks like the following in the file:
    
    {
      operation: int,
      originalTransaction: bigInt,
      bucket: int,
      rowId: bigInt,
      currentTransaction: bigInt,
      row: struct<id: int>
    }
    So the SchemaPath for id will be [5, 0], but PrintPath would not
    understand that. It needs to be converted into the table path [1],
    as the table schema looks like this:
    
    {
      row_id: struct < ...ACID Columns>
      id: int
    }
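
    Roughly, the conversion for 'id' then looks like this (the
    intermediate translated values below are assumed for illustration):

      SchemaPath of the slot for id : [5, 0]   (row struct, then id inside it)
      translated file_col_path      : [5, 0]
      translated table_col_path     : [-1, 1]  (-1: no table-schema counterpart)
      after dropping leading -1s    : [1]      (the path PrintPath expects)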
    
    Testing:
    1. Manually ran queries against functional_orc_def.complextypestbl
       with log level 3. These queries were crashing earlier.
    2. Ran existing regression tests on a DEBUG build for the few changes
       not behind VLOG(3).
    
    Change-Id: Ib7f15c31e0f8fc5d90555d1f2d51313eaffeb074
    Reviewed-on: http://gerrit.cloudera.org:8080/17658
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/hdfs-orc-scanner.cc  | 40 +++++++++++++++++++++++++++++++++++-----
 be/src/exec/orc-metadata-utils.h | 36 ++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index d3dbe2f..abfab5e 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -364,6 +364,31 @@ bool HdfsOrcScanner::IsMissingField(const SlotDescriptor* slot) {
   return missing_field_slots_.find(slot) != missing_field_slots_.end();
 }
 
+// Fetches the fully qualified name for 'col_path' by converting it into the
+// non-canonical table path.
+string PrintColPath(const HdfsTableDescriptor& hdfs_table, const SchemaPath& col_path,
+  const unique_ptr<OrcSchemaResolver>& schema_resolver) {
+  SchemaPath table_col_path, file_col_path;
+  if (col_path.size() > 0) {
+    DCHECK(schema_resolver != nullptr);
+    // Convert 'col_path' to non-canonical table path 'table_col_path'.
+    schema_resolver->TranslateColPaths(col_path, &table_col_path, &file_col_path);
+    auto it = table_col_path.begin();
+    // Remove the leading -1s from 'table_col_path'. A -1 entry stands for an
+    // ACID file-schema construct that has no counterpart in the table schema
+    // and must be skipped.
+    while (it != table_col_path.end()) {
+      if (*it == -1) {
+        it = table_col_path.erase(it);
+      } else {
+        break;
+      }
+    }
+  }
+
+  return PrintPath(hdfs_table, table_col_path);
+}
+
 Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
     list<const orc::Type*>* selected_nodes, stack<const SlotDescriptor*>* pos_slots) {
   const orc::Type* node = nullptr;
@@ -374,7 +399,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
       &pos_field, &missing_field));
   if (missing_field) {
     return Status(Substitute("Could not find nested column '$0' in file '$1'.",
-        PrintPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path()), filename()));
+        PrintColPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path(),
+        schema_resolver_), filename()));
   }
   tuple_to_col_id_.insert({&tuple_desc, node->getColumnId()});
   if (tuple_desc.byte_size() == 0) {
@@ -388,7 +414,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
     // will skip the whole array column. So we select 'c3' for this case.
     selected_type_ids_.push_back(node->getMaximumColumnId());
     VLOG(3) << "Add ORC column " << node->getMaximumColumnId() << " for empty tuple "
-        << PrintPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path());
+        << PrintColPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path(),
+           schema_resolver_);
     return Status::OK();
   }
 
@@ -413,7 +440,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
         // If the collection column is missing, the whole scan range should return 0 rows
         // since we're selecting children column(s) of the collection.
         return Status(Substitute("Could not find nested column '$0' in file '$1'.",
-            PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path()), filename()));
+            PrintColPath(*scan_node_->hdfs_table(), slot_desc->col_path(),
+            schema_resolver_), filename()));
       }
       // In this case, we are selecting a column/subcolumn that is not in the file.
       // Update the template tuple to put a NULL in this slot.
@@ -449,7 +477,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
       RETURN_IF_ERROR(ResolveColumns(*item_tuple_desc, selected_nodes, pos_slots));
     } else {
       VLOG(3) << "Add ORC column " << node->getColumnId() << " for "
-          << PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path());
+          << PrintColPath(*scan_node_->hdfs_table(), slot_desc->col_path(),
+             schema_resolver_);
       selected_nodes->push_back(node);
     }
   }
@@ -518,7 +547,8 @@ Status HdfsOrcScanner::SelectColumns(const TupleDescriptor& tuple_desc) {
     if (HasChildrenSelected(*array_node, selected_type_ids_)) continue;
     selected_type_ids_.push_back(array_node->getMaximumColumnId());
     VLOG(3) << "Add ORC column " << array_node->getMaximumColumnId() << " for "
-        << PrintPath(*scan_node_->hdfs_table(), pos_slot_desc->col_path());
+        << PrintColPath(*scan_node_->hdfs_table(), pos_slot_desc->col_path(),
+           schema_resolver_);
     selected_nodes.push_back(array_node);
   }
 
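The leading -1 stripping in the new PrintColPath() helper can be exercised on its own. A minimal standalone sketch, assuming SchemaPath is a std::vector<int> and reusing the assumed translated value [-1, 1] from the foo(id int) example (the result is printed instead of being passed to PrintPath):

  #include <iostream>
  #include <vector>

  using SchemaPath = std::vector<int>;  // assumption: SchemaPath is a vector of int indexes

  int main() {
    // Assumed output of TranslateColPaths() for 'id' in the ACID example above.
    SchemaPath table_col_path = {-1, 1};
    // Drop the leading -1 entries, mirroring the loop in PrintColPath().
    auto it = table_col_path.begin();
    while (it != table_col_path.end() && *it == -1) {
      it = table_col_path.erase(it);
    }
    // Prints "1": the non-canonical table path that PrintPath() can resolve
    // against the table schema.
    for (int idx : table_col_path) std::cout << idx << ' ';
    std::cout << std::endl;
    return 0;
  }
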
diff --git a/be/src/exec/orc-metadata-utils.h b/be/src/exec/orc-metadata-utils.h
index a600dbf..af23afb 100644
--- a/be/src/exec/orc-metadata-utils.h
+++ b/be/src/exec/orc-metadata-utils.h
@@ -58,24 +58,6 @@ class OrcSchemaResolver {
   /// Returns true if 'col_path' refers to an ACID column.
   bool IsAcidColumn(const SchemaPath& col_path) const;
 
- private:
-  TSchemaResolutionStrategy::type schema_resolution_strategy_;
-
-  /// Resolve column based on position. This only works when the fields in the HMS
-  /// table schema match the file schema (apart from Hive ACID schema differences which
-  /// are being handled).
-  Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
-      bool* pos_field, bool* missing_field) const;
-
-  /// Resolve column based on the Iceberg field ids. This way we will retrieve the
-  /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
-  /// field in the ORC file.
-  Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
-      bool* pos_field, bool* missing_field) const;
-
-  /// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
-  const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;
-
   /// Translates 'col_path' to non-canonical table and file paths. These non-canonical
   /// paths have the same lengths. To achieve that they might contain -1 values that must
   /// be ignored. These paths are useful for tables that have different table and file
@@ -103,6 +85,24 @@ class OrcSchemaResolver {
   void TranslateColPaths(const SchemaPath& col_path,
       SchemaPath* table_col_path, SchemaPath* file_col_path) const;
 
+ private:
+  TSchemaResolutionStrategy::type schema_resolution_strategy_;
+
+  /// Resolve column based on position. This only works when the fields in the HMS
+  /// table schema match the file schema (apart from Hive ACID schema differences which
+  /// are being handled).
+  Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
+      bool* pos_field, bool* missing_field) const;
+
+  /// Resolve column based on the Iceberg field ids. This way we will retrieve the
+  /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
+  /// field in the ORC file.
+  Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
+      bool* pos_field, bool* missing_field) const;
+
+  /// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
+  const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;
+
   SchemaPath GetCanonicalSchemaPath(const SchemaPath& col_path, int last_idx) const;
 
   /// Sets 'is_file_full_acid_' based on the file schema.
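
In the header above, TranslateColPaths() simply moves out of the private section so that the new PrintColPath() helper in hdfs-orc-scanner.cc can call it. A small self-contained sketch of how its two equal-length output paths are meant to be walked in lockstep, again with assumed values for the foo(id int) example and SchemaPath assumed to be std::vector<int>:

  #include <cstdio>
  #include <vector>

  using SchemaPath = std::vector<int>;  // assumption: SchemaPath is a vector of int indexes

  int main() {
    // Assumed results of TranslateColPaths() for column 'id': both paths have the
    // same length, and -1 marks a level that exists in only one of the two schemas.
    SchemaPath table_col_path = {-1, 1};  // the 'row' wrapper has no table counterpart
    SchemaPath file_col_path = {5, 0};    // 'row' struct, then 'id' inside it
    for (size_t i = 0; i < table_col_path.size(); ++i) {
      // Descend each schema only where the entry is not -1.
      if (table_col_path[i] != -1) {
        std::printf("table schema: step into index %d\n", table_col_path[i]);
      }
      if (file_col_path[i] != -1) {
        std::printf("file schema:  step into index %d\n", file_col_path[i]);
      }
    }
    return 0;
  }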