You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/07/14 01:49:14 UTC
[impala] 03/03: IMPALA-10703: Fix crash on reading ACID table while
printing SchemaPath of tuple/slots.
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 77283d87d8ed4747aa8d2f5caa9f2d4cf751e3ce
Author: Amogh Margoor <am...@cloudera.com>
AuthorDate: Wed Jul 7 10:52:35 2021 -0700
IMPALA-10703: Fix crash on reading ACID table while printing SchemaPath of tuple/slots.
While reading an ACID ORC file, the SchemaPath from a TupleDescriptor
or SlotDescriptor is converted to a fully qualified path via
PrintPath on a few codepaths. PrintPath needs a non-canonical table
path though. For a non-ACID table this will be the same as the SchemaPath
of the tuple/slot. However, for ACID tables it will be different, as
the file schema and the table schema are not the same.
E.g., an ACID table foo(id int) will look like the following in the file:
{
operation: int,
originalTransaction: bigInt,
bucket: int,
rowId: bigInt,
currentTransaction: bigInt,
row: struct<id: int>
}
So the SchemaPath for id will be [5, 0], but PrintPath would not
understand that. It needs to be converted into the table path [1],
as the table schema looks like this:
{
row_id: struct < ...ACID Columns>
id: int
}
Testing:
1. Manually ran queries against functional_orc_def.complextypestbl
with log level 3. These queries were crashing earlier.
2. Ran existing regression tests on a DEBUG build for the few changes not
behind VLOG(3).
Change-Id: Ib7f15c31e0f8fc5d90555d1f2d51313eaffeb074
Reviewed-on: http://gerrit.cloudera.org:8080/17658
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/hdfs-orc-scanner.cc | 40 +++++++++++++++++++++++++++++++++++-----
be/src/exec/orc-metadata-utils.h | 36 ++++++++++++++++++------------------
2 files changed, 53 insertions(+), 23 deletions(-)
diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index d3dbe2f..abfab5e 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -364,6 +364,31 @@ bool HdfsOrcScanner::IsMissingField(const SlotDescriptor* slot) {
return missing_field_slots_.find(slot) != missing_field_slots_.end();
}
+// Fetch fully qualified name for 'col_path' by converting it into non-canonical
+// table path.
+string PrintColPath(const HdfsTableDescriptor& hdfs_table, const SchemaPath& col_path,
+ const unique_ptr<OrcSchemaResolver>& schema_resolver) {
+ SchemaPath table_col_path, file_col_path;
+ if (col_path.size() > 0) {
+ DCHECK(schema_resolver != nullptr);
+ // Convert 'col_path' to non-canonical table path 'table_col_path'.
+ schema_resolver->TranslateColPaths(col_path, &table_col_path, &file_col_path);
+ auto it = table_col_path.begin();
+ // remove initial -1s from the table_col_path
+ // -1 is present to represent some of the constructs in ACID table which are not
+ // present in table schema
+ while (it != table_col_path.end()) {
+ if (*it == -1) {
+ it = table_col_path.erase(it);
+ } else {
+ break;
+ }
+ }
+ }
+
+ return PrintPath(hdfs_table, table_col_path);
+}
+
Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
list<const orc::Type*>* selected_nodes, stack<const SlotDescriptor*>* pos_slots) {
const orc::Type* node = nullptr;
@@ -374,7 +399,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
&pos_field, &missing_field));
if (missing_field) {
return Status(Substitute("Could not find nested column '$0' in file '$1'.",
- PrintPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path()), filename()));
+ PrintColPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path(),
+ schema_resolver_), filename()));
}
tuple_to_col_id_.insert({&tuple_desc, node->getColumnId()});
if (tuple_desc.byte_size() == 0) {
@@ -388,7 +414,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
// will skip the whole array column. So we select 'c3' for this case.
selected_type_ids_.push_back(node->getMaximumColumnId());
VLOG(3) << "Add ORC column " << node->getMaximumColumnId() << " for empty tuple "
- << PrintPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path());
+ << PrintColPath(*scan_node_->hdfs_table(), tuple_desc.tuple_path(),
+ schema_resolver_);
return Status::OK();
}
@@ -413,7 +440,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
// If the collection column is missing, the whole scan range should return 0 rows
// since we're selecting children column(s) of the collection.
return Status(Substitute("Could not find nested column '$0' in file '$1'.",
- PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path()), filename()));
+ PrintColPath(*scan_node_->hdfs_table(), slot_desc->col_path(),
+ schema_resolver_), filename()));
}
// In this case, we are selecting a column/subcolumn that is not in the file.
// Update the template tuple to put a NULL in this slot.
@@ -449,7 +477,8 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
RETURN_IF_ERROR(ResolveColumns(*item_tuple_desc, selected_nodes, pos_slots));
} else {
VLOG(3) << "Add ORC column " << node->getColumnId() << " for "
- << PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path());
+ << PrintColPath(*scan_node_->hdfs_table(), slot_desc->col_path(),
+ schema_resolver_);
selected_nodes->push_back(node);
}
}
@@ -518,7 +547,8 @@ Status HdfsOrcScanner::SelectColumns(const TupleDescriptor& tuple_desc) {
if (HasChildrenSelected(*array_node, selected_type_ids_)) continue;
selected_type_ids_.push_back(array_node->getMaximumColumnId());
VLOG(3) << "Add ORC column " << array_node->getMaximumColumnId() << " for "
- << PrintPath(*scan_node_->hdfs_table(), pos_slot_desc->col_path());
+ << PrintColPath(*scan_node_->hdfs_table(), pos_slot_desc->col_path(),
+ schema_resolver_);
selected_nodes.push_back(array_node);
}
diff --git a/be/src/exec/orc-metadata-utils.h b/be/src/exec/orc-metadata-utils.h
index a600dbf..af23afb 100644
--- a/be/src/exec/orc-metadata-utils.h
+++ b/be/src/exec/orc-metadata-utils.h
@@ -58,24 +58,6 @@ class OrcSchemaResolver {
/// Returns true if 'col_path' refers to an ACID column.
bool IsAcidColumn(const SchemaPath& col_path) const;
- private:
- TSchemaResolutionStrategy::type schema_resolution_strategy_;
-
- /// Resolve column based on position. This only works when the fields in the HMS
- /// table schema match the file schema (apart from Hive ACID schema differences which
- /// are being handled).
- Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
- bool* pos_field, bool* missing_field) const;
-
- /// Resolve column based on the Iceberg field ids. This way we will retrieve the
- /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
- /// field in the ORC file.
- Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
- bool* pos_field, bool* missing_field) const;
-
- /// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
- const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;
-
/// Translates 'col_path' to non-canonical table and file paths. These non-canonical
/// paths have the same lengths. To achieve that they might contain -1 values that must
/// be ignored. These paths are useful for tables that have different table and file
@@ -103,6 +85,24 @@ class OrcSchemaResolver {
void TranslateColPaths(const SchemaPath& col_path,
SchemaPath* table_col_path, SchemaPath* file_col_path) const;
+ private:
+ TSchemaResolutionStrategy::type schema_resolution_strategy_;
+
+ /// Resolve column based on position. This only works when the fields in the HMS
+ /// table schema match the file schema (apart from Hive ACID schema differences which
+ /// are being handled).
+ Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
+ bool* pos_field, bool* missing_field) const;
+
+ /// Resolve column based on the Iceberg field ids. This way we will retrieve the
+ /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
+ /// field in the ORC file.
+ Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
+ bool* pos_field, bool* missing_field) const;
+
+ /// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
+ const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;
+
SchemaPath GetCanonicalSchemaPath(const SchemaPath& col_path, int last_idx) const;
/// Sets 'is_file_full_acid_' based on the file schema.