You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2022/04/28 20:38:32 UTC

[arrow-datafusion] branch master updated: Introduce new `DataFusionError::SchemaError` type (#2371)

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new af6be6e8c Introduce new `DataFusionError::SchemaError` type (#2371)
af6be6e8c is described below

commit af6be6e8c42ff4309b179113d66c55c395b7005c
Author: Andy Grove <an...@gmail.com>
AuthorDate: Thu Apr 28 14:38:27 2022 -0600

    Introduce new `DataFusionError::SchemaError` type (#2371)
---
 datafusion/common/src/dfschema.rs           | 84 +++++++++++++++--------------
 datafusion/common/src/error.rs              | 72 ++++++++++++++++++++++++-
 datafusion/common/src/lib.rs                |  2 +-
 datafusion/core/src/logical_plan/builder.rs | 29 +++++-----
 datafusion/core/src/sql/planner.rs          |  2 +-
 5 files changed, 132 insertions(+), 57 deletions(-)

diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 33c4193e2..209c5e3ed 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -22,7 +22,7 @@ use std::collections::{HashMap, HashSet};
 use std::convert::TryFrom;
 use std::sync::Arc;
 
-use crate::error::{DataFusionError, Result};
+use crate::error::{DataFusionError, Result, SchemaError};
 use crate::Column;
 
 use arrow::compute::can_cast_types;
@@ -67,16 +67,19 @@ impl DFSchema {
         for field in &fields {
             if let Some(qualifier) = field.qualifier() {
                 if !qualified_names.insert((qualifier, field.name())) {
-                    return Err(DataFusionError::Plan(format!(
-                        "Schema contains duplicate qualified field name '{}'",
-                        field.qualified_name()
-                    )));
+                    return Err(DataFusionError::SchemaError(
+                        SchemaError::DuplicateQualifiedField {
+                            qualifier: qualifier.to_string(),
+                            name: field.name().to_string(),
+                        },
+                    ));
                 }
             } else if !unqualified_names.insert(field.name()) {
-                return Err(DataFusionError::Plan(format!(
-                    "Schema contains duplicate unqualified field name '{}'",
-                    field.name()
-                )));
+                return Err(DataFusionError::SchemaError(
+                    SchemaError::DuplicateUnqualifiedField {
+                        name: field.name().to_string(),
+                    },
+                ));
             }
         }
 
@@ -94,11 +97,12 @@ impl DFSchema {
         });
         for (qualifier, name) in &qualified_names {
             if unqualified_names.contains(name) {
-                return Err(DataFusionError::Plan(format!(
-                    "Schema contains qualified field name '{}.{}' \
-                    and unqualified field name '{}' which would be ambiguous",
-                    qualifier, name, name
-                )));
+                return Err(DataFusionError::SchemaError(
+                    SchemaError::AmbiguousReference {
+                        qualifier: Some(qualifier.to_string()),
+                        name: name.to_string(),
+                    },
+                ));
             }
         }
         Ok(Self { fields, metadata })
@@ -178,11 +182,11 @@ impl DFSchema {
             }
         }
 
-        Err(DataFusionError::Plan(format!(
-            "No field named '{}'. Valid fields are {}.",
-            name,
-            self.get_field_names()
-        )))
+        Err(DataFusionError::SchemaError(SchemaError::FieldNotExist {
+            qualifier: None,
+            name: name.to_string(),
+            valid_fields: Some(self.get_field_names()),
+        }))
     }
 
     pub fn index_of_column_by_name(
@@ -206,12 +210,11 @@ impl DFSchema {
             })
             .map(|(idx, _)| idx);
         match matches.next() {
-            None => Err(DataFusionError::Plan(format!(
-                "No field named '{}.{}'. Valid fields are {}.",
-                qualifier.unwrap_or("<unqualified>"),
-                name,
-                self.get_field_names()
-            ))),
+            None => Err(DataFusionError::SchemaError(SchemaError::FieldNotExist {
+                qualifier: qualifier.map(|s| s.to_string()),
+                name: name.to_string(),
+                valid_fields: Some(self.get_field_names()),
+            })),
             Some(idx) => match matches.next() {
                 None => Ok(idx),
                 // found more than one matches
@@ -262,16 +265,18 @@ impl DFSchema {
     pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> {
         let matches = self.fields_with_unqualified_name(name);
         match matches.len() {
-            0 => Err(DataFusionError::Plan(format!(
-                "No field with unqualified name '{}'. Valid fields are {}.",
-                name,
-                self.get_field_names()
-            ))),
+            0 => Err(DataFusionError::SchemaError(SchemaError::FieldNotExist {
+                qualifier: None,
+                name: name.to_string(),
+                valid_fields: Some(self.get_field_names()),
+            })),
             1 => Ok(matches[0]),
-            _ => Err(DataFusionError::Plan(format!(
-                "Ambiguous reference to field named '{}'",
-                name
-            ))),
+            _ => Err(DataFusionError::SchemaError(
+                SchemaError::AmbiguousReference {
+                    qualifier: None,
+                    name: name.to_string(),
+                },
+            )),
         }
     }
 
@@ -314,7 +319,7 @@ impl DFSchema {
             .try_for_each(|(l_field, r_field)| {
                 if !can_cast_types(r_field.data_type(), l_field.data_type()) {
                     Err(DataFusionError::Plan(
-                        format!("Column {} (type: {}) is not compatible wiht column {} (type: {})",
+                        format!("Column {} (type: {}) is not compatible with column {} (type: {})",
                             r_field.name(),
                             r_field.data_type(),
                             l_field.name(),
@@ -350,7 +355,7 @@ impl DFSchema {
     }
 
     /// Get comma-separated list of field names for use in error messages
-    fn get_field_names(&self) -> String {
+    fn get_field_names(&self) -> Vec<String> {
         self.fields
             .iter()
             .map(|f| match f.qualifier() {
@@ -358,7 +363,6 @@ impl DFSchema {
                 None => format!("'{}'", f.name()),
             })
             .collect::<Vec<_>>()
-            .join(", ")
     }
 
     /// Get metadata of this schema
@@ -679,7 +683,7 @@ mod tests {
         let join = left.join(&right);
         assert!(join.is_err());
         assert_eq!(
-            "Error during planning: Schema contains duplicate \
+            "Schema error: Schema contains duplicate \
         qualified field name \'t1.c0\'",
             &format!("{}", join.err().unwrap())
         );
@@ -693,7 +697,7 @@ mod tests {
         let join = left.join(&right);
         assert!(join.is_err());
         assert_eq!(
-            "Error during planning: Schema contains duplicate \
+            "Schema error: Schema contains duplicate \
         unqualified field name \'c0\'",
             &format!("{}", join.err().unwrap())
         );
@@ -728,7 +732,7 @@ mod tests {
         let join = left.join(&right);
         assert!(join.is_err());
         assert_eq!(
-            "Error during planning: Schema contains qualified \
+            "Schema error: Schema contains qualified \
         field name \'t1.c0\' and unqualified field name \'c0\' which would be ambiguous",
             &format!("{}", join.err().unwrap())
         );
diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs
index 4a82ac3e9..5e784f4ac 100644
--- a/datafusion/common/src/error.rs
+++ b/datafusion/common/src/error.rs
@@ -62,8 +62,11 @@ pub enum DataFusionError {
     // This error is raised when one of those invariants is not verified during execution.
     Internal(String),
     /// This error happens whenever a plan is not valid. Examples include
-    /// impossible casts, schema inference not possible and non-unique column names.
+    /// impossible casts.
     Plan(String),
+    /// This error happens with schema-related errors, such as schema inference not possible
+    /// and non-unique column names.
+    SchemaError(SchemaError),
     /// Error returned during execution of the query.
     /// Examples include files not found, errors in parsing certain types.
     Execution(String),
@@ -78,6 +81,70 @@ pub enum DataFusionError {
     JITError(ModuleError),
 }
 
+/// Schema-related errors
+#[derive(Debug)]
+pub enum SchemaError {
+    /// Schema contains a (possibly) qualified and unqualified field with same unqualified name
+    AmbiguousReference {
+        qualifier: Option<String>,
+        name: String,
+    },
+    /// Schema contains duplicate qualified field name
+    DuplicateQualifiedField { qualifier: String, name: String },
+    /// Schema contains duplicate unqualified field name
+    DuplicateUnqualifiedField { name: String },
+    /// No field with this name
+    FieldNotExist {
+        qualifier: Option<String>,
+        name: String,
+        valid_fields: Option<Vec<String>>,
+    },
+}
+
+impl Display for SchemaError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::FieldNotExist {
+                qualifier,
+                name,
+                valid_fields,
+            } => {
+                write!(f, "No field named ")?;
+                if let Some(q) = qualifier {
+                    write!(f, "'{}.{}'", q, name)?;
+                } else {
+                    write!(f, "'{}'", name)?;
+                }
+                if let Some(field_names) = valid_fields {
+                    write!(f, ". Valid fields are {}", field_names.join(", "))?;
+                }
+                write!(f, ".")
+            }
+            Self::DuplicateQualifiedField { qualifier, name } => {
+                write!(
+                    f,
+                    "Schema contains duplicate qualified field name '{}.{}'",
+                    qualifier, name
+                )
+            }
+            Self::DuplicateUnqualifiedField { name } => {
+                write!(
+                    f,
+                    "Schema contains duplicate unqualified field name '{}'",
+                    name
+                )
+            }
+            Self::AmbiguousReference { qualifier, name } => {
+                if let Some(q) = qualifier {
+                    write!(f, "Schema contains qualified field name '{}.{}' and unqualified field name '{}' which would be ambiguous", q, name, name)
+                } else {
+                    write!(f, "Ambiguous reference to unqualified field '{}'", name)
+                }
+            }
+        }
+    }
+}
+
 impl From<io::Error> for DataFusionError {
     fn from(e: io::Error) -> Self {
         DataFusionError::IoError(e)
@@ -159,6 +226,9 @@ impl Display for DataFusionError {
             DataFusionError::Plan(ref desc) => {
                 write!(f, "Error during planning: {}", desc)
             }
+            DataFusionError::SchemaError(ref desc) => {
+                write!(f, "Schema error: {}", desc)
+            }
             DataFusionError::Execution(ref desc) => {
                 write!(f, "Execution error: {}", desc)
             }
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index d39020f72..824b0ca6c 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -24,5 +24,5 @@ mod scalar;
 
 pub use column::Column;
 pub use dfschema::{DFField, DFSchema, DFSchemaRef, ExprSchema, ToDFSchema};
-pub use error::{DataFusionError, Result};
+pub use error::{DataFusionError, Result, SchemaError};
 pub use scalar::{ScalarType, ScalarValue};
diff --git a/datafusion/core/src/logical_plan/builder.rs b/datafusion/core/src/logical_plan/builder.rs
index 0b0fdebb6..4f00bec60 100644
--- a/datafusion/core/src/logical_plan/builder.rs
+++ b/datafusion/core/src/logical_plan/builder.rs
@@ -1202,6 +1202,7 @@ pub(crate) fn expand_qualified_wildcard(
 #[cfg(test)]
 mod tests {
     use arrow::datatypes::{DataType, Field};
+    use datafusion_common::SchemaError;
     use datafusion_expr::expr_fn::exists;
 
     use crate::logical_plan::StringifiedPlan;
@@ -1428,16 +1429,16 @@ mod tests {
         .project(vec![col("id"), col("first_name").alias("id")]);
 
         match plan {
-            Err(DataFusionError::Plan(e)) => {
-                assert_eq!(
-                    e,
-                    "Schema contains qualified field name 'employee_csv.id' \
-                    and unqualified field name 'id' which would be ambiguous"
-                );
+            Err(DataFusionError::SchemaError(SchemaError::AmbiguousReference {
+                qualifier,
+                name,
+            })) => {
+                assert_eq!("employee_csv", qualifier.unwrap().as_str());
+                assert_eq!("id", &name);
                 Ok(())
             }
             _ => Err(DataFusionError::Plan(
-                "Plan should have returned an DataFusionError::Plan".to_string(),
+                "Plan should have returned an DataFusionError::SchemaError".to_string(),
             )),
         }
     }
@@ -1454,16 +1455,16 @@ mod tests {
         .aggregate(vec![col("state")], vec![sum(col("salary")).alias("state")]);
 
         match plan {
-            Err(DataFusionError::Plan(e)) => {
-                assert_eq!(
-                    e,
-                    "Schema contains qualified field name 'employee_csv.state' and \
-                    unqualified field name 'state' which would be ambiguous"
-                );
+            Err(DataFusionError::SchemaError(SchemaError::AmbiguousReference {
+                qualifier,
+                name,
+            })) => {
+                assert_eq!("employee_csv", qualifier.unwrap().as_str());
+                assert_eq!("state", &name);
                 Ok(())
             }
             _ => Err(DataFusionError::Plan(
-                "Plan should have returned an DataFusionError::Plan".to_string(),
+                "Plan should have returned an DataFusionError::SchemaError".to_string(),
             )),
         }
     }
diff --git a/datafusion/core/src/sql/planner.rs b/datafusion/core/src/sql/planner.rs
index 253d3f454..b34b78d5f 100644
--- a/datafusion/core/src/sql/planner.rs
+++ b/datafusion/core/src/sql/planner.rs
@@ -3764,7 +3764,7 @@ mod tests {
         let err = logical_plan(sql).expect_err("query should have failed");
         assert_eq!(
             "Plan(\"Column Int64(1) (type: Int64) is \
-            not compatible wiht column IntervalMonthDayNano\
+            not compatible with column IntervalMonthDayNano\
             (\\\"950737950189618795196236955648\\\") \
             (type: Interval(MonthDayNano))\")",
             format!("{:?}", err)