You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "Jefffrey (via GitHub)" <gi...@apache.org> on 2023/02/20 10:16:29 UTC
[GitHub] [arrow-datafusion] Jefffrey commented on a diff in pull request #5343: Support catalog.schema.table.column in SQL SELECT and WHERE

Jefffrey commented on code in PR #5343:
URL: https://github.com/apache/arrow-datafusion/pull/5343#discussion_r1111730595


##########
datafusion/common/src/dfschema.rs:
##########
@@ -573,21 +580,21 @@ impl ExprSchema for DFSchema {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct DFField {
     /// Optional qualifier (usually a table or relation name)
-    qualifier: Option<String>,
+    qualifier: Option<OwnedTableReference>,
     /// Arrow field definition
     field: Field,
 }
 
 impl DFField {
     /// Creates a new `DFField`
-    pub fn new(
-        qualifier: Option<&str>,
+    pub fn new<R: Into<OwnedTableReference>>(
+        qualifier: Option<R>,

Review Comment:
   originally had this way to try ease the breaking change, since would still support Option<&str>, but can see will cause pain if trying to pass in just None itself. maybe its better to just have this as Option<OwnedTableReference> and have the callers call into() (or maintain old method and create a new one?)



##########
datafusion/common/src/error.rs:
##########
@@ -120,29 +121,29 @@ macro_rules! plan_err {
 #[derive(Debug)]
 pub enum SchemaError {
     /// Schema contains a (possibly) qualified and unqualified field with same unqualified name
-    AmbiguousReference {
-        qualifier: Option<String>,
+    AmbiguousReference { field: Column },
+    /// Schema contains duplicate qualified field name
+    DuplicateQualifiedField {
+        qualifier: OwnedTableReference,
         name: String,
     },
-    /// Schema contains duplicate qualified field name
-    DuplicateQualifiedField { qualifier: String, name: String },
     /// Schema contains duplicate unqualified field name
     DuplicateUnqualifiedField { name: String },
     /// No field with this name
     FieldNotFound {
-        field: Column,
+        field: Box<Column>,

Review Comment:
   
   
   otherwise clippy lint about large Err variant appeared
   



##########
datafusion/common/src/dfschema.rs:
##########
@@ -677,7 +687,7 @@ mod tests {
         // lookup with unqualified name "t1.c0"
         let err = schema.index_of_column(&col).err().unwrap();
         assert_eq!(
-            "Schema error: No field named 't1.c0'. Valid fields are 't1'.'c0', 't1'.'c1'.",
+            r#"Schema error: No field named "t1.c0". Valid fields are "t1"."c0", "t1"."c1"."#,

Review Comment:
   this was a separate change i decided to add, mainly since i believe having quoted identifiers allows for more robust messages as itll properly account for any special characters within and allows users to clearly delineate identifiers in that case
   
   e.g. in old way a user could have an ident like `t1'.'t2` which would show in error as `'t1'.'t2'` even though it is a single ident, it looks like two 



##########
datafusion/common/src/column.rs:
##########
@@ -27,15 +28,18 @@ use std::sync::Arc;
 /// A named reference to a qualified field in a schema.
 #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub struct Column {
-    /// relation/table name.
-    pub relation: Option<String>,
+    /// relation/table reference.
+    pub relation: Option<OwnedTableReference>,
     /// field/column name.
     pub name: String,
 }
 
 impl Column {
     /// Create Column from optional qualifier and name
-    pub fn new(relation: Option<impl Into<String>>, name: impl Into<String>) -> Self {
+    pub fn new(
+        relation: Option<impl Into<OwnedTableReference>>,

Review Comment:
   by having this as Into<OwnedTableReference>, means if passing in a string it will be parsed. this has the side-effect of if passing in just a string like `C1` for the column name, itll be normalized to `c1` (wheres previously it wasn't, since was passed straight through)
   
   if this breaking behaviour is too large, can keep the old method (and just create an `OwnedTableReference::Bare` from it) and have a new method which accepts `Option<OwnedTableReference>` for the relation
   
   see documentation change that was needed due to `col(...)` behaviour changing for reference



##########
datafusion/sql/src/expr/identifier.rs:
##########
@@ -69,44 +105,100 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                 })?;
             Ok(Expr::ScalarVariable(ty, var_names))
         } else {
-            // only support "schema.table" type identifiers here
-            let (name, relation) = match idents_to_table_reference(
-                ids,
-                self.options.enable_ident_normalization,
-            )? {
-                OwnedTableReference::Partial { schema, table } => (table, schema),
-                r @ OwnedTableReference::Bare { .. }
-                | r @ OwnedTableReference::Full { .. } => {
-                    return Err(DataFusionError::Plan(format!(
-                        "Unsupported compound identifier '{r:?}'",
-                    )));
-                }
-            };
+            let ids = ids
+                .into_iter()
+                .map(|id| {
+                    if self.options.enable_ident_normalization {
+                        normalize_ident(id)
+                    } else {
+                        id.value
+                    }
+                })
+                .collect::<Vec<_>>();
 
-            // Try and find the reference in schema
-            match schema.field_with_qualified_name(&relation, &name) {
-                Ok(_) => {
-                    // found an exact match on a qualified name so this is a table.column identifier
-                    Ok(Expr::Column(Column {
-                        relation: Some(relation),
-                        name,
-                    }))
+            // Possibilities we search with, in order from top to bottom for each len:

Review Comment:
   main  fix for original issue is in here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org