You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/10/05 21:26:50 UTC

[arrow-datafusion] branch master updated: Use column aliases specified by `WITH` statements (#3717)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 23682f6c4 Use column aliases specified by `WITH` statements (#3717)
23682f6c4 is described below

commit 23682f6c40c25f259c4ba97a637affca05af2052
Author: Batuhan Taskaya <is...@gmail.com>
AuthorDate: Thu Oct 6 00:26:43 2022 +0300

    Use column aliases specified by `WITH` statements (#3717)
---
 datafusion/sql/src/planner.rs | 121 +++++++++++++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 24 deletions(-)

diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index f3a0c4850..9a4e29228 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -59,8 +59,8 @@ use sqlparser::ast::{
     BinaryOperator, DataType as SQLDataType, DateTimeField, Expr as SQLExpr, FunctionArg,
     FunctionArgExpr, Ident, Join, JoinConstraint, JoinOperator, ObjectName,
     Offset as SQLOffset, Query, Select, SelectItem, SetExpr, SetOperator,
-    ShowCreateObject, ShowStatementFilter, TableFactor, TableWithJoins, TimezoneInfo,
-    TrimWhereField, UnaryOperator, Value, Values as SQLValues,
+    ShowCreateObject, ShowStatementFilter, TableAlias, TableFactor, TableWithJoins,
+    TimezoneInfo, TrimWhereField, UnaryOperator, Value, Values as SQLValues,
 };
 use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption};
 use sqlparser::ast::{ObjectType, OrderByExpr, Statement};
@@ -376,6 +376,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                     &mut ctes.clone(),
                     outer_query_schema,
                 )?;
+
+                // Each `WITH` block can change the column names in the last
+                // projection (e.g. "WITH table(t1, t2) AS SELECT 1, 2").
+                let logical_plan = self.apply_table_alias(logical_plan, cte.alias)?;
+
                 ctes.insert(cte_name, logical_plan);
             }
         }
@@ -785,33 +790,40 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             }
         };
         if let Some(alias) = alias {
-            let columns_alias = alias.clone().columns;
-            if columns_alias.is_empty() {
-                // sqlparser-rs encodes AS t as an empty list of column alias
-                Ok(plan)
-            } else if columns_alias.len() != plan.schema().fields().len() {
-                Err(DataFusionError::Plan(format!(
-                    "Source table contains {} columns but only {} names given as column alias",
-                    plan.schema().fields().len(),
-                    columns_alias.len(),
-                )))
-            } else {
-                Ok(LogicalPlanBuilder::from(plan.clone())
-                    .project_with_alias(
-                        plan.schema().fields().iter().zip(columns_alias.iter()).map(
-                            |(field, ident)| {
-                                col(field.name()).alias(&normalize_ident(ident))
-                            },
-                        ),
-                        Some(normalize_ident(&alias.name)),
-                    )?
-                    .build()?)
-            }
+            self.apply_table_alias(plan, alias)
         } else {
             Ok(plan)
         }
     }
 
+    /// Apply the given TableAlias to the top-level projection.
+    fn apply_table_alias(
+        &self,
+        plan: LogicalPlan,
+        alias: TableAlias,
+    ) -> Result<LogicalPlan> {
+        let columns_alias = alias.clone().columns;
+        if columns_alias.is_empty() {
+            // sqlparser-rs encodes AS t as an empty list of column alias
+            Ok(plan)
+        } else if columns_alias.len() != plan.schema().fields().len() {
+            Err(DataFusionError::Plan(format!(
+                "Source table contains {} columns but only {} names given as column alias",
+                plan.schema().fields().len(),
+                columns_alias.len(),
+            )))
+        } else {
+            Ok(LogicalPlanBuilder::from(plan.clone())
+                .project_with_alias(
+                    plan.schema().fields().iter().zip(columns_alias.iter()).map(
+                        |(field, ident)| col(field.name()).alias(&normalize_ident(ident)),
+                    ),
+                    Some(normalize_ident(&alias.name)),
+                )?
+                .build()?)
+        }
+    }
+
     /// Generate a logic plan from selection clause, the function contain optimization for cross join to inner join
     /// Related PR: <https://github.com/apache/arrow-datafusion/pull/1566>
     fn plan_selection(
@@ -5046,6 +5058,67 @@ mod tests {
         quick_test(sql, expected)
     }
 
+    #[test]
+    fn cte_with_no_column_names() {
+        let sql = "WITH \
+        numbers AS ( \
+            SELECT 1 as a, 2 as b, 3 as c \
+        ) \
+        SELECT * FROM numbers;";
+
+        let expected = "Projection: numbers.a, numbers.b, numbers.c\
+        \n  Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, alias=numbers\
+        \n    EmptyRelation";
+
+        quick_test(sql, expected)
+    }
+
+    #[test]
+    fn cte_with_column_names() {
+        let sql = "WITH \
+        numbers(a, b, c) AS ( \
+            SELECT 1, 2, 3 \
+        ) \
+        SELECT * FROM numbers;";
+
+        let expected = "Projection: numbers.a, numbers.b, numbers.c\
+        \n  Projection: numbers.Int64(1) AS a, numbers.Int64(2) AS b, numbers.Int64(3) AS c, alias=numbers\
+        \n    Projection: Int64(1), Int64(2), Int64(3), alias=numbers\
+        \n      EmptyRelation";
+
+        quick_test(sql, expected)
+    }
+
+    #[test]
+    fn cte_with_column_aliases_precedence() {
+        // The end result should always be what CTE specification says
+        let sql = "WITH \
+        numbers(a, b, c) AS ( \
+            SELECT 1 as x, 2 as y, 3 as z \
+        ) \
+        SELECT * FROM numbers;";
+
+        let expected = "Projection: numbers.a, numbers.b, numbers.c\
+        \n  Projection: numbers.x AS a, numbers.y AS b, numbers.z AS c, alias=numbers\
+        \n    Projection: Int64(1) AS x, Int64(2) AS y, Int64(3) AS z, alias=numbers\
+        \n      EmptyRelation";
+
+        quick_test(sql, expected)
+    }
+
+    #[test]
+    fn cte_unbalanced_number_of_columns() {
+        let sql = "WITH \
+        numbers(a) AS ( \
+            SELECT 1, 2, 3 \
+        ) \
+        SELECT * FROM numbers;";
+
+        let expected = "Error during planning: Source table contains 3 columns but only 1 names given as column alias";
+        let result = logical_plan(sql).err().unwrap();
+        assert_eq!(expected, format!("{}", result));
+    }
+
     #[test]
     fn aggregate_with_rollup() {
         let sql = "SELECT id, state, age, COUNT(*) FROM person GROUP BY id, ROLLUP (state, age)";