You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2023/06/28 11:30:59 UTC

[arrow-datafusion] branch main updated: Properly project grouping set expressions (#6777)

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a76b09eb0e Properly project grouping set expressions (#6777)
a76b09eb0e is described below

commit a76b09eb0ecd9d4cb6f8eab21c504db3f1f2e81a
Author: Faiaz Sanaulla <10...@users.noreply.github.com>
AuthorDate: Wed Jun 28 13:30:54 2023 +0200

    Properly project grouping set expressions (#6777)
    
    * update version to 26.0.0
    
    * update Cargo.lock
    
    * changelog
    
    * prettier
    
    * update changelog
    
    * VTX-1613: update ignore rule
    
    * VTX-1613: revert
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: handle grouping sets
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: debug
    
    * VTX-1613: cleanup & fix for grouping set
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: cleanup
    
    * VTX-1613: fix import
    
    ---------
    
    Co-authored-by: Andy Grove <an...@gmail.com>
---
 datafusion-cli/Cargo.lock                          |  8 +--
 .../optimizer/src/common_subexpr_eliminate.rs      | 72 ++++++++++++++++++++--
 2 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 93d304ccfe..f46bbaee92 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1788,9 +1788,9 @@ dependencies = [
 
 [[package]]
 name = "ipnet"
-version = "2.7.2"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
+checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
 
 [[package]]
 name = "itertools"
@@ -3386,9 +3386,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
 
 [[package]]
 name = "uuid"
-version = "1.3.4"
+version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81"
+checksum = "d023da39d1fde5a8a3fe1f3e01ca9632ada0a63e9797de55a879d6e2236277be"
 dependencies = [
  "getrandom",
 ]
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 42318d4181..c0b47fbb02 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -273,9 +273,7 @@ impl CommonSubexprEliminate {
 
             let mut proj_exprs = vec![];
             for expr in &new_group_expr {
-                let out_col: Column =
-                    expr.to_field(&new_input_schema)?.qualified_column();
-                proj_exprs.push(Expr::Column(out_col));
+                extract_expressions(expr, &new_input_schema, &mut proj_exprs)?
             }
             for (expr_rewritten, expr_orig) in rewritten.into_iter().zip(new_aggr_expr) {
                 if expr_rewritten == expr_orig {
@@ -488,6 +486,22 @@ fn build_recover_project_plan(schema: &DFSchema, input: LogicalPlan) -> LogicalP
     )
 }
 
+fn extract_expressions(
+    expr: &Expr,
+    schema: &DFSchema,
+    result: &mut Vec<Expr>,
+) -> Result<()> {
+    if let Expr::GroupingSet(groupings) = expr {
+        for e in groupings.distinct_expr() {
+            result.push(Expr::Column(e.to_field(schema)?.qualified_column()))
+        }
+    } else {
+        result.push(Expr::Column(expr.to_field(schema)?.qualified_column()));
+    }
+
+    Ok(())
+}
+
 /// Which type of [expressions](Expr) should be considered for rewriting?
 #[derive(Debug, Clone, Copy)]
 enum ExprMask {
@@ -773,8 +787,8 @@ mod test {
         avg, col, lit, logical_plan::builder::LogicalPlanBuilder, sum,
     };
     use datafusion_expr::{
-        AccumulatorFactoryFunction, AggregateUDF, ReturnTypeFunction, Signature,
-        StateTypeFunction, Volatility,
+        grouping_set, AccumulatorFactoryFunction, AggregateUDF, ReturnTypeFunction,
+        Signature, StateTypeFunction, Volatility,
     };
 
     use crate::optimizer::OptimizerContext;
@@ -1251,4 +1265,52 @@ mod test {
 
         Ok(())
     }
+
+    #[test]
+    fn test_extract_expressions_from_grouping_set() -> Result<()> {
+        let mut result = Vec::with_capacity(3);
+        let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]);
+        let schema = DFSchema::new_with_metadata(
+            vec![
+                DFField::new_unqualified("a", DataType::Int32, false),
+                DFField::new_unqualified("b", DataType::Int32, false),
+                DFField::new_unqualified("c", DataType::Int32, false),
+            ],
+            HashMap::default(),
+        )?;
+        extract_expressions(&grouping, &schema, &mut result)?;
+
+        assert!(result.len() == 3);
+        Ok(())
+    }
+
+    #[test]
+    fn test_extract_expressions_from_grouping_set_with_identical_expr() -> Result<()> {
+        let mut result = Vec::with_capacity(2);
+        let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]);
+        let schema = DFSchema::new_with_metadata(
+            vec![
+                DFField::new_unqualified("a", DataType::Int32, false),
+                DFField::new_unqualified("b", DataType::Int32, false),
+            ],
+            HashMap::default(),
+        )?;
+        extract_expressions(&grouping, &schema, &mut result)?;
+
+        assert!(result.len() == 2);
+        Ok(())
+    }
+
+    #[test]
+    fn test_extract_expressions_from_col() -> Result<()> {
+        let mut result = Vec::with_capacity(1);
+        let schema = DFSchema::new_with_metadata(
+            vec![DFField::new_unqualified("a", DataType::Int32, false)],
+            HashMap::default(),
+        )?;
+        extract_expressions(&col("a"), &schema, &mut result)?;
+
+        assert!(result.len() == 1);
+        Ok(())
+    }
 }