You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2023/06/28 11:30:59 UTC
[arrow-datafusion] branch main updated: Properly project grouping set expressions (#6777)
This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new a76b09eb0e Properly project grouping set expressions (#6777)
a76b09eb0e is described below
commit a76b09eb0ecd9d4cb6f8eab21c504db3f1f2e81a
Author: Faiaz Sanaulla <10...@users.noreply.github.com>
AuthorDate: Wed Jun 28 13:30:54 2023 +0200
Properly project grouping set expressions (#6777)
* update version to 26.0.0
* update Cargo.lock
* changelog
* prettier
* update changelog
* VTX-1613: update ignore rule
* VTX-1613: revert
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: handle grouping sets
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: debug
* VTX-1613: cleanup & fix for grouping set
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: cleanup
* VTX-1613: fix import
---------
Co-authored-by: Andy Grove <an...@gmail.com>
---
datafusion-cli/Cargo.lock | 8 +--
.../optimizer/src/common_subexpr_eliminate.rs | 72 ++++++++++++++++++++--
2 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 93d304ccfe..f46bbaee92 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1788,9 +1788,9 @@ dependencies = [
[[package]]
name = "ipnet"
-version = "2.7.2"
+version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
+checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
[[package]]
name = "itertools"
@@ -3386,9 +3386,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "uuid"
-version = "1.3.4"
+version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81"
+checksum = "d023da39d1fde5a8a3fe1f3e01ca9632ada0a63e9797de55a879d6e2236277be"
dependencies = [
"getrandom",
]
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 42318d4181..c0b47fbb02 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -273,9 +273,7 @@ impl CommonSubexprEliminate {
let mut proj_exprs = vec![];
for expr in &new_group_expr {
- let out_col: Column =
- expr.to_field(&new_input_schema)?.qualified_column();
- proj_exprs.push(Expr::Column(out_col));
+ extract_expressions(expr, &new_input_schema, &mut proj_exprs)?
}
for (expr_rewritten, expr_orig) in rewritten.into_iter().zip(new_aggr_expr) {
if expr_rewritten == expr_orig {
@@ -488,6 +486,22 @@ fn build_recover_project_plan(schema: &DFSchema, input: LogicalPlan) -> LogicalP
)
}
+fn extract_expressions(
+ expr: &Expr,
+ schema: &DFSchema,
+ result: &mut Vec<Expr>,
+) -> Result<()> {
+ if let Expr::GroupingSet(groupings) = expr {
+ for e in groupings.distinct_expr() {
+ result.push(Expr::Column(e.to_field(schema)?.qualified_column()))
+ }
+ } else {
+ result.push(Expr::Column(expr.to_field(schema)?.qualified_column()));
+ }
+
+ Ok(())
+}
+
/// Which type of [expressions](Expr) should be considered for rewriting?
#[derive(Debug, Clone, Copy)]
enum ExprMask {
@@ -773,8 +787,8 @@ mod test {
avg, col, lit, logical_plan::builder::LogicalPlanBuilder, sum,
};
use datafusion_expr::{
- AccumulatorFactoryFunction, AggregateUDF, ReturnTypeFunction, Signature,
- StateTypeFunction, Volatility,
+ grouping_set, AccumulatorFactoryFunction, AggregateUDF, ReturnTypeFunction,
+ Signature, StateTypeFunction, Volatility,
};
use crate::optimizer::OptimizerContext;
@@ -1251,4 +1265,52 @@ mod test {
Ok(())
}
+
+ #[test]
+ fn test_extract_expressions_from_grouping_set() -> Result<()> {
+ let mut result = Vec::with_capacity(3);
+ let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]);
+ let schema = DFSchema::new_with_metadata(
+ vec![
+ DFField::new_unqualified("a", DataType::Int32, false),
+ DFField::new_unqualified("b", DataType::Int32, false),
+ DFField::new_unqualified("c", DataType::Int32, false),
+ ],
+ HashMap::default(),
+ )?;
+ extract_expressions(&grouping, &schema, &mut result)?;
+
+ assert!(result.len() == 3);
+ Ok(())
+ }
+
+ #[test]
+ fn test_extract_expressions_from_grouping_set_with_identical_expr() -> Result<()> {
+ let mut result = Vec::with_capacity(2);
+ let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]);
+ let schema = DFSchema::new_with_metadata(
+ vec![
+ DFField::new_unqualified("a", DataType::Int32, false),
+ DFField::new_unqualified("b", DataType::Int32, false),
+ ],
+ HashMap::default(),
+ )?;
+ extract_expressions(&grouping, &schema, &mut result)?;
+
+ assert!(result.len() == 2);
+ Ok(())
+ }
+
+ #[test]
+ fn test_extract_expressions_from_col() -> Result<()> {
+ let mut result = Vec::with_capacity(1);
+ let schema = DFSchema::new_with_metadata(
+ vec![DFField::new_unqualified("a", DataType::Int32, false)],
+ HashMap::default(),
+ )?;
+ extract_expressions(&col("a"), &schema, &mut result)?;
+
+ assert!(result.len() == 1);
+ Ok(())
+ }
}