You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2022/04/21 20:32:42 UTC

[arrow-datafusion] branch master updated: Move case/when expressions to expr crate (#2311)

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c51a4933 Move case/when expressions to expr crate (#2311)
0c51a4933 is described below

commit 0c51a493382f81feedb61a999d8e246b8d3bbe62
Author: Andy Grove <ag...@apache.org>
AuthorDate: Thu Apr 21 14:32:38 2022 -0600

    Move case/when expressions to expr crate (#2311)
---
 datafusion/core/src/logical_plan/builder.rs        |   2 +-
 datafusion/core/src/logical_plan/expr.rs           | 161 +--------------------
 datafusion/core/src/logical_plan/mod.rs            |   3 +-
 datafusion/expr/Cargo.toml                         |   4 +-
 datafusion/expr/README.md                          |   2 +-
 datafusion/expr/src/conditional_expressions.rs     | 107 ++++++++++++++
 datafusion/expr/src/expr_fn.rs                     |  13 +-
 .../src/logical_plan => expr/src}/expr_schema.rs   |  67 ++++++++-
 datafusion/expr/src/lib.rs                         |  17 ++-
 9 files changed, 203 insertions(+), 173 deletions(-)

diff --git a/datafusion/core/src/logical_plan/builder.rs b/datafusion/core/src/logical_plan/builder.rs
index e417b0dc1..6b08b5beb 100644
--- a/datafusion/core/src/logical_plan/builder.rs
+++ b/datafusion/core/src/logical_plan/builder.rs
@@ -23,7 +23,7 @@ use crate::datasource::{
     MemTable, TableProvider,
 };
 use crate::error::{DataFusionError, Result};
-use crate::logical_plan::expr_schema::ExprSchemable;
+use crate::logical_expr::ExprSchemable;
 use crate::logical_plan::plan::{
     Aggregate, Analyze, EmptyRelation, Explain, Filter, Join, Projection, Sort,
     SubqueryAlias, TableScan, ToStringifiedPlan, Union, Window,
diff --git a/datafusion/core/src/logical_plan/expr.rs b/datafusion/core/src/logical_plan/expr.rs
index 2c9113696..8935170cf 100644
--- a/datafusion/core/src/logical_plan/expr.rs
+++ b/datafusion/core/src/logical_plan/expr.rs
@@ -23,7 +23,6 @@ use crate::error::Result;
 use crate::logical_plan::ExprSchemable;
 use crate::logical_plan::{DFField, DFSchema};
 use arrow::datatypes::DataType;
-use datafusion_common::DataFusionError;
 pub use datafusion_common::{Column, ExprSchema};
 pub use datafusion_expr::expr_fn::*;
 use datafusion_expr::AccumulatorFunctionImplementation;
@@ -35,97 +34,8 @@ use datafusion_expr::{AggregateUDF, ScalarUDF};
 use datafusion_expr::{
     ReturnTypeFunction, ScalarFunctionImplementation, Signature, Volatility,
 };
-use std::collections::HashSet;
 use std::sync::Arc;
 
-/// Helper struct for building [Expr::Case]
-pub struct CaseBuilder {
-    expr: Option<Box<Expr>>,
-    when_expr: Vec<Expr>,
-    then_expr: Vec<Expr>,
-    else_expr: Option<Box<Expr>>,
-}
-
-impl CaseBuilder {
-    pub fn when(&mut self, when: Expr, then: Expr) -> CaseBuilder {
-        self.when_expr.push(when);
-        self.then_expr.push(then);
-        CaseBuilder {
-            expr: self.expr.clone(),
-            when_expr: self.when_expr.clone(),
-            then_expr: self.then_expr.clone(),
-            else_expr: self.else_expr.clone(),
-        }
-    }
-    pub fn otherwise(&mut self, else_expr: Expr) -> Result<Expr> {
-        self.else_expr = Some(Box::new(else_expr));
-        self.build()
-    }
-
-    pub fn end(&self) -> Result<Expr> {
-        self.build()
-    }
-
-    fn build(&self) -> Result<Expr> {
-        // collect all "then" expressions
-        let mut then_expr = self.then_expr.clone();
-        if let Some(e) = &self.else_expr {
-            then_expr.push(e.as_ref().to_owned());
-        }
-
-        let then_types: Vec<DataType> = then_expr
-            .iter()
-            .map(|e| match e {
-                Expr::Literal(_) => e.get_type(&DFSchema::empty()),
-                _ => Ok(DataType::Null),
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        if then_types.contains(&DataType::Null) {
-            // cannot verify types until execution type
-        } else {
-            let unique_types: HashSet<&DataType> = then_types.iter().collect();
-            if unique_types.len() != 1 {
-                return Err(DataFusionError::Plan(format!(
-                    "CASE expression 'then' values had multiple data types: {:?}",
-                    unique_types
-                )));
-            }
-        }
-
-        Ok(Expr::Case {
-            expr: self.expr.clone(),
-            when_then_expr: self
-                .when_expr
-                .iter()
-                .zip(self.then_expr.iter())
-                .map(|(w, t)| (Box::new(w.clone()), Box::new(t.clone())))
-                .collect(),
-            else_expr: self.else_expr.clone(),
-        })
-    }
-}
-
-/// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression.
-pub fn case(expr: Expr) -> CaseBuilder {
-    CaseBuilder {
-        expr: Some(Box::new(expr)),
-        when_expr: vec![],
-        then_expr: vec![],
-        else_expr: None,
-    }
-}
-
-/// Create a CASE WHEN statement with boolean WHEN expressions and no base expression.
-pub fn when(when: Expr, then: Expr) -> CaseBuilder {
-    CaseBuilder {
-        expr: None,
-        when_expr: vec![when],
-        then_expr: vec![then],
-        else_expr: None,
-    }
-}
-
 /// Combines an array of filter expressions into a single filter expression
 /// consisting of the input filter expressions joined with logical AND.
 /// Returns None if the filters array is empty.
@@ -248,26 +158,10 @@ pub fn call_fn(name: impl AsRef<str>, args: Vec<Expr>) -> Result<Expr> {
 
 #[cfg(test)]
 mod tests {
-    use super::super::{col, lit, when};
+    use super::super::{col, lit};
     use super::*;
     use datafusion_expr::expr_fn::binary_expr;
 
-    #[test]
-    fn case_when_same_literal_then_types() -> Result<()> {
-        let _ = when(col("state").eq(lit("CO")), lit(303))
-            .when(col("state").eq(lit("NY")), lit(212))
-            .end()?;
-        Ok(())
-    }
-
-    #[test]
-    fn case_when_different_literal_then_types() {
-        let maybe_expr = when(col("state").eq(lit("CO")), lit(303))
-            .when(col("state").eq(lit("NY")), lit("212"))
-            .end();
-        assert!(maybe_expr.is_err());
-    }
-
     #[test]
     fn digest_function_definitions() {
         if let Expr::ScalarFunction { fun, args } = digest(col("tableA.a"), lit("md5")) {
@@ -301,57 +195,4 @@ mod tests {
             combine_filters(&[filter1.clone(), filter2.clone(), filter3.clone()]);
         assert_eq!(result, Some(and(and(filter1, filter2), filter3)));
     }
-
-    #[test]
-    fn expr_schema_nullability() {
-        let expr = col("foo").eq(lit(1));
-        assert!(!expr.nullable(&MockExprSchema::new()).unwrap());
-        assert!(expr
-            .nullable(&MockExprSchema::new().with_nullable(true))
-            .unwrap());
-    }
-
-    #[test]
-    fn expr_schema_data_type() {
-        let expr = col("foo");
-        assert_eq!(
-            DataType::Utf8,
-            expr.get_type(&MockExprSchema::new().with_data_type(DataType::Utf8))
-                .unwrap()
-        );
-    }
-
-    struct MockExprSchema {
-        nullable: bool,
-        data_type: DataType,
-    }
-
-    impl MockExprSchema {
-        fn new() -> Self {
-            Self {
-                nullable: false,
-                data_type: DataType::Null,
-            }
-        }
-
-        fn with_nullable(mut self, nullable: bool) -> Self {
-            self.nullable = nullable;
-            self
-        }
-
-        fn with_data_type(mut self, data_type: DataType) -> Self {
-            self.data_type = data_type;
-            self
-        }
-    }
-
-    impl ExprSchema for MockExprSchema {
-        fn nullable(&self, _col: &Column) -> Result<bool> {
-            Ok(self.nullable)
-        }
-
-        fn data_type(&self, _col: &Column) -> Result<&DataType> {
-            Ok(&self.data_type)
-        }
-    }
 }
diff --git a/datafusion/core/src/logical_plan/mod.rs b/datafusion/core/src/logical_plan/mod.rs
index 5ce0ff911..488a2b954 100644
--- a/datafusion/core/src/logical_plan/mod.rs
+++ b/datafusion/core/src/logical_plan/mod.rs
@@ -25,7 +25,6 @@ pub(crate) mod builder;
 mod dfschema;
 mod expr;
 mod expr_rewriter;
-mod expr_schema;
 mod expr_simplier;
 mod expr_visitor;
 mod operators;
@@ -38,6 +37,7 @@ pub use builder::{
 pub use datafusion_expr::expr_fn::binary_expr;
 pub use dfschema::{DFField, DFSchema, DFSchemaRef, ToDFSchema};
 
+pub use crate::logical_expr::ExprSchemable;
 pub use expr::{
     abs, acos, and, approx_distinct, approx_percentile_cont, array, ascii, asin, atan,
     avg, bit_length, btrim, call_fn, case, ceil, character_length, chr, coalesce, col,
@@ -55,7 +55,6 @@ pub use expr_rewriter::{
     normalize_col, normalize_cols, replace_col, rewrite_sort_cols_by_aggs,
     unnormalize_col, unnormalize_cols, ExprRewritable, ExprRewriter, RewriteRecursion,
 };
-pub use expr_schema::ExprSchemable;
 pub use expr_simplier::{ExprSimplifiable, SimplifyInfo};
 pub use expr_visitor::{ExprVisitable, ExpressionVisitor, Recursion};
 pub use operators::Operator;
diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml
index 7459490ef..4095d4ebc 100644
--- a/datafusion/expr/Cargo.toml
+++ b/datafusion/expr/Cargo.toml
@@ -17,14 +17,14 @@
 
 [package]
 name = "datafusion-expr"
-description = "Logical expression representation for DataFusion query engine"
+description = "Logical plan and expression representation for DataFusion query engine"
 version = "7.0.0"
 homepage = "https://github.com/apache/arrow-datafusion"
 repository = "https://github.com/apache/arrow-datafusion"
 readme = "../README.md"
 authors = ["Apache Arrow <de...@arrow.apache.org>"]
 license = "Apache-2.0"
-keywords = [ "arrow", "query", "sql" ]
+keywords = [ "datafusion", "logical", "plan", "expressions" ]
 edition = "2021"
 rust-version = "1.59"
 
diff --git a/datafusion/expr/README.md b/datafusion/expr/README.md
index 25ac79c22..6ce82347c 100644
--- a/datafusion/expr/README.md
+++ b/datafusion/expr/README.md
@@ -17,7 +17,7 @@
   under the License.
 -->
 
-# DataFusion Expr
+# DataFusion Logical Plan and Expressions
 
 This is an internal module for fundamental expression types of [DataFusion][df].
 
diff --git a/datafusion/expr/src/conditional_expressions.rs b/datafusion/expr/src/conditional_expressions.rs
index bafa2c724..0c5104a4b 100644
--- a/datafusion/expr/src/conditional_expressions.rs
+++ b/datafusion/expr/src/conditional_expressions.rs
@@ -15,7 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+///! Conditional expressions
+use crate::{expr_schema::ExprSchemable, Expr};
 use arrow::datatypes::DataType;
+use datafusion_common::{DFSchema, DataFusionError, Result};
+use std::collections::HashSet;
 
 /// Currently supported types by the coalesce function.
 /// The order of these types correspond to the order on which coercion applies
@@ -35,3 +39,106 @@ pub static SUPPORTED_COALESCE_TYPES: &[DataType] = &[
     DataType::Utf8,
     DataType::LargeUtf8,
 ];
+
+/// Helper struct for building [Expr::Case]
+pub struct CaseBuilder {
+    expr: Option<Box<Expr>>,
+    when_expr: Vec<Expr>,
+    then_expr: Vec<Expr>,
+    else_expr: Option<Box<Expr>>,
+}
+
+impl CaseBuilder {
+    pub fn new(
+        expr: Option<Box<Expr>>,
+        when_expr: Vec<Expr>,
+        then_expr: Vec<Expr>,
+        else_expr: Option<Box<Expr>>,
+    ) -> Self {
+        Self {
+            expr,
+            when_expr,
+            then_expr,
+            else_expr,
+        }
+    }
+    pub fn when(&mut self, when: Expr, then: Expr) -> CaseBuilder {
+        self.when_expr.push(when);
+        self.then_expr.push(then);
+        CaseBuilder {
+            expr: self.expr.clone(),
+            when_expr: self.when_expr.clone(),
+            then_expr: self.then_expr.clone(),
+            else_expr: self.else_expr.clone(),
+        }
+    }
+    pub fn otherwise(&mut self, else_expr: Expr) -> Result<Expr> {
+        self.else_expr = Some(Box::new(else_expr));
+        self.build()
+    }
+
+    pub fn end(&self) -> Result<Expr> {
+        self.build()
+    }
+
+    fn build(&self) -> Result<Expr> {
+        // collect all "then" expressions
+        let mut then_expr = self.then_expr.clone();
+        if let Some(e) = &self.else_expr {
+            then_expr.push(e.as_ref().to_owned());
+        }
+
+        let then_types: Vec<DataType> = then_expr
+            .iter()
+            .map(|e| match e {
+                Expr::Literal(_) => e.get_type(&DFSchema::empty()),
+                _ => Ok(DataType::Null),
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        if then_types.contains(&DataType::Null) {
+            // cannot verify types until execution type
+        } else {
+            let unique_types: HashSet<&DataType> = then_types.iter().collect();
+            if unique_types.len() != 1 {
+                return Err(DataFusionError::Plan(format!(
+                    "CASE expression 'then' values had multiple data types: {:?}",
+                    unique_types
+                )));
+            }
+        }
+
+        Ok(Expr::Case {
+            expr: self.expr.clone(),
+            when_then_expr: self
+                .when_expr
+                .iter()
+                .zip(self.then_expr.iter())
+                .map(|(w, t)| (Box::new(w.clone()), Box::new(t.clone())))
+                .collect(),
+            else_expr: self.else_expr.clone(),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{col, lit, when};
+
+    #[test]
+    fn case_when_same_literal_then_types() -> Result<()> {
+        let _ = when(col("state").eq(lit("CO")), lit(303))
+            .when(col("state").eq(lit("NY")), lit(212))
+            .end()?;
+        Ok(())
+    }
+
+    #[test]
+    fn case_when_different_literal_then_types() {
+        let maybe_expr = when(col("state").eq(lit("CO")), lit(303))
+            .when(col("state").eq(lit("NY")), lit("212"))
+            .end();
+        assert!(maybe_expr.is_err());
+    }
+}
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index d9cd6a552..a723f5306 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -15,8 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Expr fn module contains the functional definitions for expressions.
+//! Functions for creating logical expressions
 
+use crate::conditional_expressions::CaseBuilder;
 use crate::{aggregate_function, built_in_function, lit, Expr, Operator};
 
 /// Create a column expression based on a qualified or unqualified column name
@@ -306,6 +307,16 @@ pub fn coalesce(args: Vec<Expr>) -> Expr {
     }
 }
 
+/// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression.
+pub fn case(expr: Expr) -> CaseBuilder {
+    CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None)
+}
+
+/// Create a CASE WHEN statement with boolean WHEN expressions and no base expression.
+pub fn when(when: Expr, then: Expr) -> CaseBuilder {
+    CaseBuilder::new(None, vec![when], vec![then], None)
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
diff --git a/datafusion/core/src/logical_plan/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
similarity index 85%
rename from datafusion/core/src/logical_plan/expr_schema.rs
rename to datafusion/expr/src/expr_schema.rs
index e0cef2979..a216281b3 100644
--- a/datafusion/core/src/logical_plan/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -16,12 +16,12 @@
 // under the License.
 
 use super::Expr;
-use crate::logical_expr::{aggregate_function, function, window_function};
+use crate::binary_rule::binary_operator_data_type;
+use crate::field_util::get_indexed_field;
+use crate::{aggregate_function, function, window_function};
 use arrow::compute::can_cast_types;
 use arrow::datatypes::DataType;
 use datafusion_common::{DFField, DFSchema, DataFusionError, ExprSchema, Result};
-use datafusion_expr::binary_rule::binary_operator_data_type;
-use datafusion_expr::field_util::get_indexed_field;
 
 /// trait to allow expr to typable with respect to a schema
 pub trait ExprSchemable {
@@ -236,3 +236,64 @@ impl ExprSchemable for Expr {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{col, lit};
+    use arrow::datatypes::DataType;
+    use datafusion_common::Column;
+
+    #[test]
+    fn expr_schema_nullability() {
+        let expr = col("foo").eq(lit(1));
+        assert!(!expr.nullable(&MockExprSchema::new()).unwrap());
+        assert!(expr
+            .nullable(&MockExprSchema::new().with_nullable(true))
+            .unwrap());
+    }
+
+    #[test]
+    fn expr_schema_data_type() {
+        let expr = col("foo");
+        assert_eq!(
+            DataType::Utf8,
+            expr.get_type(&MockExprSchema::new().with_data_type(DataType::Utf8))
+                .unwrap()
+        );
+    }
+
+    struct MockExprSchema {
+        nullable: bool,
+        data_type: DataType,
+    }
+
+    impl MockExprSchema {
+        fn new() -> Self {
+            Self {
+                nullable: false,
+                data_type: DataType::Null,
+            }
+        }
+
+        fn with_nullable(mut self, nullable: bool) -> Self {
+            self.nullable = nullable;
+            self
+        }
+
+        fn with_data_type(mut self, data_type: DataType) -> Self {
+            self.data_type = data_type;
+            self
+        }
+    }
+
+    impl ExprSchema for MockExprSchema {
+        fn nullable(&self, _col: &Column) -> Result<bool> {
+            Ok(self.nullable)
+        }
+
+        fn data_type(&self, _col: &Column) -> Result<&DataType> {
+            Ok(&self.data_type)
+        }
+    }
+}
diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs
index 7586dbf60..b1e822077 100644
--- a/datafusion/expr/src/lib.rs
+++ b/datafusion/expr/src/lib.rs
@@ -24,6 +24,7 @@ mod columnar_value;
 pub mod conditional_expressions;
 pub mod expr;
 pub mod expr_fn;
+pub mod expr_schema;
 pub mod field_util;
 pub mod function;
 mod literal;
@@ -43,12 +44,24 @@ pub use aggregate_function::AggregateFunction;
 pub use built_in_function::BuiltinScalarFunction;
 pub use columnar_value::{ColumnarValue, NullColumnarValue};
 pub use expr::Expr;
-pub use expr_fn::{col, sum};
+pub use expr_fn::{
+    abs, acos, and, approx_distinct, approx_percentile_cont, array, ascii, asin, atan,
+    avg, bit_length, btrim, case, ceil, character_length, chr, coalesce, col, concat,
+    concat_expr, concat_ws, concat_ws_expr, cos, count, count_distinct, date_part,
+    date_trunc, digest, exp, floor, in_list, initcap, left, length, ln, log10, log2,
+    lower, lpad, ltrim, max, md5, min, now, now_expr, nullif, octet_length, or, random,
+    regexp_match, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim,
+    sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos,
+    substr, sum, tan, to_hex, to_timestamp_micros, to_timestamp_millis,
+    to_timestamp_seconds, translate, trim, trunc, upper, when,
+};
+pub use expr_schema::ExprSchemable;
 pub use function::{
     AccumulatorFunctionImplementation, ReturnTypeFunction, ScalarFunctionImplementation,
     StateTypeFunction,
 };
 pub use literal::{lit, lit_timestamp_nano, Literal, TimestampLiteral};
+pub use logical_plan::{LogicalPlan, PlanVisitor};
 pub use nullif::SUPPORTED_NULLIF_TYPES;
 pub use operator::Operator;
 pub use signature::{Signature, TypeSignature, Volatility};
@@ -57,5 +70,3 @@ pub use udaf::AggregateUDF;
 pub use udf::ScalarUDF;
 pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits};
 pub use window_function::{BuiltInWindowFunction, WindowFunction};
-
-pub use logical_plan::{LogicalPlan, PlanVisitor};