You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/04 16:01:52 UTC

(arrow-datafusion) branch main updated: docs: add docs and example showing how to get the expression data type (#9118)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 86a2ab0885 docs: add docs and example showing how to get the expression data type (#9118)
86a2ab0885 is described below

commit 86a2ab0885259b32464c8ed88632c5d594e7b665
Author: Niko <ul...@hotmail.com>
AuthorDate: Sun Feb 4 16:01:47 2024 +0000

    docs: add docs and example showing how to get the expression data type (#9118)
    
    * add docs showing examples of getting a data type of the logical expression
    
    Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
    
    * fix references in docstrings
    
    Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
    
    * implement minor changes suggested after the code review
    
    Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
    
    ---------
    
    Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
---
 datafusion-examples/examples/expr_api.rs           | 46 ++++++++++++++++++++++
 datafusion/expr/src/expr_schema.rs                 | 25 ++++++++++++
 .../library-user-guide/working-with-exprs.md       | 28 +++++++++++++
 3 files changed, 99 insertions(+)

diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 19e70dc419..8079c3e141 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -18,6 +18,7 @@
 use arrow::array::{BooleanArray, Int32Array};
 use arrow::record_batch::RecordBatch;
 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::{DFField, DFSchema};
 use datafusion::error::Result;
 use datafusion::optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext};
 use datafusion::physical_expr::execution_props::ExecutionProps;
@@ -29,6 +30,7 @@ use datafusion_common::{ScalarValue, ToDFSchema};
 use datafusion_expr::expr::BinaryExpr;
 use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::{ColumnarValue, ExprSchemable, Operator};
+use std::collections::HashMap;
 use std::sync::Arc;
 
 /// This example demonstrates the DataFusion [`Expr`] API.
@@ -45,6 +47,7 @@ use std::sync::Arc;
 /// 2. Evaluate [`Exprs`] against data: [`evaluate_demo`]
 /// 3. Simplify expressions: [`simplify_demo`]
 /// 4. Analyze predicates for boundary ranges: [`range_analysis_demo`]
+/// 5. Get the types of the expressions: [`expression_type_demo`]
 #[tokio::main]
 async fn main() -> Result<()> {
     // The easiest way to do create expressions is to use the
@@ -68,6 +71,9 @@ async fn main() -> Result<()> {
     // See how to analyze ranges in expressions
     range_analysis_demo()?;
 
+    // See how to determine the data types of expressions
+    expression_type_demo()?;
+
     Ok(())
 }
 
@@ -256,3 +262,43 @@ pub fn physical_expr(schema: &Schema, expr: Expr) -> Result<Arc<dyn PhysicalExpr
 
     create_physical_expr(&expr, df_schema.as_ref(), &props)
 }
+
+/// This function shows how to use [`ExprSchemable::get_type`] to retrieve the
+/// [`DataType`] of an expression
+fn expression_type_demo() -> Result<()> {
+    let expr = col("c");
+
+    // To determine the DataType of an expression, DataFusion must know the
+    // types of the input expressions. You can provide this information using
+    // a schema. In this case we create a schema where the column `c` is of
+    // type Utf8 (a String / VARCHAR)
+    let schema = DFSchema::new_with_metadata(
+        vec![DFField::new_unqualified("c", DataType::Utf8, true)],
+        HashMap::new(),
+    )
+    .unwrap();
+    assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap()));
+
+    // Using a schema where the column `c` is of type Int32
+    let schema = DFSchema::new_with_metadata(
+        vec![DFField::new_unqualified("c", DataType::Int32, true)],
+        HashMap::new(),
+    )
+    .unwrap();
+    assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap()));
+
+    // Get the type of an expression that adds 2 columns. Adding an Int32
+    // column and a Float32 column results in a Float32 type
+    let expr = col("c1") + col("c2");
+    let schema = DFSchema::new_with_metadata(
+        vec![
+            DFField::new_unqualified("c1", DataType::Int32, true),
+            DFField::new_unqualified("c2", DataType::Float32, true),
+        ],
+        HashMap::new(),
+    )
+    .unwrap();
+    assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
+
+    Ok(())
+}
diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
index d30f304a26..517d7a35f7 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -58,6 +58,31 @@ impl ExprSchemable for Expr {
     ///
     /// Note: [DFSchema] implements [ExprSchema].
     ///
+    /// # Examples
+    ///
+    /// ## Get the type of an expression that adds 2 columns
+    /// Adding an Int32 column and a Float32 column results in a Float32 type.
+    ///
+    /// ```
+    /// # use arrow::datatypes::DataType;
+    /// # use datafusion_common::{DFField, DFSchema};
+    /// # use datafusion_expr::{col, ExprSchemable};
+    /// # use std::collections::HashMap;
+    ///
+    /// fn main() {
+    ///   let expr = col("c1") + col("c2");
+    ///   let schema = DFSchema::new_with_metadata(
+    ///     vec![
+    ///       DFField::new_unqualified("c1", DataType::Int32, true),
+    ///       DFField::new_unqualified("c2", DataType::Float32, true),
+    ///       ],
+    ///       HashMap::new(),
+    ///   )
+    ///   .unwrap();
+    ///   assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
+    /// }
+    /// ```
+    ///
     /// # Errors
     ///
     /// This function errors when it is not possible to compute its
diff --git a/docs/source/library-user-guide/working-with-exprs.md b/docs/source/library-user-guide/working-with-exprs.md
index 96be8ef7f1..b7e9248a7c 100644
--- a/docs/source/library-user-guide/working-with-exprs.md
+++ b/docs/source/library-user-guide/working-with-exprs.md
@@ -180,6 +180,34 @@ Projection: Int64(1) + Int64(1) AS added_one
 
 I.e. the `add_one` UDF has been inlined into the projection.
 
+## Getting the data type of the expression
+
+The `arrow::datatypes::DataType` of an expression can be obtained by calling `get_type` (provided by the `ExprSchemable` trait) with something that implements `ExprSchema`, for example a `DFSchema` object:
+
+```rust
+use arrow_schema::DataType;
+use datafusion::common::{DFField, DFSchema};
+use datafusion::logical_expr::{col, ExprSchemable};
+use std::collections::HashMap;
+
+let expr = col("c1") + col("c2");
+let schema = DFSchema::new_with_metadata(
+    vec![
+        DFField::new_unqualified("c1", DataType::Int32, true),
+        DFField::new_unqualified("c2", DataType::Float32, true),
+    ],
+    HashMap::new(),
+)
+.unwrap();
+print!("type = {}", expr.get_type(&schema).unwrap());
+```
+
+This results in the following output:
+
+```text
+type = Float32
+```
+
 ## Conclusion
 
 In this guide, we've seen how to create `Expr`s programmatically and how to rewrite them. This is useful for simplifying and optimizing `Expr`s. We've also seen how to test our rule to ensure it works properly.