You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/04 16:01:52 UTC
(arrow-datafusion) branch main updated: docs: add docs and example showing how to get the expression data type (#9118)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 86a2ab0885 docs: add docs and example showing how to get the expression data type (#9118)
86a2ab0885 is described below
commit 86a2ab0885259b32464c8ed88632c5d594e7b665
Author: Niko <ul...@hotmail.com>
AuthorDate: Sun Feb 4 16:01:47 2024 +0000
docs: add docs and example showing how to get the expression data type (#9118)
* add docs showing examples of getting a data type of the logical expression
Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
* fix references in docstrings
Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
* implement minor changes suggested after the code review
Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
---------
Signed-off-by: Nikolay Ulmasov <ul...@hotmail.com>
---
datafusion-examples/examples/expr_api.rs | 46 ++++++++++++++++++++++
datafusion/expr/src/expr_schema.rs | 25 ++++++++++++
.../library-user-guide/working-with-exprs.md | 28 +++++++++++++
3 files changed, 99 insertions(+)
diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 19e70dc419..8079c3e141 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -18,6 +18,7 @@
use arrow::array::{BooleanArray, Int32Array};
use arrow::record_batch::RecordBatch;
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::{DFField, DFSchema};
use datafusion::error::Result;
use datafusion::optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext};
use datafusion::physical_expr::execution_props::ExecutionProps;
@@ -29,6 +30,7 @@ use datafusion_common::{ScalarValue, ToDFSchema};
use datafusion_expr::expr::BinaryExpr;
use datafusion_expr::interval_arithmetic::Interval;
use datafusion_expr::{ColumnarValue, ExprSchemable, Operator};
+use std::collections::HashMap;
use std::sync::Arc;
/// This example demonstrates the DataFusion [`Expr`] API.
@@ -45,6 +47,7 @@ use std::sync::Arc;
/// 2. Evaluate [`Exprs`] against data: [`evaluate_demo`]
/// 3. Simplify expressions: [`simplify_demo`]
/// 4. Analyze predicates for boundary ranges: [`range_analysis_demo`]
+/// 5. Get the types of the expressions: [`expression_type_demo`]
#[tokio::main]
async fn main() -> Result<()> {
// The easiest way to do create expressions is to use the
@@ -68,6 +71,9 @@ async fn main() -> Result<()> {
// See how to analyze ranges in expressions
range_analysis_demo()?;
+ // See how to determine the data types of expressions
+ expression_type_demo()?;
+
Ok(())
}
@@ -256,3 +262,43 @@ pub fn physical_expr(schema: &Schema, expr: Expr) -> Result<Arc<dyn PhysicalExpr
create_physical_expr(&expr, df_schema.as_ref(), &props)
}
+
+/// Shows how to use `Expr::get_type` (provided by the `ExprSchemable` trait)
+/// to retrieve the `DataType` of an expression, given a schema for its inputs
+fn expression_type_demo() -> Result<()> {
+ let expr = col("c");
+
+ // To determine the DataType of an expression, DataFusion must know the
+ // types of the input expressions. You can provide this information using
+ // a schema. In this case we create a schema where the column `c` is of
+ // type Utf8 (a String / VARCHAR)
+ let schema = DFSchema::new_with_metadata(
+ vec![DFField::new_unqualified("c", DataType::Utf8, true)],
+ HashMap::new(),
+ )
+ .unwrap();
+ assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap()));
+
+ // With a schema where the column `c` is Int32, the same expression is Int32
+ let schema = DFSchema::new_with_metadata(
+ vec![DFField::new_unqualified("c", DataType::Int32, true)],
+ HashMap::new(),
+ )
+ .unwrap();
+ assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap()));
+
+ // Get the type of an expression that adds 2 columns. Adding an Int32
+ // and Float32 results in Float32 type
+ let expr = col("c1") + col("c2");
+ let schema = DFSchema::new_with_metadata(
+ vec![
+ DFField::new_unqualified("c1", DataType::Int32, true),
+ DFField::new_unqualified("c2", DataType::Float32, true),
+ ],
+ HashMap::new(),
+ )
+ .unwrap();
+ assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
+
+ Ok(())
+}
diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
index d30f304a26..517d7a35f7 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -58,6 +58,31 @@ impl ExprSchemable for Expr {
///
/// Note: [DFSchema] implements [ExprSchema].
///
+ /// # Examples
+ ///
+ /// ## Get the type of an expression that adds 2 columns
+ /// Adding an Int32 and Float32 results in Float32 type.
+ ///
+ /// ```
+ /// # use arrow::datatypes::DataType;
+ /// # use datafusion_common::{DFField, DFSchema};
+ /// # use datafusion_expr::{col, ExprSchemable};
+ /// # use std::collections::HashMap;
+ ///
+ /// fn main() {
+ /// let expr = col("c1") + col("c2");
+ /// let schema = DFSchema::new_with_metadata(
+ /// vec![
+ /// DFField::new_unqualified("c1", DataType::Int32, true),
+ /// DFField::new_unqualified("c2", DataType::Float32, true),
+ /// ],
+ /// HashMap::new(),
+ /// )
+ /// .unwrap();
+ /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));
+ /// }
+ /// ```
+ ///
/// # Errors
///
/// This function errors when it is not possible to compute its
diff --git a/docs/source/library-user-guide/working-with-exprs.md b/docs/source/library-user-guide/working-with-exprs.md
index 96be8ef7f1..b7e9248a7c 100644
--- a/docs/source/library-user-guide/working-with-exprs.md
+++ b/docs/source/library-user-guide/working-with-exprs.md
@@ -180,6 +180,34 @@ Projection: Int64(1) + Int64(1) AS added_one
I.e. the `add_one` UDF has been inlined into the projection.
+## Getting the data type of the expression
+
+The `arrow::datatypes::DataType` of the expression can be obtained by calling `get_type` (from the `ExprSchemable` trait) with something that implements `ExprSchema`, for example a `DFSchema` object:
+
+```rust
+use arrow_schema::DataType;
+use datafusion::common::{DFField, DFSchema};
+use datafusion::logical_expr::{col, ExprSchemable};
+use std::collections::HashMap;
+
+let expr = col("c1") + col("c2");
+let schema = DFSchema::new_with_metadata(
+ vec![
+ DFField::new_unqualified("c1", DataType::Int32, true),
+ DFField::new_unqualified("c2", DataType::Float32, true),
+ ],
+ HashMap::new(),
+)
+.unwrap();
+print!("type = {}", expr.get_type(&schema).unwrap());
+```
+
+This results in the following output:
+
+```text
+type = Float32
+```
+
## Conclusion
In this guide, we've seen how to create `Expr`s programmatically and how to rewrite them. This is useful for simplifying and optimizing `Expr`s. We've also seen how to test our rule to ensure it works properly.