You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/02/26 12:40:40 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5226: add a describe method on DataFrame like Polars

alamb commented on code in PR #5226:
URL: https://github.com/apache/arrow-datafusion/pull/5226#discussion_r1118075835


##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(
+            &self.clone().collect_describe().await.unwrap(),
+        )?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", "min"]; //"count",  "max", "min",
+
+        let fields_iter = self.schema().fields().iter();
+
+        //define describe column
+        let mut describe_schemas = fields_iter
+            .clone()
+            .map(|field| {
+                if field.data_type().is_numeric() {
+                    Field::new(field.name(), DataType::Float64, true)
+                } else {
+                    Field::new(field.name(), DataType::Utf8, true)
+                }
+            })
+            .collect::<Vec<_>>();
+        describe_schemas.insert(0, Field::new("describe", DataType::Utf8, false));
+
+        //collect recordBatch
+        let describe_record_batch = vec![
+            // count aggregation
+            self.clone()
+                .aggregate(
+                    vec![],
+                    fields_iter
+                        .clone()
+                        .map(|f| {
+                            Expr::Alias(
+                                Box::new(datafusion_expr::count(col(f.name()))),
+                                f.name().to_string(),
+                            )

Review Comment:
   I think you can write this more concisely like this (untested)
   
   ```suggestion
                               datafusion_expr::count(col(f.name())).alias(f.name())
   ```



##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(

Review Comment:
   I think a nicer interface would be 
   
   ```rust
   pub async fn describe(self) -> Result<Self> {
   ```
   
   Then we wouldn't need a separate `collect_describe`; instead, callers could reuse the existing `collect()`.
   
   One way might be to return a new `DataFrame` from the created RecordBatch -- perhaps following the model of
   
   https://docs.rs/datafusion/18.0.0/src/datafusion/execution/context.rs.html#719-730



##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(
+            &self.clone().collect_describe().await.unwrap(),
+        )?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", "min"]; //"count",  "max", "min",
+
+        let fields_iter = self.schema().fields().iter();
+
+        //define describe column
+        let mut describe_schemas = fields_iter
+            .clone()
+            .map(|field| {
+                if field.data_type().is_numeric() {
+                    Field::new(field.name(), DataType::Float64, true)
+                } else {
+                    Field::new(field.name(), DataType::Utf8, true)

Review Comment:
   I would expect the schema for `count` and `null_count` to always be `Int64`, and the schema for min/max to always be `Utf8`.



##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(
+            &self.clone().collect_describe().await.unwrap(),
+        )?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", "min"]; //"count",  "max", "min",

Review Comment:
   Is the trailing `//"count",  "max", "min",` comment on this line on purpose?



##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        Ok(print_batches(
+            &self.clone().collect_describe().await.unwrap(),
+        )?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non numeric datatypes. Try in keep output similar to pandas
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", "min"]; //"count",  "max", "min",
+
+        let fields_iter = self.schema().fields().iter();
+
+        //define describe column
+        let mut describe_schemas = fields_iter
+            .clone()
+            .map(|field| {
+                if field.data_type().is_numeric() {
+                    Field::new(field.name(), DataType::Float64, true)
+                } else {
+                    Field::new(field.name(), DataType::Utf8, true)
+                }
+            })
+            .collect::<Vec<_>>();
+        describe_schemas.insert(0, Field::new("describe", DataType::Utf8, false));
+
+        //collect recordBatch
+        let describe_record_batch = vec![
+            // count aggregation
+            self.clone()
+                .aggregate(
+                    vec![],
+                    fields_iter
+                        .clone()
+                        .map(|f| {
+                            Expr::Alias(
+                                Box::new(datafusion_expr::count(col(f.name()))),
+                                f.name().to_string(),
+                            )
+                        })
+                        .collect::<Vec<_>>(),
+                )?
+                .collect()
+                .await?,
+            // null_count aggregation
+            self.clone()
+                .aggregate(
+                    vec![],
+                    fields_iter
+                        .clone()
+                        .map(|f| {
+                            Expr::Alias(
+                                Box::new(datafusion_expr::count(
+                                    datafusion_expr::is_null(col(f.name())),
+                                )),
+                                f.name().to_string(),
+                            )
+                        })
+                        .collect::<Vec<_>>(),
+                )?
+                .collect()
+                .await?,
+            // max aggregation
+            self.clone()
+                .aggregate(
+                    vec![],
+                    fields_iter
+                        .clone()
+                        .filter(|f| matches!(f.data_type().is_numeric(), true))

Review Comment:
   I wonder why the min/max aggregation is restricted to numeric fields?
   
   To make min/max work for all columns, you could use `cast` to convert the results to the same datatype.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org