You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "jiangzhx (via GitHub)" <gi...@apache.org> on 2023/02/27 03:07:42 UTC

[GitHub] [arrow-datafusion] jiangzhx commented on a diff in pull request #5226: add a describe method on DataFrame like Polars

jiangzhx commented on code in PR #5226:
URL: https://github.com/apache/arrow-datafusion/pull/5226#discussion_r1118239192


##########
datafusion/core/src/dataframe.rs:
##########
@@ -302,6 +306,177 @@ impl DataFrame {
         ))
     }
 
+    /// Prints summary statistics for this DataFrame.
+    ///
+    /// Only numeric datatypes are summarized at the moment; nulls are returned for
+    /// non-numeric datatypes. The output tries to stay similar to pandas.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if computing the statistics via `collect_describe` fails or
+    /// if the resulting batches cannot be printed.
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;
+    /// df.describe().await?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn describe(self) -> Result<()> {
+        // `self` is owned and not used afterwards, so no clone is needed; failures are
+        // propagated to the caller with `?` instead of panicking via `unwrap()`.
+        let batches = self.collect_describe().await?;
+        // `Ok(..?)` keeps the original error conversion from the printing call.
+        Ok(print_batches(&batches)?)
+    }
+
+    /// Summary statistics for a DataFrame. Only summarizes numeric datatypes at the moment and
+    /// returns nulls for non-numeric datatypes. Tries to keep the output similar to pandas.
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # use arrow::util::pretty;
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<()> {
+    /// let ctx = SessionContext::new();
+    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;    
+    /// df.collect_describe().await.unwrap();
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn collect_describe(self) -> Result<Vec<RecordBatch>> {
+        //the functions now supported
+        let supported_describe_functions = vec!["count", "null_count", "max", "min"]; //"count",  "max", "min",
+
+        let fields_iter = self.schema().fields().iter();
+
+        //define describe column
+        let mut describe_schemas = fields_iter
+            .clone()
+            .map(|field| {
+                if field.data_type().is_numeric() {
+                    Field::new(field.name(), DataType::Float64, true)
+                } else {
+                    Field::new(field.name(), DataType::Utf8, true)
+                }
+            })
+            .collect::<Vec<_>>();
+        describe_schemas.insert(0, Field::new("describe", DataType::Utf8, false));
+
+        //collect recordBatch
+        let describe_record_batch = vec![
+            // count aggregation
+            self.clone()
+                .aggregate(
+                    vec![],
+                    fields_iter
+                        .clone()
+                        .map(|f| {
+                            Expr::Alias(
+                                Box::new(datafusion_expr::count(col(f.name()))),
+                                f.name().to_string(),
+                            )

Review Comment:
   Thanks, this API is more convenient to use.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org