You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ho...@apache.org on 2017/08/19 01:11:09 UTC

spark git commit: [SPARK-21566][SQL][PYTHON] Python method for summary

Repository: spark
Updated Branches:
  refs/heads/master a2db5c576 -> 10be01848


[SPARK-21566][SQL][PYTHON] Python method for summary

## What changes were proposed in this pull request?

Adds the recently added `summary` method to the python dataframe interface.

## How was this patch tested?

Additional inline doctests.

Author: Andrew Ray <ra...@gmail.com>

Closes #18762 from aray/summary-py.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/10be0184
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/10be0184
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/10be0184

Branch: refs/heads/master
Commit: 10be01848ef28004a287940a4e8d8a044e14b257
Parents: a2db5c5
Author: Andrew Ray <ra...@gmail.com>
Authored: Fri Aug 18 18:10:54 2017 -0700
Committer: Holden Karau <ho...@us.ibm.com>
Committed: Fri Aug 18 18:10:54 2017 -0700

----------------------------------------------------------------------
 python/pyspark/sql/dataframe.py | 61 +++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/10be0184/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 5cd208b..d1b2a9c 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -927,7 +927,7 @@ class DataFrame(object):
 
     @since("1.3.1")
     def describe(self, *cols):
-        """Computes statistics for numeric and string columns.
+        """Computes basic statistics for numeric and string columns.
 
         This include count, mean, stddev, min, and max. If no columns are
         given, this function computes statistics for all numerical or string columns.
@@ -955,12 +955,71 @@ class DataFrame(object):
         |    min|                 2|Alice|
         |    max|                 5|  Bob|
         +-------+------------------+-----+
+
+        Use summary for expanded statistics and control over which statistics to compute.
         """
         if len(cols) == 1 and isinstance(cols[0], list):
             cols = cols[0]
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
 
+    @since("2.3.0")
+    def summary(self, *statistics):
+        """Computes specified statistics for numeric and string columns. Available statistics are:
+        - count
+        - mean
+        - stddev
+        - min
+        - max
+        - arbitrary approximate percentiles specified as a percentage (eg, 75%)
+
+        If no statistics are given, this function computes count, mean, stddev, min,
+        approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+
+        .. note:: This function is meant for exploratory data analysis, as we make no
+            guarantee about the backward compatibility of the schema of the resulting DataFrame.
+
+        >>> df.summary().show()
+        +-------+------------------+-----+
+        |summary|               age| name|
+        +-------+------------------+-----+
+        |  count|                 2|    2|
+        |   mean|               3.5| null|
+        | stddev|2.1213203435596424| null|
+        |    min|                 2|Alice|
+        |    25%|               5.0| null|
+        |    50%|               5.0| null|
+        |    75%|               5.0| null|
+        |    max|                 5|  Bob|
+        +-------+------------------+-----+
+
+        >>> df.summary("count", "min", "25%", "75%", "max").show()
+        +-------+---+-----+
+        |summary|age| name|
+        +-------+---+-----+
+        |  count|  2|    2|
+        |    min|  2|Alice|
+        |    25%|5.0| null|
+        |    75%|5.0| null|
+        |    max|  5|  Bob|
+        +-------+---+-----+
+
+        To do a summary for specific columns first select them:
+
+        >>> df.select("age", "name").summary("count").show()
+        +-------+---+----+
+        |summary|age|name|
+        +-------+---+----+
+        |  count|  2|   2|
+        +-------+---+----+
+
+        See also describe for basic statistics.
+        """
+        if len(statistics) == 1 and isinstance(statistics[0], list):
+            statistics = statistics[0]
+        jdf = self._jdf.summary(self._jseq(statistics))
+        return DataFrame(jdf, self.sql_ctx)
+
     @ignore_unicode_prefix
     @since(1.3)
     def head(self, n=None):


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org