You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2024/03/28 12:24:35 UTC
(spark) branch master updated: [SPARK-47621][PYTHON][DOCS] Refine docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean`
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b594c4edb383 [SPARK-47621][PYTHON][DOCS] Refine docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean`
b594c4edb383 is described below
commit b594c4edb38364139adc3934b14284d9ed9c7d46
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Thu Mar 28 20:24:16 2024 +0800
[SPARK-47621][PYTHON][DOCS] Refine docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean`
### What changes were proposed in this pull request?
This PR refines docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean` with more descriptive examples.
### Why are the changes needed?
For better API reference documentation.
### Does this PR introduce _any_ user-facing change?
Yes, it fixes user-facing documentation.
### How was this patch tested?
Manually tested. GitHub Actions should verify them.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45745 from HyukjinKwon/SPARK-47621.
Lead-authored-by: Hyukjin Kwon <gu...@apache.org>
Co-authored-by: Hyukjin Kwon <gu...@gmail.com>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
python/pyspark/sql/functions/builtin.py | 149 ++++++++++++++++++++++++++++----
1 file changed, 130 insertions(+), 19 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 59167ad9e736..386d28cca0c0 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -528,15 +528,45 @@ def try_avg(col: "ColumnOrName") -> Column:
Examples
--------
+ Example 1: Calculating the average age
+
>>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [(1982, 15), (1990, 2)], ["birth", "age"]
- ... ).select(sf.try_avg("age")).show()
+ >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])
+ >>> df.select(sf.try_avg("age")).show()
+------------+
|try_avg(age)|
+------------+
| 8.5|
+------------+
+
+ Example 2: Calculating the average age with None
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"])
+ >>> df.select(sf.try_avg("age")).show()
+ +------------+
+ |try_avg(age)|
+ +------------+
+ | 3.0|
+ +------------+
+
+ Example 3: Overflow results in NULL when ANSI mode is on
+
+ >>> from decimal import Decimal
+ >>> import pyspark.sql.functions as sf
+ >>> origin = spark.conf.get("spark.sql.ansi.enabled")
+ >>> spark.conf.set("spark.sql.ansi.enabled", "true")
+ >>> try:
+ ... df = spark.createDataFrame(
+ ... [(Decimal("1" * 38),), (Decimal(0),)], "number DECIMAL(38, 0)")
+ ... df.select(sf.try_avg(df.number)).show()
+ ... finally:
+ ... spark.conf.set("spark.sql.ansi.enabled", origin)
+ +---------------+
+ |try_avg(number)|
+ +---------------+
+ | NULL|
+ +---------------+
"""
return _invoke_function_over_columns("try_avg", col)
@@ -720,13 +750,55 @@ def try_sum(col: "ColumnOrName") -> Column:
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> spark.range(10).select(sf.try_sum("id")).show()
+ Example 1: Calculating the sum of values in a column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.range(10)
+ >>> df.select(sf.try_sum(df["id"])).show()
+-----------+
|try_sum(id)|
+-----------+
| 45|
+-----------+
+
+ Example 2: Using a plus expression to calculate the sum
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1, 2), (3, 4)], ["A", "B"])
+ >>> df.select(sf.try_sum(sf.col("A") + sf.col("B"))).show()
+ +----------------+
+ |try_sum((A + B))|
+ +----------------+
+ | 10|
+ +----------------+
+
+ Example 3: Calculating the sum of ages with None values
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"])
+ >>> df.select(sf.try_sum("age")).show()
+ +------------+
+ |try_sum(age)|
+ +------------+
+ | 6|
+ +------------+
+
+ Example 4: Overflow results in NULL when ANSI mode is on
+
+ >>> from decimal import Decimal
+ >>> import pyspark.sql.functions as sf
+ >>> origin = spark.conf.get("spark.sql.ansi.enabled")
+ >>> spark.conf.set("spark.sql.ansi.enabled", "true")
+ >>> try:
+ ... df = spark.createDataFrame([(Decimal("1" * 38),)] * 10, "number DECIMAL(38, 0)")
+ ... df.select(sf.try_sum(df.number)).show()
+ ... finally:
+ ... spark.conf.set("spark.sql.ansi.enabled", origin)
+ +---------------+
+ |try_sum(number)|
+ +---------------+
+ | NULL|
+ +---------------+
"""
return _invoke_function_over_columns("try_sum", col)
@@ -1323,6 +1395,17 @@ def sum(col: "ColumnOrName") -> Column:
+------------+
| 10|
+------------+
+
+ Example 3: Calculating the sum of ages with None values
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"])
+ >>> df.select(sf.sum("age")).show()
+ +--------+
+ |sum(age)|
+ +--------+
+ | 6|
+ +--------+
"""
return _invoke_function_over_columns("sum", col)
@@ -1349,13 +1432,27 @@ def avg(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.range(10)
- >>> df.select(avg(col("id"))).show()
- +-------+
- |avg(id)|
- +-------+
- | 4.5|
- +-------+
+ Example 1: Calculating the average age
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])
+ >>> df.select(sf.avg("age")).show()
+ +--------+
+ |avg(age)|
+ +--------+
+ | 8.5|
+ +--------+
+
+ Example 2: Calculating the average age with None
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"])
+ >>> df.select(sf.avg("age")).show()
+ +--------+
+ |avg(age)|
+ +--------+
+ | 3.0|
+ +--------+
"""
return _invoke_function_over_columns("avg", col)
@@ -1383,13 +1480,27 @@ def mean(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.range(10)
- >>> df.select(mean(df.id)).show()
- +-------+
- |avg(id)|
- +-------+
- | 4.5|
- +-------+
+ Example 1: Calculating the average age
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])
+ >>> df.select(sf.mean("age")).show()
+ +--------+
+ |avg(age)|
+ +--------+
+ | 8.5|
+ +--------+
+
+ Example 2: Calculating the average age with None
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"])
+ >>> df.select(sf.mean("age")).show()
+ +--------+
+ |avg(age)|
+ +--------+
+ | 3.0|
+ +--------+
"""
return _invoke_function_over_columns("mean", col)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org