You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/23 00:37:51 UTC
[spark] branch master updated: [SPARK-40142][PYTHON][SQL][FOLLOW-UP] Make pyspark.sql.functions examples self-contained (part 2, 32 functions)
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d2f012efab7 [SPARK-40142][PYTHON][SQL][FOLLOW-UP] Make pyspark.sql.functions examples self-contained (part 2, 32 functions)
d2f012efab7 is described below
commit d2f012efab7e836ae066087be1febc58686b69cf
Author: Khalid Mammadov <kh...@gmail.com>
AuthorDate: Tue Aug 23 09:37:37 2022 +0900
[SPARK-40142][PYTHON][SQL][FOLLOW-UP] Make pyspark.sql.functions examples self-contained (part 2, 32 functions)
### What changes were proposed in this pull request?
Docstring improvements
### Why are the changes needed?
To help users to understand pyspark API
### Does this PR introduce _any_ user-facing change?
Yes, documentation
### How was this patch tested?
`bundle exec jekyll serve --host 0.0.0.0`
Closes #37592 from khalidmammadov/feature/improve_docstrings.
Lead-authored-by: Khalid Mammadov <kh...@gmail.com>
Co-authored-by: Hyukjin Kwon <gu...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/sql/functions.py | 528 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 522 insertions(+), 6 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index abedaf24417..d59532f52cb 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -963,6 +963,13 @@ def cos(col: "ColumnOrName") -> Column:
-------
:class:`~pyspark.sql.Column`
cosine of the angle, as if computed by `java.lang.Math.cos()`.
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(cos(lit(math.pi))).first()
+ Row(COS(3.14159...)=-1.0)
"""
return _invoke_function_over_columns("cos", col)
@@ -982,6 +989,12 @@ def cosh(col: "ColumnOrName") -> Column:
-------
:class:`~pyspark.sql.Column`
hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(cosh(lit(1))).first()
+ Row(COSH(1)=1.54308...)
"""
return _invoke_function_over_columns("cosh", col)
@@ -995,12 +1008,19 @@ def cot(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- Angle in radians
+ angle in radians.
Returns
-------
:class:`~pyspark.sql.Column`
- Cotangent of the angle.
+ cotangent of the angle.
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(cot(lit(math.radians(45)))).first()
+ Row(COT(0.78539...)=1.00000...)
"""
return _invoke_function_over_columns("cot", col)
@@ -1014,12 +1034,19 @@ def csc(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- Angle in radians
+ angle in radians.
Returns
-------
:class:`~pyspark.sql.Column`
- Cosecant of the angle.
+ cosecant of the angle.
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(csc(lit(math.radians(90)))).first()
+ Row(CSC(1.57079...)=1.0)
"""
return _invoke_function_over_columns("csc", col)
@@ -1029,6 +1056,26 @@ def exp(col: "ColumnOrName") -> Column:
Computes the exponential of the given value.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to calculate exponential for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ exponential of the given value.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(exp(lit(0))).show()
+ +------+
+ |EXP(0)|
+ +------+
+ | 1.0|
+ +------+
"""
return _invoke_function_over_columns("exp", col)
@@ -1038,6 +1085,22 @@ def expm1(col: "ColumnOrName") -> Column:
Computes the exponential of the given value minus one.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to calculate exponential for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ exponential less one.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(expm1(lit(1))).first()
+ Row(EXPM1(1)=1.71828...)
"""
return _invoke_function_over_columns("expm1", col)
@@ -1047,6 +1110,26 @@ def floor(col: "ColumnOrName") -> Column:
Computes the floor of the given value.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to find floor for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ nearest integer that is less than or equal to given value.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(floor(lit(2.5))).show()
+ +----------+
+ |FLOOR(2.5)|
+ +----------+
+ | 2|
+ +----------+
"""
return _invoke_function_over_columns("floor", col)
@@ -1056,6 +1139,23 @@ def log(col: "ColumnOrName") -> Column:
Computes the natural logarithm of the given value.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to calculate natural logarithm for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ natural logarithm of the given value.
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(log(lit(math.e))).first()
+ Row(ln(2.71828...)=1.0)
"""
return _invoke_function_over_columns("log", col)
@@ -1065,15 +1165,57 @@ def log10(col: "ColumnOrName") -> Column:
Computes the logarithm of the given value in Base 10.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to calculate logarithm for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ logarithm of the given value in Base 10.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(log10(lit(100))).show()
+ +----------+
+ |LOG10(100)|
+ +----------+
+ | 2.0|
+ +----------+
"""
return _invoke_function_over_columns("log10", col)
def log1p(col: "ColumnOrName") -> Column:
"""
- Computes the natural logarithm of the given value plus one.
+ Computes the natural logarithm of the "given value plus one".
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ column to calculate natural logarithm for.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ natural logarithm of the "given value plus one".
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(log1p(lit(math.e))).first()
+ Row(LOG1P(2.71828...)=1.31326...)
+
+ Same as:
+
+ >>> df.select(log(lit(math.e+1))).first()
+ Row(ln(3.71828...)=1.31326...)
"""
return _invoke_function_over_columns("log1p", col)
@@ -1084,6 +1226,33 @@ def rint(col: "ColumnOrName") -> Column:
is equal to a mathematical integer.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(rint(lit(10.6))).show()
+ +----------+
+ |rint(10.6)|
+ +----------+
+ | 11.0|
+ +----------+
+
+ >>> df.select(rint(lit(10.3))).show()
+ +----------+
+ |rint(10.3)|
+ +----------+
+ | 10.0|
+ +----------+
"""
return _invoke_function_over_columns("rint", col)
@@ -1103,6 +1272,12 @@ def sec(col: "ColumnOrName") -> Column:
-------
:class:`~pyspark.sql.Column`
secant of the angle.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(sec(lit(1.5))).first()
+ Row(SEC(1.5)=14.13683...)
"""
return _invoke_function_over_columns("sec", col)
@@ -1112,6 +1287,33 @@ def signum(col: "ColumnOrName") -> Column:
Computes the signum of the given value.
.. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(signum(lit(-5))).show()
+ +----------+
+ |SIGNUM(-5)|
+ +----------+
+ | -1.0|
+ +----------+
+
+ >>> df.select(signum(lit(6))).show()
+ +---------+
+ |SIGNUM(6)|
+ +---------+
+ | 1.0|
+ +---------+
"""
return _invoke_function_over_columns("signum", col)
@@ -1125,11 +1327,19 @@ def sin(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
Returns
-------
:class:`~pyspark.sql.Column`
sine of the angle, as if computed by `java.lang.Math.sin()`
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(sin(lit(math.radians(90)))).first()
+ Row(SIN(1.57079...)=1.0)
"""
return _invoke_function_over_columns("sin", col)
@@ -1143,13 +1353,19 @@ def sinh(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- hyperbolic angle
+ hyperbolic angle.
Returns
-------
:class:`~pyspark.sql.Column`
hyperbolic sine of the given value,
as if computed by `java.lang.Math.sinh()`
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(sinh(lit(1.1))).first()
+ Row(SINH(1.1)=1.33564...)
"""
return _invoke_function_over_columns("sinh", col)
@@ -1169,6 +1385,13 @@ def tan(col: "ColumnOrName") -> Column:
-------
:class:`~pyspark.sql.Column`
tangent of the given value, as if computed by `java.lang.Math.tan()`
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(tan(lit(math.radians(45)))).first()
+ Row(TAN(0.78539...)=0.99999...)
"""
return _invoke_function_over_columns("tan", col)
@@ -1189,6 +1412,13 @@ def tanh(col: "ColumnOrName") -> Column:
:class:`~pyspark.sql.Column`
hyperbolic tangent of the given value
as if computed by `java.lang.Math.tanh()`
+
+ Examples
+ --------
+ >>> import math
+ >>> df = spark.range(1)
+ >>> df.select(tanh(lit(math.radians(90)))).first()
+ Row(TANH(1.57079...)=0.91715...)
"""
return _invoke_function_over_columns("tanh", col)
@@ -1233,6 +1463,32 @@ def bitwise_not(col: "ColumnOrName") -> Column:
Computes bitwise not.
.. versionadded:: 3.2.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(bitwise_not(lit(0))).show()
+ +---+
+ | ~0|
+ +---+
+ | -1|
+ +---+
+ >>> df.select(bitwise_not(lit(1))).show()
+ +---+
+ | ~1|
+ +---+
+ | -2|
+ +---+
"""
return _invoke_function_over_columns("bitwise_not", col)
@@ -1243,6 +1499,31 @@ def asc_nulls_first(col: "ColumnOrName") -> Column:
column name, and null values return before non-null values.
.. versionadded:: 2.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the ascending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ >>> df1 = spark.createDataFrame([(1, "Bob"),
+ ... (0, None),
+ ... (2, "Alice")], ["age", "name"])
+ >>> df1.sort(asc_nulls_first(df1.name)).show()
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 0| null|
+ | 2|Alice|
+ | 1| Bob|
+ +---+-----+
+
"""
return (
col.asc_nulls_first()
@@ -1257,6 +1538,31 @@ def asc_nulls_last(col: "ColumnOrName") -> Column:
column name, and null values appear after non-null values.
.. versionadded:: 2.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the ascending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ >>> df1 = spark.createDataFrame([(0, None),
+ ... (1, "Bob"),
+ ... (2, "Alice")], ["age", "name"])
+ >>> df1.sort(asc_nulls_last(df1.name)).show()
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 2|Alice|
+ | 1| Bob|
+ | 0| null|
+ +---+-----+
+
"""
return (
col.asc_nulls_last() if isinstance(col, Column) else _invoke_function("asc_nulls_last", col)
@@ -1269,6 +1575,31 @@ def desc_nulls_first(col: "ColumnOrName") -> Column:
column name, and null values appear before non-null values.
.. versionadded:: 2.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the descending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ >>> df1 = spark.createDataFrame([(0, None),
+ ... (1, "Bob"),
+ ... (2, "Alice")], ["age", "name"])
+ >>> df1.sort(desc_nulls_first(df1.name)).show()
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 0| null|
+ | 1| Bob|
+ | 2|Alice|
+ +---+-----+
+
"""
return (
col.desc_nulls_first()
@@ -1283,6 +1614,31 @@ def desc_nulls_last(col: "ColumnOrName") -> Column:
column name, and null values appear after non-null values.
.. versionadded:: 2.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the descending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ >>> df1 = spark.createDataFrame([(0, None),
+ ... (1, "Bob"),
+ ... (2, "Alice")], ["age", "name"])
+ >>> df1.sort(desc_nulls_last(df1.name)).show()
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 1| Bob|
+ | 2|Alice|
+ | 0| null|
+ +---+-----+
+
"""
return (
col.desc_nulls_last()
@@ -1296,6 +1652,22 @@ def stddev(col: "ColumnOrName") -> Column:
Aggregate function: alias for stddev_samp.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ standard deviation of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(stddev(df.id)).first()
+ Row(stddev_samp(id)=1.87082...)
"""
return _invoke_function_over_columns("stddev", col)
@@ -1306,6 +1678,22 @@ def stddev_samp(col: "ColumnOrName") -> Column:
the expression in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ standard deviation of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(stddev_samp(df.id)).first()
+ Row(stddev_samp(id)=1.87082...)
"""
return _invoke_function_over_columns("stddev_samp", col)
@@ -1316,6 +1704,22 @@ def stddev_pop(col: "ColumnOrName") -> Column:
the expression in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ standard deviation of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(stddev_pop(df.id)).first()
+ Row(stddev_pop(id)=1.70782...)
"""
return _invoke_function_over_columns("stddev_pop", col)
@@ -1325,6 +1729,26 @@ def variance(col: "ColumnOrName") -> Column:
Aggregate function: alias for var_samp
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ variance of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(variance(df.id)).show()
+ +------------+
+ |var_samp(id)|
+ +------------+
+ | 3.5|
+ +------------+
"""
return _invoke_function_over_columns("variance", col)
@@ -1335,6 +1759,26 @@ def var_samp(col: "ColumnOrName") -> Column:
the values in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ variance of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(var_samp(df.id)).show()
+ +------------+
+ |var_samp(id)|
+ +------------+
+ | 3.5|
+ +------------+
"""
return _invoke_function_over_columns("var_samp", col)
@@ -1344,6 +1788,22 @@ def var_pop(col: "ColumnOrName") -> Column:
Aggregate function: returns the population variance of the values in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ variance of given column.
+
+ Examples
+ --------
+ >>> df = spark.range(6)
+ >>> df.select(var_pop(df.id)).first()
+ Row(var_pop(id)=2.91666...)
"""
return _invoke_function_over_columns("var_pop", col)
@@ -1353,6 +1813,22 @@ def skewness(col: "ColumnOrName") -> Column:
Aggregate function: returns the skewness of the values in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ skewness of given column.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
+ >>> df.select(skewness(df.c)).first()
+ Row(skewness(c)=0.70710...)
"""
return _invoke_function_over_columns("skewness", col)
@@ -1362,6 +1838,26 @@ def kurtosis(col: "ColumnOrName") -> Column:
Aggregate function: returns the kurtosis of the values in a group.
.. versionadded:: 1.6.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ kurtosis of given column.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
+ >>> df.select(kurtosis(df.c)).show()
+ +-----------+
+ |kurtosis(c)|
+ +-----------+
+ | -1.5|
+ +-----------+
"""
return _invoke_function_over_columns("kurtosis", col)
@@ -1377,6 +1873,16 @@ def collect_list(col: "ColumnOrName") -> Column:
The function is non-deterministic because the order of collected results depends
on the order of the rows which may be non-deterministic after a shuffle.
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ list of objects with duplicates.
+
Examples
--------
>>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
@@ -1397,6 +1903,16 @@ def collect_set(col: "ColumnOrName") -> Column:
The function is non-deterministic because the order of collected results depends
on the order of the rows which may be non-deterministic after a shuffle.
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ list of objects with no duplicates.
+
Examples
--------
>>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org