Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/19 08:11:43 UTC
[spark] branch master updated: [SPARK-40142][PYTHON][SQL] Make pyspark.sql.functions examples self-contained (part 1, 25 functions)
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 15d44904f22 [SPARK-40142][PYTHON][SQL] Make pyspark.sql.functions examples self-contained (part 1, 25 functions)
15d44904f22 is described below
commit 15d44904f2256e471723a548c03e3309e1b08a02
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Fri Aug 19 17:10:42 2022 +0900
[SPARK-40142][PYTHON][SQL] Make pyspark.sql.functions examples self-contained (part 1, 25 functions)
### What changes were proposed in this pull request?
This PR proposes to improve the examples in `pyspark.sql.functions` by making each example self-contained, with a brief explanation and a slightly more realistic example.
This PR covers only the first 25 functions in `pyspark.sql.functions`, since the total number of functions is too large to handle in a single PR.
This PR additionally removes `since`, because it is no longer needed once the complete examples and parameters are in place.
### Why are the changes needed?
To make the documentation more readable and to let the examples be copied and pasted directly into the PySpark shell.
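For example, under the new style each snippet creates its own DataFrame, so it can be pasted into the shell verbatim. A sketch based on the updated `lit` docstring below (the import is added here for clarity; `spark` is the session that the PySpark shell predefines):

    >>> from pyspark.sql.functions import lit
    >>> df = spark.range(1)
    >>> df.select(lit(5).alias('height'), df.id).show()
    +------+---+
    |height| id|
    +------+---+
    |     5|  0|
    +------+---+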
### Does this PR introduce _any_ user-facing change?
Yes, it changes the documentation.
### How was this patch tested?
Manually ran each doctest. CI also runs them.
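For reference, a minimal sketch of such a manual doctest run, assuming a local PySpark dev install (this mirrors the module's `_test()` runner rather than the exact CI entry point):

    import doctest

    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F

    # The examples reference a global `spark` session, so provide one.
    spark = SparkSession.builder.master("local[4]").appName("functions-doctests").getOrCreate()
    globs = F.__dict__.copy()
    globs["spark"] = spark

    # ELLIPSIS / NORMALIZE_WHITESPACE keep the comparison robust to minor formatting noise.
    failures, _ = doctest.testmod(F, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    assert failures == 0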
Closes #37575 from HyukjinKwon/SPARK-40142.
Authored-by: Hyukjin Kwon <gu...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/sql/functions.py | 622 +++++++++++++++++++++++++++++++++++-----
1 file changed, 551 insertions(+), 71 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 9dd81145243..df71969ccb7 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -37,7 +37,7 @@ from typing import (
ValuesView,
)
-from pyspark import since, SparkContext
+from pyspark import SparkContext
from pyspark.rdd import PythonEvalType
from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal
from pyspark.sql.dataframe import DataFrame
@@ -129,19 +129,46 @@ def lit(col: Any) -> Column:
.. versionadded:: 1.3.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or Python primitive type
+ the value to make into a PySpark literal. If a column is passed,
+ it is returned as is.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the literal instance.
+
Examples
--------
- >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1)
- [Row(height=5, spark_user=True)]
+ >>> df = spark.range(1)
+ >>> df.select(lit(5).alias('height'), df.id).show()
+ +------+---+
+ |height| id|
+ +------+---+
+ | 5| 0|
+ +------+---+
"""
return col if isinstance(col, Column) else _invoke_function("lit", col)
-@since(1.3)
def col(col: str) -> Column:
"""
Returns a :class:`~pyspark.sql.Column` based on the given column name.
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : str
+ the name for the column
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the corresponding column instance.
+
Examples
--------
>>> col('x')
@@ -155,34 +182,143 @@ def col(col: str) -> Column:
column = col
-@since(1.3)
def asc(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the ascending order of the given column name.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the ascending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ Sort by the column 'id' in descending order.
+
+ >>> df = spark.range(5)
+ >>> df = df.sort(desc("id"))
+ >>> df.show()
+ +---+
+ | id|
+ +---+
+ | 4|
+ | 3|
+ | 2|
+ | 1|
+ | 0|
+ +---+
+
+ Sort by the column 'id' in ascending order.
+
+ >>> df.orderBy(asc("id")).show()
+ +---+
+ | id|
+ +---+
+ | 0|
+ | 1|
+ | 2|
+ | 3|
+ | 4|
+ +---+
"""
return col.asc() if isinstance(col, Column) else _invoke_function("asc", col)
-@since(1.3)
def desc(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the descending order of the given column name.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to sort by in the descending order.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column specifying the order.
+
+ Examples
+ --------
+ Sort by the column 'id' in descending order.
+
+ >>> spark.range(5).orderBy(desc("id")).show()
+ +---+
+ | id|
+ +---+
+ | 4|
+ | 3|
+ | 2|
+ | 1|
+ | 0|
+ +---+
"""
return col.desc() if isinstance(col, Column) else _invoke_function("desc", col)
-@since(1.3)
def sqrt(col: "ColumnOrName") -> Column:
"""
Computes the square root of the specified float value.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(sqrt(lit(4))).show()
+ +-------+
+ |SQRT(4)|
+ +-------+
+ | 2.0|
+ +-------+
"""
return _invoke_function_over_columns("sqrt", col)
-@since(1.3)
def abs(col: "ColumnOrName") -> Column:
"""
Computes the absolute value.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(abs(lit(-1))).show()
+ +-------+
+ |abs(-1)|
+ +-------+
+ | 1|
+ +-------+
"""
return _invoke_function_over_columns("abs", col)
@@ -196,7 +332,7 @@ def mode(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- target column that the value will be returned
+ target column to compute on.
Returns
-------
@@ -221,18 +357,60 @@ def mode(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("mode", col)
-@since(1.3)
def max(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the maximum value of the expression in a group.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(10)
+ >>> df.select(max(col("id"))).show()
+ +-------+
+ |max(id)|
+ +-------+
+ | 9|
+ +-------+
"""
return _invoke_function_over_columns("max", col)
-@since(1.3)
def min(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the minimum value of the expression in a group.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(10)
+ >>> df.select(min(df.id)).show()
+ +-------+
+ |min(id)|
+ +-------+
+ | 0|
+ +-------+
"""
return _invoke_function_over_columns("min", col)
@@ -246,7 +424,7 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- target column that the value will be returned
+ target column to compute on.
ord : :class:`~pyspark.sql.Column` or str
column to be maximized
@@ -281,7 +459,7 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- target column that the value will be returned
+ target column to compute on.
ord : :class:`~pyspark.sql.Column` or str
column to be minimized
@@ -307,34 +485,120 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
return _invoke_function_over_columns("min_by", col, ord)
-@since(1.3)
def count(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the number of items in a group.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ column for computed results.
+
+ Examples
+ --------
+ Count all rows (``count(*)``), and count the values of a column, which does not count ``None``.
+
+ >>> df = spark.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"])
+ >>> df.select(count(expr("*")), count(df.alphabets)).show()
+ +--------+----------------+
+ |count(1)|count(alphabets)|
+ +--------+----------------+
+ | 4| 3|
+ +--------+----------------+
"""
return _invoke_function_over_columns("count", col)
-@since(1.3)
def sum(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the sum of all values in the expression.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(10)
+ >>> df.select(sum(df["id"])).show()
+ +-------+
+ |sum(id)|
+ +-------+
+ | 45|
+ +-------+
"""
return _invoke_function_over_columns("sum", col)
-@since(1.3)
def avg(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the average of the values in a group.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(10)
+ >>> df.select(avg(col("id"))).show()
+ +-------+
+ |avg(id)|
+ +-------+
+ | 4.5|
+ +-------+
"""
return _invoke_function_over_columns("avg", col)
-@since(1.3)
def mean(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the average of the values in a group.
+
+ .. versionadded:: 1.3.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(10)
+ >>> df.select(mean(df.id)).show()
+ +-------+
+ |avg(id)|
+ +-------+
+ | 4.5|
+ +-------+
"""
return _invoke_function_over_columns("mean", col)
@@ -348,7 +612,7 @@ def median(col: "ColumnOrName") -> Column:
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
- target column that the value will be returned
+ target column to compute on.
Returns
-------
@@ -373,11 +637,12 @@ def median(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("median", col)
-@since(1.3)
def sumDistinct(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the sum of distinct values in the expression.
+ .. versionadded:: 1.3.0
+
.. deprecated:: 3.2.0
Use :func:`sum_distinct` instead.
"""
@@ -385,10 +650,31 @@ def sumDistinct(col: "ColumnOrName") -> Column:
return sum_distinct(col)
-@since(3.2)
def sum_distinct(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the sum of distinct values in the expression.
+
+ .. versionadded:: 3.2.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([(None,), (1,), (1,), (2,)], schema=["numbers"])
+ >>> df.select(sum_distinct(col("numbers"))).show()
+ +---------------------+
+ |sum(DISTINCT numbers)|
+ +---------------------+
+ | 3|
+ +---------------------+
"""
return _invoke_function_over_columns("sum_distinct", col)
@@ -404,6 +690,11 @@ def product(col: "ColumnOrName") -> Column:
col : str, :class:`Column`
column containing values to be multiplied together
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
Examples
--------
>>> df = spark.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3)
@@ -416,7 +707,6 @@ def product(col: "ColumnOrName") -> Column:
| 1| 28.0|
| 2| 80.0|
+----+-------+
-
"""
return _invoke_function_over_columns("product", col)
@@ -427,10 +717,26 @@ def acos(col: "ColumnOrName") -> Column:
.. versionadded:: 1.4.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
Returns
-------
:class:`~pyspark.sql.Column`
inverse cosine of `col`, as if computed by `java.lang.Math.acos()`
+
+ Examples
+ --------
+ >>> df = spark.range(1, 3)
+ >>> df.select(acos(df.id)).show()
+ +--------+
+ |ACOS(id)|
+ +--------+
+ | 0.0|
+ | NaN|
+ +--------+
"""
return _invoke_function_over_columns("acos", col)
@@ -441,9 +747,26 @@ def acosh(col: "ColumnOrName") -> Column:
.. versionadded:: 3.1.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
Returns
-------
:class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(2)
+ >>> df.select(acosh(col("id"))).show()
+ +---------+
+ |ACOSH(id)|
+ +---------+
+ | NaN|
+ | 0.0|
+ +---------+
"""
return _invoke_function_over_columns("acosh", col)
@@ -454,11 +777,26 @@ def asin(col: "ColumnOrName") -> Column:
.. versionadded:: 1.3.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
Returns
-------
:class:`~pyspark.sql.Column`
inverse sine of `col`, as if computed by `java.lang.Math.asin()`
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([(0,), (2,)])
+ >>> df.select(asin(df.schema.fieldNames()[0])).show()
+ +--------+
+ |ASIN(_1)|
+ +--------+
+ | 0.0|
+ | NaN|
+ +--------+
"""
return _invoke_function_over_columns("asin", col)
@@ -469,9 +807,25 @@ def asinh(col: "ColumnOrName") -> Column:
.. versionadded:: 3.1.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
Returns
-------
:class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(asinh(col("id"))).show()
+ +---------+
+ |ASINH(id)|
+ +---------+
+ | 0.0|
+ +---------+
"""
return _invoke_function_over_columns("asinh", col)
@@ -482,10 +836,25 @@ def atan(col: "ColumnOrName") -> Column:
.. versionadded:: 1.4.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
Returns
-------
:class:`~pyspark.sql.Column`
inverse tangent of `col`, as if computed by `java.lang.Math.atan()`
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(atan(df.id)).show()
+ +--------+
+ |ATAN(id)|
+ +--------+
+ | 0.0|
+ +--------+
"""
return _invoke_function_over_columns("atan", col)
@@ -496,25 +865,84 @@ def atanh(col: "ColumnOrName") -> Column:
.. versionadded:: 3.1.0
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
Returns
-------
:class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([(0,), (2,)], schema=["numbers"])
+ >>> df.select(atanh(df["numbers"])).show()
+ +--------------+
+ |ATANH(numbers)|
+ +--------------+
+ | 0.0|
+ | NaN|
+ +--------------+
"""
return _invoke_function_over_columns("atanh", col)
-@since(1.4)
def cbrt(col: "ColumnOrName") -> Column:
"""
Computes the cube-root of the given value.
+
+ .. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(cbrt(lit(27))).show()
+ +--------+
+ |CBRT(27)|
+ +--------+
+ | 3.0|
+ +--------+
"""
return _invoke_function_over_columns("cbrt", col)
-@since(1.4)
def ceil(col: "ColumnOrName") -> Column:
"""
Computes the ceiling of the given value.
+
+ .. versionadded:: 1.4.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ target column to compute on.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ the column for computed results.
+
+ Examples
+ --------
+ >>> df = spark.range(1)
+ >>> df.select(ceil(lit(-0.1))).show()
+ +----------+
+ |CEIL(-0.1)|
+ +----------+
+ | 0|
+ +----------+
"""
return _invoke_function_over_columns("ceil", col)
@@ -595,59 +1023,66 @@ def csc(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("csc", col)
-@since(1.4)
def exp(col: "ColumnOrName") -> Column:
"""
Computes the exponential of the given value.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("exp", col)
-@since(1.4)
def expm1(col: "ColumnOrName") -> Column:
"""
Computes the exponential of the given value minus one.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("expm1", col)
-@since(1.4)
def floor(col: "ColumnOrName") -> Column:
"""
Computes the floor of the given value.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("floor", col)
-@since(1.4)
def log(col: "ColumnOrName") -> Column:
"""
Computes the natural logarithm of the given value.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("log", col)
-@since(1.4)
def log10(col: "ColumnOrName") -> Column:
"""
Computes the logarithm of the given value in Base 10.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("log10", col)
-@since(1.4)
def log1p(col: "ColumnOrName") -> Column:
"""
Computes the natural logarithm of the given value plus one.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("log1p", col)
-@since(1.4)
def rint(col: "ColumnOrName") -> Column:
"""
Returns the double value that is closest in value to the argument and
is equal to a mathematical integer.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("rint", col)
@@ -671,10 +1106,11 @@ def sec(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("sec", col)
-@since(1.4)
def signum(col: "ColumnOrName") -> Column:
"""
Computes the signum of the given value.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_function_over_columns("signum", col)
@@ -756,9 +1192,10 @@ def tanh(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("tanh", col)
-@since(1.4)
def toDegrees(col: "ColumnOrName") -> Column:
"""
+ .. versionadded:: 1.4.0
+
.. deprecated:: 2.1.0
Use :func:`degrees` instead.
"""
@@ -766,9 +1203,10 @@ def toDegrees(col: "ColumnOrName") -> Column:
return degrees(col)
-@since(1.4)
def toRadians(col: "ColumnOrName") -> Column:
"""
+ .. versionadded:: 1.4.0
+
.. deprecated:: 2.1.0
Use :func:`radians` instead.
"""
@@ -776,11 +1214,12 @@ def toRadians(col: "ColumnOrName") -> Column:
return radians(col)
-@since(1.4)
def bitwiseNOT(col: "ColumnOrName") -> Column:
"""
Computes bitwise not.
+ .. versionadded:: 1.4.0
+
.. deprecated:: 3.2.0
Use :func:`bitwise_not` instead.
"""
@@ -788,19 +1227,21 @@ def bitwiseNOT(col: "ColumnOrName") -> Column:
return bitwise_not(col)
-@since(3.2)
def bitwise_not(col: "ColumnOrName") -> Column:
"""
Computes bitwise not.
+
+ .. versionadded:: 3.2.0
"""
return _invoke_function_over_columns("bitwise_not", col)
-@since(2.4)
def asc_nulls_first(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the ascending order of the given
column name, and null values return before non-null values.
+
+ .. versionadded:: 2.4.0
"""
return (
col.asc_nulls_first()
@@ -809,22 +1250,24 @@ def asc_nulls_first(col: "ColumnOrName") -> Column:
)
-@since(2.4)
def asc_nulls_last(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the ascending order of the given
column name, and null values appear after non-null values.
+
+ .. versionadded:: 2.4.0
"""
return (
col.asc_nulls_last() if isinstance(col, Column) else _invoke_function("asc_nulls_last", col)
)
-@since(2.4)
def desc_nulls_first(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the descending order of the given
column name, and null values appear before non-null values.
+
+ .. versionadded:: 2.4.0
"""
return (
col.desc_nulls_first()
@@ -833,11 +1276,12 @@ def desc_nulls_first(col: "ColumnOrName") -> Column:
)
-@since(2.4)
def desc_nulls_last(col: "ColumnOrName") -> Column:
"""
Returns a sort expression based on the descending order of the given
column name, and null values appear after non-null values.
+
+ .. versionadded:: 2.4.0
"""
return (
col.desc_nulls_last()
@@ -846,69 +1290,77 @@ def desc_nulls_last(col: "ColumnOrName") -> Column:
)
-@since(1.6)
def stddev(col: "ColumnOrName") -> Column:
"""
Aggregate function: alias for stddev_samp.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("stddev", col)
-@since(1.6)
def stddev_samp(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the unbiased sample standard deviation of
the expression in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("stddev_samp", col)
-@since(1.6)
def stddev_pop(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns population standard deviation of
the expression in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("stddev_pop", col)
-@since(1.6)
def variance(col: "ColumnOrName") -> Column:
"""
Aggregate function: alias for var_samp
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("variance", col)
-@since(1.6)
def var_samp(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the unbiased sample variance of
the values in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("var_samp", col)
-@since(1.6)
def var_pop(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the population variance of the values in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("var_pop", col)
-@since(1.6)
def skewness(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the skewness of the values in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("skewness", col)
-@since(1.6)
def kurtosis(col: "ColumnOrName") -> Column:
"""
Aggregate function: returns the kurtosis of the values in a group.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function_over_columns("kurtosis", col)
@@ -1046,10 +1498,11 @@ def hypot(col1: "ColumnOrName", col2: float) -> Column:
...
-@since(1.4)
def hypot(col1: Union["ColumnOrName", float], col2: Union["ColumnOrName", float]) -> Column:
"""
Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_binary_math_function("hypot", col1, col2)
@@ -1069,10 +1522,11 @@ def pow(col1: "ColumnOrName", col2: float) -> Column:
...
-@since(1.4)
def pow(col1: Union["ColumnOrName", float], col2: Union["ColumnOrName", float]) -> Column:
"""
Returns the value of the first argument raised to the power of the second argument.
+
+ .. versionadded:: 1.4.0
"""
return _invoke_binary_math_function("pow", col1, col2)
@@ -1116,15 +1570,15 @@ def pmod(dividend: Union["ColumnOrName", float], divisor: Union["ColumnOrName",
return _invoke_binary_math_function("pmod", dividend, divisor)
-@since(1.6)
def row_number() -> Column:
"""
Window function: returns a sequential number starting at 1 within a window partition.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function("row_number")
-@since(1.6)
def dense_rank() -> Column:
"""
Window function: returns the rank of rows within a window partition, without any gaps.
@@ -1136,11 +1590,12 @@ def dense_rank() -> Column:
the person that came in third place (after the ties) would register as coming in fifth.
This is equivalent to the DENSE_RANK function in SQL.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function("dense_rank")
-@since(1.6)
def rank() -> Column:
"""
Window function: returns the rank of rows within a window partition.
@@ -1152,30 +1607,35 @@ def rank() -> Column:
the person that came in third place (after the ties) would register as coming in fifth.
This is equivalent to the RANK function in SQL.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function("rank")
-@since(1.6)
def cume_dist() -> Column:
"""
Window function: returns the cumulative distribution of values within a window partition,
i.e. the fraction of rows that are below the current row.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function("cume_dist")
-@since(1.6)
def percent_rank() -> Column:
"""
Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
+
+ .. versionadded:: 1.6.0
"""
return _invoke_function("percent_rank")
-@since(1.3)
def approxCountDistinct(col: "ColumnOrName", rsd: Optional[float] = None) -> Column:
"""
+ .. versionadded:: 1.3.0
+
.. deprecated:: 2.1.0
Use :func:`approx_count_distinct` instead.
"""
@@ -1207,9 +1667,12 @@ def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> C
return _invoke_function("approx_count_distinct", _to_java_column(col), rsd)
-@since(1.6)
def broadcast(df: DataFrame) -> DataFrame:
- """Marks a DataFrame as small enough for use in broadcast joins."""
+ """
+ Marks a DataFrame as small enough for use in broadcast joins.
+
+ .. versionadded:: 1.6.0
+ """
sc = SparkContext._active_spark_context
assert sc is not None and sc._jvm is not None
@@ -1411,9 +1874,12 @@ def grouping_id(*cols: "ColumnOrName") -> Column:
return _invoke_function_over_seq_of_columns("grouping_id", cols)
-@since(1.6)
def input_file_name() -> Column:
- """Creates a string column for the file name of the current Spark task."""
+ """
+ Creates a string column for the file name of the current Spark task.
+
+ .. versionadded:: 1.6.0
+ """
return _invoke_function("input_file_name")
@@ -2019,11 +2485,12 @@ def ntile(n: int) -> Column:
# ---------------------- Date/Timestamp functions ------------------------------
-@since(1.5)
def current_date() -> Column:
"""
Returns the current date at the start of query evaluation as a :class:`DateType` column.
All calls of current_date within the same query return the same value.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function("current_date")
@@ -2032,6 +2499,8 @@ def current_timestamp() -> Column:
"""
Returns the current timestamp at the start of query evaluation as a :class:`TimestampType`
column. All calls of current_timestamp within the same query return the same value.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function("current_timestamp")
@@ -2933,11 +3402,12 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None
return _invoke_function("assert_true", _to_java_column(col), errMsg)
-@since(3.1)
def raise_error(errMsg: Union[Column, str]) -> Column:
"""
Throws an exception with the provided error message.
+ .. versionadded:: 3.1.0
+
Parameters
----------
errMsg : :class:`~pyspark.sql.Column` or str
@@ -2955,66 +3425,74 @@ def raise_error(errMsg: Union[Column, str]) -> Column:
# ---------------------- String/Binary functions ------------------------------
-@since(1.5)
def upper(col: "ColumnOrName") -> Column:
"""
Converts a string expression to upper case.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("upper", col)
-@since(1.5)
def lower(col: "ColumnOrName") -> Column:
"""
Converts a string expression to lower case.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("lower", col)
-@since(1.5)
def ascii(col: "ColumnOrName") -> Column:
"""
Computes the numeric value of the first character of the string column.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("ascii", col)
-@since(1.5)
def base64(col: "ColumnOrName") -> Column:
"""
Computes the BASE64 encoding of a binary column and returns it as a string column.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("base64", col)
-@since(1.5)
def unbase64(col: "ColumnOrName") -> Column:
"""
Decodes a BASE64 encoded string column and returns it as a binary column.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("unbase64", col)
-@since(1.5)
def ltrim(col: "ColumnOrName") -> Column:
"""
Trim the spaces from left end for the specified string value.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("ltrim", col)
-@since(1.5)
def rtrim(col: "ColumnOrName") -> Column:
"""
Trim the spaces from right end for the specified string value.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("rtrim", col)
-@since(1.5)
def trim(col: "ColumnOrName") -> Column:
"""
Trim the spaces from both ends for the specified string column.
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function_over_columns("trim", col)
@@ -3037,20 +3515,22 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column:
return _invoke_function("concat_ws", sep, _to_seq(sc, cols, _to_java_column))
-@since(1.5)
def decode(col: "ColumnOrName", charset: str) -> Column:
"""
Computes the first argument into a string from a binary using the provided character set
(one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function("decode", _to_java_column(col), charset)
-@since(1.5)
def encode(col: "ColumnOrName", charset: str) -> Column:
"""
Computes the first argument into a binary from a string using the provided character set
(one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
+
+ .. versionadded:: 1.5.0
"""
return _invoke_function("encode", _to_java_column(col), charset)