Posted to commits@spark.apache.org by gu...@apache.org on 2023/03/08 11:14:24 UTC
[spark] branch branch-3.4 updated: [SPARK-42713][PYTHON][DOCS] Add '__getattr__' and '__getitem__' of DataFrame and Column to API reference
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new 1fcf4975063 [SPARK-42713][PYTHON][DOCS] Add '__getattr__' and '__getitem__' of DataFrame and Column to API reference
1fcf4975063 is described below
commit 1fcf4975063d4817b794243ef5a1854fe7de8cce
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Wed Mar 8 20:13:57 2023 +0900
[SPARK-42713][PYTHON][DOCS] Add '__getattr__' and '__getitem__' of DataFrame and Column to API reference
### What changes were proposed in this pull request?
Add '__getattr__' and '__getitem__' of DataFrame and Column to API reference
### Why are the changes needed?
'__getattr__' and '__getitem__' are widely used, but we did not document them.
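For context, a minimal usage sketch of the two methods (illustrative only, assuming an active SparkSession `spark`; it mirrors the doctests added in this patch):
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age, df["name"]).show()  # DataFrame.__getattr__ and DataFrame.__getitem__
+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+
>>> df2 = spark.createDataFrame([("abc", {"key": "value"})], ["s", "d"])
>>> df2.select(df2.d["key"]).show()  # Column.__getitem__ on a map column
+------+
|d[key]|
+------+
| value|
+------+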
### Does this PR introduce _any_ user-facing change?
Yes, new documentation.
### How was this patch tested?
Added doctests.
Closes #40331 from zhengruifeng/py_doc.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit e28f7f38e10cf081e4e04760d7f47045973f66ac)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../docs/source/reference/pyspark.sql/column.rst | 2 +
.../source/reference/pyspark.sql/dataframe.rst | 2 +
python/pyspark/sql/column.py | 59 ++++++++++++++++++++++
python/pyspark/sql/dataframe.py | 33 ++++++++++++
4 files changed, 96 insertions(+)
diff --git a/python/docs/source/reference/pyspark.sql/column.rst b/python/docs/source/reference/pyspark.sql/column.rst
index b5f39d299c1..b897b5c00c4 100644
--- a/python/docs/source/reference/pyspark.sql/column.rst
+++ b/python/docs/source/reference/pyspark.sql/column.rst
@@ -24,6 +24,8 @@ Column
.. autosummary::
:toctree: api/
+ Column.__getattr__
+ Column.__getitem__
Column.alias
Column.asc
Column.asc_nulls_first
diff --git a/python/docs/source/reference/pyspark.sql/dataframe.rst b/python/docs/source/reference/pyspark.sql/dataframe.rst
index e647704158f..aa306ccc382 100644
--- a/python/docs/source/reference/pyspark.sql/dataframe.rst
+++ b/python/docs/source/reference/pyspark.sql/dataframe.rst
@@ -25,6 +25,8 @@ DataFrame
.. autosummary::
:toctree: api/
+ DataFrame.__getattr__
+ DataFrame.__getitem__
DataFrame.agg
DataFrame.alias
DataFrame.approxQuantile
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index abd28136895..9f9ca0abb7a 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -639,11 +639,70 @@ class Column:
return Column(jc)
def __getattr__(self, item: Any) -> "Column":
+ """
+ An expression that gets an item by key out of a dict, or a field by name
+ out of a struct. Equivalent to ``self[item]``.
+
+ .. versionadded:: 1.3.0
+
+ .. versionchanged:: 3.4.0
+ Support Spark Connect.
+
+ Parameters
+ ----------
+ item
+ a literal value.
+
+ Returns
+ -------
+ :class:`Column`
+ Column representing the value retrieved by the given key or field name.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
+ >>> df.select(df.d.key).show()
+ +------+
+ |d[key]|
+ +------+
+ | value|
+ +------+
+ """
if item.startswith("__"):
raise AttributeError(item)
return self[item]
def __getitem__(self, k: Any) -> "Column":
+ """
+ An expression that gets an item at a given position out of a list,
+ gets an item by key out of a dict, or returns a substring of a string
+ column when ``k`` is a :class:`slice`.
+
+ .. versionadded:: 1.3.0
+
+ .. versionchanged:: 3.4.0
+ Support Spark Connect.
+
+ Parameters
+ ----------
+ k
+ a literal value, or a slice object without step.
+
+ Returns
+ -------
+ :class:`Column`
+ Column representing the item retrieved by position or key, or the substring
+ selected by the given slice object.
+
+ Examples
+ --------
+ >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
+ >>> df.select(df.l[slice(1, 3)], df.d['key']).show()
+ +------------------+------+
+ |substring(l, 1, 3)|d[key]|
+ +------------------+------+
+ | abc| value|
+ +------------------+------+
+ """
if isinstance(k, slice):
if k.step is not None:
raise ValueError("slice with step is not supported.")
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index a6357a7c137..36547dc64c7 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2847,6 +2847,28 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
.. versionadded:: 1.3.0
+ .. versionchanged:: 3.4.0
+ Support Spark Connect.
+
+ Parameters
+ ----------
+ item : int, str, :class:`Column`, list or tuple
+ column index, column name, column, or a list or tuple of columns
+
+ Returns
+ -------
+ :class:`Column` or :class:`DataFrame`
+ a specified column, or a filtered or projected dataframe.
+
+ * If the input `item` is an int or str, the output is a :class:`Column`.
+
+ * If the input `item` is a :class:`Column`, the output is a :class:`DataFrame`
+ filtered by this given :class:`Column`.
+
+ * If the input `item` is a list or tuple, the output is a :class:`DataFrame`
+ projected by this given list or tuple.
+
+
Examples
--------
>>> df = spark.createDataFrame([
@@ -2862,6 +2884,14 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
| 5|
+---+
+ >>> df.select(df[1]).show()
+ +-----+
+ | name|
+ +-----+
+ |Alice|
+ | Bob|
+ +-----+
+
Select multiple string columns as index.
>>> df[["name", "age"]].show()
@@ -2905,6 +2935,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
.. versionadded:: 1.3.0
+ .. versionchanged:: 3.4.0
+ Support Spark Connect.
+
Parameters
----------
name : str
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org