Posted to commits@spark.apache.org by gu...@apache.org on 2023/02/20 14:24:14 UTC

[spark] branch master updated: [SPARK-42476][CONNECT][DOCS] Complete Spark Connect API reference

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e6c201df33b [SPARK-42476][CONNECT][DOCS] Complete Spark Connect API reference
e6c201df33b is described below

commit e6c201df33b123c3bfc632012abeaa0db6c417bc
Author: itholic <ha...@databricks.com>
AuthorDate: Mon Feb 20 23:23:17 2023 +0900

    [SPARK-42476][CONNECT][DOCS] Complete Spark Connect API reference
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to complete the missing API reference entries for Spark Connect.
    
    Once built, the API docs include a "Changed in version" note for each API that Spark Connect implements, as shown below:
    
    <img width="814" alt="Screen Shot 2023-02-20 at 9 49 09 AM" src="https://user-images.githubusercontent.com/44108233/219986313-374e0959-b8c5-44f6-942c-bba1c0407909.png">
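
    For reference, the note is produced by a Sphinx `.. versionchanged::` directive added to each Python docstring, placed right after the existing `.. versionadded::` directive. This is the same pattern the diff below applies throughout; a minimal sketch (`some_function` is hypothetical, only the directive placement matters):

    ```python
    def some_function(col):
        """
        One-line summary of a hypothetical API.

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Support Spark Connect.

        Parameters
        ----------
        col : :class:`~pyspark.sql.Column` or str
            Target column.
        """
    ```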
    
    ### Why are the changes needed?
    
    Improving usability for Spark Connect.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, this is a documentation-only change.
    
    ### How was this patch tested?
    
    Manually built the docs and confirmed each function and class one by one.
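
    One way to spot-check the result without rebuilding the docs is to inspect the docstrings directly. A minimal sketch, assuming a PySpark build that already includes this change:

    ```python
    # Confirm the Spark Connect note landed in a docstring this patch
    # touches (months_between gains the directive in the diff below).
    from pyspark.sql.functions import months_between

    assert ".. versionchanged:: 3.4.0" in (months_between.__doc__ or "")
    print("months_between documents Spark Connect support")
    ```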
    
    Closes #40067 from itholic/SPARK-42476.
    
    Lead-authored-by: itholic <ha...@databricks.com>
    Co-authored-by: Haejoon Lee <44...@users.noreply.github.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/ml/feature.py         |  2 +-
 python/pyspark/sql/column.py         | 27 ++++++++++++++++++++++++---
 python/pyspark/sql/connect/client.py | 30 +++++++++++++++++++++++++++---
 python/pyspark/sql/connect/column.py | 12 ++++++++++++
 python/pyspark/sql/connect/udf.py    |  2 ++
 python/pyspark/sql/dataframe.py      | 27 +++++++++++++++++++++++++++
 python/pyspark/sql/functions.py      | 33 +++++++++++++++++++++++++++++++++
 python/pyspark/sql/group.py          |  3 +++
 python/pyspark/sql/session.py        |  8 ++++++++
 9 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 43e658d7f69..ff7aaf71f9c 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -3476,7 +3476,7 @@ class QuantileDiscretizer(
     non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4].
 
     Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for
-    :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description).
+    :py:meth:`pyspark.sql.DataFrameStatFunctions.approxQuantile` for a detailed description).
     The precision of the approximation can be controlled with the
     :py:attr:`relativeError` parameter.
     The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 0b5f94cfaaa..bcf6676d5ca 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -669,14 +669,14 @@ class Column:
     _startswith_doc = """
     String starts with. Returns a boolean :class:`Column` based on a string match.
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     other : :class:`Column` or str
         string at start of line (do not use a regex `^`)
 
-    .. versionchanged:: 3.4.0
-        Support Spark Connect.
-
     Examples
     --------
     >>> df = spark.createDataFrame(
@@ -903,6 +903,9 @@ class Column:
     _asc_doc = """
     Returns a sort expression based on the ascending order of the column.
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -916,6 +919,9 @@ class Column:
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -930,6 +936,9 @@ class Column:
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -943,6 +952,9 @@ class Column:
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -956,6 +968,9 @@ class Column:
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -970,6 +985,9 @@ class Column:
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> from pyspark.sql import Row
@@ -1128,6 +1146,9 @@ class Column:
 
         .. versionadded:: 1.3.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
diff --git a/python/pyspark/sql/connect/client.py b/python/pyspark/sql/connect/client.py
index 78190b2c488..154dd161e92 100644
--- a/python/pyspark/sql/connect/client.py
+++ b/python/pyspark/sql/connect/client.py
@@ -403,7 +403,11 @@ class AnalyzeResult:
 
 
 class SparkConnectClient(object):
-    """Conceptually the remote spark session that communicates with the server"""
+    """
+    Conceptually the remote spark session that communicates with the server
+
+    .. versionadded:: 3.4.0
+    """
 
     @classmethod
     def retry_exception(cls, e: grpc.RpcError) -> bool:
@@ -465,8 +469,10 @@ class SparkConnectClient(object):
         eval_type: int = PythonEvalType.SQL_BATCHED_UDF,
         deterministic: bool = True,
     ) -> str:
-        """Create a temporary UDF in the session catalog on the other side. We generate a
-        temporary name for it."""
+        """
+        Create a temporary UDF in the session catalog on the other side. We generate a
+        temporary name for it.
+        """
 
         if name is None:
             name = f"fun_{uuid.uuid4().hex}"
@@ -509,6 +515,9 @@ class SparkConnectClient(object):
         ]
 
     def to_table(self, plan: pb2.Plan) -> "pa.Table":
+        """
+        Return given plan as a PyArrow Table.
+        """
         logger.info(f"Executing plan {self._proto_to_string(plan)}")
         req = self._execute_plan_request_with_metadata()
         req.plan.CopyFrom(plan)
@@ -516,6 +525,9 @@ class SparkConnectClient(object):
         return table
 
     def to_pandas(self, plan: pb2.Plan) -> "pd.DataFrame":
+        """
+        Return given plan as a pandas DataFrame.
+        """
         logger.info(f"Executing plan {self._proto_to_string(plan)}")
         req = self._execute_plan_request_with_metadata()
         req.plan.CopyFrom(plan)
@@ -543,6 +555,9 @@ class SparkConnectClient(object):
         return text_format.MessageToString(p, as_one_line=True)
 
     def schema(self, plan: pb2.Plan) -> StructType:
+        """
+        Return schema for given plan.
+        """
         logger.info(f"Schema for plan: {self._proto_to_string(plan)}")
         proto_schema = self._analyze(plan).schema
         # Server side should populate the struct field which is the schema.
@@ -565,11 +580,17 @@ class SparkConnectClient(object):
         return StructType(fields)
 
     def explain_string(self, plan: pb2.Plan, explain_mode: str = "extended") -> str:
+        """
+        Return explain string for given plan.
+        """
         logger.info(f"Explain (mode={explain_mode}) for plan {self._proto_to_string(plan)}")
         result = self._analyze(plan, explain_mode)
         return result.explain_string
 
     def execute_command(self, command: pb2.Command) -> None:
+        """
+        Execute given command.
+        """
         logger.info(f"Execute command for command {self._proto_to_string(command)}")
         req = self._execute_plan_request_with_metadata()
         if self._user_id:
@@ -579,6 +600,9 @@ class SparkConnectClient(object):
         return
 
     def close(self) -> None:
+        """
+        Close the channel.
+        """
         self._channel.close()
 
     def _execute_plan_request_with_metadata(self) -> pb2.ExecutePlanRequest:
diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
index f5c82336bb7..a172b884f69 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -272,21 +272,33 @@ class Column:
     def asc(self) -> "Column":
         return self.asc_nulls_first()
 
+    asc.__doc__ = PySparkColumn.asc.__doc__
+
     def asc_nulls_first(self) -> "Column":
         return Column(SortOrder(self._expr, ascending=True, nullsFirst=True))
 
+    asc_nulls_first.__doc__ = PySparkColumn.asc_nulls_first.__doc__
+
     def asc_nulls_last(self) -> "Column":
         return Column(SortOrder(self._expr, ascending=True, nullsFirst=False))
 
+    asc_nulls_last.__doc__ = PySparkColumn.asc_nulls_last.__doc__
+
     def desc(self) -> "Column":
         return self.desc_nulls_last()
 
+    desc.__doc__ = PySparkColumn.desc.__doc__
+
     def desc_nulls_first(self) -> "Column":
         return Column(SortOrder(self._expr, ascending=False, nullsFirst=True))
 
+    desc_nulls_first.__doc__ = PySparkColumn.desc_nulls_first.__doc__
+
     def desc_nulls_last(self) -> "Column":
         return Column(SortOrder(self._expr, ascending=False, nullsFirst=False))
 
+    desc_nulls_last.__doc__ = PySparkColumn.desc_nulls_last.__doc__
+
     def cast(self, dataType: Union[DataType, str]) -> "Column":
         if isinstance(dataType, (DataType, str)):
             return Column(CastExpression(expr=self._expr, data_type=dataType))
diff --git a/python/pyspark/sql/connect/udf.py b/python/pyspark/sql/connect/udf.py
index a60f9e516c7..bfe7006d161 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -172,6 +172,8 @@ class UserDefinedFunction:
     def asNondeterministic(self) -> "UserDefinedFunction":
         """
         Updates UserDefinedFunction to nondeterministic.
+
+        .. versionadded:: 3.4.0
         """
         self.deterministic = False
         return self
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index d9de9ee14ac..fa25d148060 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1681,6 +1681,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 2.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         numPartitions : int
@@ -1874,6 +1877,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.5.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         col : :class:`Column` or str
@@ -3191,6 +3197,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         cols : list, str or :class:`Column`
@@ -3237,6 +3246,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         cols : list, str or :class:`Column`
@@ -4328,6 +4340,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 2.0.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         col: str, tuple or list
@@ -4427,6 +4442,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         col1 : str
@@ -4477,6 +4495,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         col1 : str
@@ -4575,6 +4596,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 1.4.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         cols : list or tuple
@@ -4806,6 +4830,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
 
         .. versionadded:: 3.3.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         columnName : str
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d296075fb0b..9a760551cac 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -4907,6 +4907,9 @@ def months_between(date1: "ColumnOrName", date2: "ColumnOrName", roundOff: bool
 
     .. versionadded:: 1.5.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     date1 : :class:`~pyspark.sql.Column` or str
@@ -5424,6 +5427,9 @@ def window(
 
     .. versionadded:: 2.0.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     timeColumn : :class:`~pyspark.sql.Column`
@@ -5500,6 +5506,9 @@ def window_time(
 
     .. versionadded:: 3.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     windowColumn : :class:`~pyspark.sql.Column`
@@ -5555,6 +5564,9 @@ def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str])
 
     .. versionadded:: 3.2.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     timeColumn : :class:`~pyspark.sql.Column` or str
@@ -6299,6 +6311,9 @@ def format_number(col: "ColumnOrName", d: int) -> Column:
 
     .. versionadded:: 1.5.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -7826,6 +7841,9 @@ def array_compact(col: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -9083,6 +9101,9 @@ def sequence(
 
     .. versionadded:: 2.4.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     start : :class:`~pyspark.sql.Column` or str
@@ -9816,6 +9837,9 @@ def years(col: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.1.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -9887,6 +9911,9 @@ def days(col: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.1.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -9921,6 +9948,9 @@ def hours(col: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.1.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -9955,6 +9985,9 @@ def bucket(numBuckets: Union[Column, int], col: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.1.0
 
+    .. versionchanged:: 3.4.0
+        Support Spark Connect.
+
     Examples
     --------
     >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 3df32b32fd2..737e9ea5880 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -430,6 +430,9 @@ class GroupedData(PandasGroupedOpsMixin):
 
         .. versionadded:: 1.6.0
 
+        .. versionchanged:: 3.4.0
+            Support Spark Connect.
+
         Parameters
         ----------
         pivot_col : str
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 7019210a4d8..de2eb4970f7 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -218,6 +218,9 @@ class SparkSession(SparkConversionMixin):
 
             .. versionadded:: 2.0.0
 
+            .. versionchanged:: 3.4.0
+                Support Spark Connect.
+
             Parameters
             ----------
             key : str, optional
@@ -348,6 +351,9 @@ class SparkSession(SparkConversionMixin):
 
             .. versionadded:: 2.0.0
 
+            .. versionchanged:: 3.4.0
+                Support Spark Connect.
+
             Parameters
             ----------
             name : str
@@ -387,6 +393,8 @@ class SparkSession(SparkConversionMixin):
 
             .. versionadded:: 2.0.0
 
+            .. versionchanged:: 3.4.0
+                Support Spark Connect.
 
             Returns
             -------

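A note on the python/pyspark/sql/connect/column.py hunk above: instead of duplicating documentation, the Spark Connect Column assigns the canonical PySpark docstrings to its own methods, so Sphinx renders the same reference entry for both implementations. A reduced sketch of the pattern, with illustrative class names rather than the real ones:

```python
class CanonicalColumn:
    def asc(self):
        """Returns a sort expression based on the ascending order of the column."""


class ConnectColumn:
    def asc(self):
        # Simplified; the real method builds a SortOrder expression.
        return self

    # Reuse the canonical docstring so both implementations document
    # identically in the generated API reference.
    asc.__doc__ = CanonicalColumn.asc.__doc__


print(ConnectColumn.asc.__doc__)  # identical to CanonicalColumn.asc.__doc__
```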

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org