Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2020/10/30 01:51:28 UTC

[GitHub] [spark] viirya commented on a change in pull request #30181: [SPARK-33250][PYTHON][DOCS] Migration to NumPy documentation style in SQL (pyspark.sql.*)

viirya commented on a change in pull request #30181:
URL: https://github.com/apache/spark/pull/30181#discussion_r514662903



##########
File path: python/pyspark/sql/functions.py
##########
@@ -199,26 +202,39 @@ def sumDistinct(col):
     return _invoke_function_over_column("sumDistinct", col)
 
 
-@since(1.4)
 def acos(col):
     """
-    :return: inverse cosine of `col`, as if computed by `java.lang.Math.acos()`
+    .. versionadded:: 1.4.0
+
+    Returns
+    -------
+    :class:`Column`
+        inverse cosine of `col`, as if computed by `java.lang.Math.acos()`
     """
     return _invoke_function_over_column("acos", col)
 
 
-@since(1.4)
 def asin(col):
     """
-    :return: inverse sine of `col`, as if computed by `java.lang.Math.asin()`
+    .. versionadded:: 1.3.0

Review comment:
       1.4.0?
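
For illustration only, a quick usage sketch of the two functions documented in this hunk (not part of the diff; assumes a local SparkSession):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    # Both return a Column; the math follows java.lang.Math.acos()/asin().
    df = spark.range(1).select(F.acos(F.lit(0.5)).alias("acos"),
                               F.asin(F.lit(0.5)).alias("asin"))
    df.show()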

##########
File path: python/pyspark/sql/column.py
##########
@@ -219,11 +226,11 @@ def __init__(self, jc):
     |            true|          false|           false|
     +----------------+---------------+----------------+
 
-    .. note:: Unlike Pandas, PySpark doesn't consider NaN values to be NULL.
-       See the `NaN Semantics`_ for details.
-    .. _NaN Semantics:
-       https://spark.apache.org/docs/latest/sql-programming-guide.html#nan-semantics
-    .. versionadded:: 2.3.0

Review comment:
       `versionadded:: 2.3.0` is just removed?

##########
File path: python/pyspark/sql/session.py
##########
@@ -497,23 +557,39 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr
         If schema inference is needed, ``samplingRatio`` is used to determined the ratio of
         rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.
 
-        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
-            etc.), :class:`list`, or :class:`pandas.DataFrame`.
-        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
-            column names, default is ``None``.  The data type string format equals to
-            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
-            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
-            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
-            ``int`` as a short name for ``IntegerType``.
-        :param samplingRatio: the sample ratio of rows used for inferring
-        :param verifySchema: verify data types of every row against schema.
-        :return: :class:`DataFrame`
+        .. versionadded:: 2.0

Review comment:
       2.0.0?
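
The removed ``schema`` text is what explained the datatype-string shorthand; a small usage sketch of that format (column names ``id``/``value`` are placeholders, and an active session is assumed):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # Schema given as a datatype string instead of a StructType;
    # "int" and "string" are the short type names the old text described.
    df = spark.createDataFrame([(1, "row1"), (2, "row2")], "id: int, value: string")
    df.printSchema()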

##########
File path: python/pyspark/sql/session.py
##########
@@ -641,38 +727,51 @@ def table(self, tableName):
         return DataFrame(self._jsparkSession.table(tableName), self._wrapped)
 
     @property
-    @since(2.0)
     def read(self):
         """
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        :return: :class:`DataFrameReader`
+        .. versionadded:: 2.0
+
+        Returns
+        -------
+        :class:`DataFrameReader`
         """
         return DataFrameReader(self._wrapped)
 
     @property
-    @since(2.0)
     def readStream(self):
         """
         Returns a :class:`DataStreamReader` that can be used to read data streams
         as a streaming :class:`DataFrame`.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0
+
+        Notes
+        -----
+        This API is evolving.
 
-        :return: :class:`DataStreamReader`
+        Returns
+        -------
+        :class:`DataStreamReader`
         """
         return DataStreamReader(self._wrapped)
 
     @property
-    @since(2.0)
     def streams(self):
         """Returns a :class:`StreamingQueryManager` that allows managing all the
         :class:`StreamingQuery` instances active on `this` context.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -790,9 +940,11 @@ class DataStreamWriter(object):
     Use :attr:`DataFrame.writeStream <pyspark.sql.DataFrame.writeStream>`
     to access this.
 
-    .. note:: Evolving.
-
     .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -804,10 +956,11 @@ def _sq(self, jsq):
         from pyspark.sql.streaming import StreamingQuery
         return StreamingQuery(jsq)
 
-    @since(2.0)
     def outputMode(self, outputMode):
         """Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink.
 
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -922,22 +1106,32 @@ def queryName(self, queryName):
         return self
 
     @keyword_only
-    @since(2.0)
     def trigger(self, *, processingTime=None, once=None, continuous=None):
         """Set the trigger for the stream query. If this is not set it will run the query as fast
         as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``.
 
-        .. note:: Evolving.
-
-        :param processingTime: a processing time interval as a string, e.g. '5 seconds', '1 minute'.
-                               Set a trigger that runs a microbatch query periodically based on the
-                               processing time. Only one trigger can be set.
-        :param once: if set to True, set a trigger that processes only one batch of data in a
-                     streaming query then terminates the query. Only one trigger can be set.
-        :param continuous: a time interval as a string, e.g. '5 seconds', '1 minute'.
-                           Set a trigger that runs a continuous query with a given checkpoint
-                           interval. Only one trigger can be set.
-
+        .. versionadded:: 2.0

Review comment:
       2.0.0?
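
Since the removed parameter text is dense, a minimal sketch of how the three trigger modes are used may help (built-in ``rate`` source used purely as a stand-in; only one trigger can be set per query):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.readStream.format("rate").load()   # simple built-in streaming source
    # A micro-batch trigger firing every 5 seconds; `once=True` or
    # `continuous="1 second"` would be the mutually exclusive alternatives.
    query = (sdf.writeStream
                .format("console")
                .trigger(processingTime="5 seconds")
                .start())
    query.stop()   # stop immediately; a real job would awaitTermination()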

##########
File path: python/pyspark/sql/functions.py
##########
@@ -633,20 +733,28 @@ def percent_rank():
 @since(1.3)

Review comment:
       this?

##########
File path: python/pyspark/sql/session.py
##########
@@ -42,10 +42,12 @@ def toDF(self, schema=None, sampleRatio=None):
 
         This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``
 
-        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
-        :param sampleRatio: the sample ratio of rows used for inferring
-        :return: a DataFrame

Review comment:
       These parameters are just removed?
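
For reference, a small sketch of the shorthand being documented here (the removed text described ``schema`` as a list of column names and ``sampleRatio`` as the inference sample ratio; a running session is assumed):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    rdd = spark.sparkContext.parallelize([("Alice", 1), ("Bob", 2)])
    # Equivalent to spark.createDataFrame(rdd, ["name", "age"])
    df = rdd.toDF(["name", "age"])
    df.show()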

##########
File path: python/pyspark/sql/session.py
##########
@@ -614,25 +690,35 @@ def prepare(obj):
         df._schema = schema
         return df
 
-    @since(2.0)
     def sql(self, sqlQuery):
         """Returns a :class:`DataFrame` representing the result of the given query.
 
-        :return: :class:`DataFrame`
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/pandas/serializers.py
##########
@@ -130,8 +135,15 @@ def _create_batch(self, series):
         Create an Arrow record batch from the given pandas.Series or list of Series,
         with optional type.
 
-        :param series: A single pandas.Series, list of Series, or list of (series, arrow_type)
-        :return: Arrow RecordBatch
+        Parameters
+        ----------
+        series : pandas.Series
+            A single pandas.Series, list of Series, or list of (series, arrow_type)

Review comment:
       `pandas.Series`?
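
Independent of Spark's private ``_create_batch`` helper, the conversion it describes can be sketched directly with pyarrow (the column name ``value`` is a hypothetical placeholder):

    import pandas as pd
    import pyarrow as pa

    series = pd.Series([1.0, 2.0, 3.0])
    # One Arrow array per pandas.Series, then a record batch over the arrays.
    arr = pa.Array.from_pandas(series, type=pa.float64())
    batch = pa.RecordBatch.from_arrays([arr], names=["value"])
    print(batch.num_rows, batch.schema)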

##########
File path: python/pyspark/sql/functions.py
##########
@@ -372,7 +436,8 @@ def toDegrees(col):
 @since(1.4)

Review comment:
       ditto

##########
File path: python/pyspark/sql/session.py
##########
@@ -614,25 +690,35 @@ def prepare(obj):
         df._schema = schema
         return df
 
-    @since(2.0)
     def sql(self, sqlQuery):
         """Returns a :class:`DataFrame` representing the result of the given query.
 
-        :return: :class:`DataFrame`
+        .. versionadded:: 2.0
+
+        Returns
+        -------
+        :class:`DataFrame`
 
+        Examples
+        --------
         >>> df.createOrReplaceTempView("table1")
         >>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1")
         >>> df2.collect()
         [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')]
         """
         return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
 
-    @since(2.0)
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
-        :return: :class:`DataFrame`
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/functions.py
##########
@@ -322,48 +354,80 @@ def signum(col):
     return _invoke_function_over_column("signum", col)
 
 
-@since(1.4)
 def sin(col):
     """
-    :param col: angle in radians
-    :return: sine of the angle, as if computed by `java.lang.Math.sin()`
+    .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`Column` or str
+
+    Returns
+    -------
+    :class:`Column`
+        sine of the angle, as if computed by `java.lang.Math.sin()`
     """
     return _invoke_function_over_column("sin", col)
 
 
-@since(1.4)
 def sinh(col):
     """
-    :param col: hyperbolic angle
-    :return: hyperbolic sine of the given value,
-             as if computed by `java.lang.Math.sinh()`
+    .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`Column` or str
+        hyperbolic angle
+
+    Returns
+    -------
+    :class:`Column`
+        hyperbolic sine of the given value,
+        as if computed by `java.lang.Math.sinh()`
     """
     return _invoke_function_over_column("sinh", col)
 
 
-@since(1.4)
 def tan(col):
     """
-    :param col: angle in radians
-    :return: tangent of the given value, as if computed by `java.lang.Math.tan()`
+    .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`Column` or str
+        angle in radians
+
+    Returns
+    -------
+    :class:`Column`
+        tangent of the given value, as if computed by `java.lang.Math.tan()`
     """
     return _invoke_function_over_column("tan", col)
 
 
-@since(1.4)
 def tanh(col):
     """
-    :param col: hyperbolic angle
-    :return: hyperbolic tangent of the given value
-             as if computed by `java.lang.Math.tanh()`
+    .. versionadded:: 1.4.0
+
+    Parameters
+    ----------
+    col : :class:`Column` or str
+        hyperbolic angle
+
+    Returns
+    -------
+    :class:`Column`
+        hyperbolic tangent of the given value
+        as if computed by `java.lang.Math.tanh()`
     """
     return _invoke_function_over_column("tanh", col)
 
 
 @since(1.4)

Review comment:
       Not to change to `.. versionadded::`?
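
For readers unfamiliar with the two conventions this PR swaps: ``@since`` appends the version note to ``__doc__`` at import time, while the numpydoc migration writes the ``.. versionadded::`` directive inline. A minimal before/after sketch with a placeholder function (not taken from the diff):

    from pyspark import since

    # Old style: the decorator appends the version note to __doc__ at import time.
    @since(1.4)
    def old_style(col):
        """Placeholder helper; returns `col` unchanged."""
        return col

    # New numpydoc style: the directive is spelled out in the docstring itself.
    def new_style(col):
        """Placeholder helper; returns `col` unchanged.

        .. versionadded:: 1.4.0
        """
        return col

    print(old_style.__doc__)   # ends with ".. versionadded:: 1.4"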

##########
File path: python/pyspark/sql/dataframe.py
##########
@@ -1308,24 +1546,37 @@ def summary(self, *statistics):
         |  count|  2|   2|
         +-------+---+----+
 
-        See also describe for basic statistics.
+        See Also
+        --------
+        DataFrame.display
         """
         if len(statistics) == 1 and isinstance(statistics[0], list):
             statistics = statistics[0]
         jdf = self._jdf.summary(self._jseq(statistics))
         return DataFrame(jdf, self.sql_ctx)
 
-    @since(1.3)
     def head(self, n=None):
         """Returns the first ``n`` rows.
 
-        .. note:: This method should only be used if the resulting array is expected
-            to be small, as all the data is loaded into the driver's memory.
+        .. versionadded:: 1.3.0
+
+        Notes
+        -----
+        This method should only be used if the resulting array is expected
+        to be small, as all the data is loaded into the driver's memory.
 
-        :param n: int, default 1. Number of rows to return.
-        :return: If n is greater than 1, return a list of :class:`Row`.
-            If n is 1, return a single Row.
+        Parameters
+        ----------
+        n : int, optional
+            default 1. Number of rows to return.
 
+        Returns
+        -------
+        If n is greater than 1, return a list of :class:`Row`.
+        If n is 1, return a single Row.

Review comment:
       :class:`Row`?
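
A short usage sketch of the return-type difference being documented (single :class:`Row` vs. a list of them), using throwaway sample data:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    print(df.head())    # Row(age=2, name='Alice')                           -- a single Row
    print(df.head(2))   # [Row(age=2, name='Alice'), Row(age=5, name='Bob')] -- a list of Row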

##########
File path: python/pyspark/sql/dataframe.py
##########
@@ -935,19 +1090,32 @@ def sample(self, withReplacement=None, fraction=None, seed=None):
         jdf = self._jdf.sample(*args)
         return DataFrame(jdf, self.sql_ctx)
 
-    @since(1.5)
     def sampleBy(self, col, fractions, seed=None):
         """
         Returns a stratified sample without replacement based on the
         fraction given on each stratum.
 
-        :param col: column that defines strata
-        :param fractions:
+        .. versionadded:: 1.5.0
+
+        Parameters
+        ----------
+        col : :class:`Column` or str
+            column that defines strata
+
+            .. versionchanged:: 3.0
+               Added sampling by a column of :class:`Column`

Review comment:
       Is this `versionchanged` put among parameters intentionally?
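
For context, a usage sketch of what the nested ``versionchanged`` refers to: since 3.0 the strata column can be passed as a :class:`Column` as well as a name (sample data is illustrative):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(0, 100).withColumn("key", col("id") % 3)
    # Column name (pre-3.0 form) and Column object (allowed since 3.0).
    sampled_by_name = df.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
    sampled_by_col = df.sampleBy(col("key"), fractions={0: 0.1, 1: 0.2}, seed=0)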

##########
File path: python/pyspark/sql/dataframe.py
##########
@@ -1782,20 +2095,27 @@ def replace(self, to_replace, value=_NoValue, subset=None):
         floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)
         and arbitrary replacement will be used.
 
-        :param to_replace: bool, int, float, string, list or dict.
+        .. versionadded:: 1.4.0
+
+        Parameters
+        ----------
+        to_replace : bool, int, float, string, list or dict
             Value to be replaced.
             If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`
             must be a mapping between a value and a replacement.
-        :param value: bool, int, float, string, list or None.
+        value : bool, int, float, string or None, optional

Review comment:
       `list` is removed?
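
The list form the comment refers to is the one-to-one list replacement; a small sketch with placeholder data:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("Alice", 10), ("Bob", 20)], ["name", "age"])
    # `to_replace` and `value` as equal-length lists: Alice -> A, Bob -> B.
    df2 = df.replace(["Alice", "Bob"], ["A", "B"], "name")
    df2.show()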

##########
File path: python/pyspark/sql/session.py
##########
@@ -641,38 +727,51 @@ def table(self, tableName):
         return DataFrame(self._jsparkSession.table(tableName), self._wrapped)
 
     @property
-    @since(2.0)
     def read(self):
         """
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        :return: :class:`DataFrameReader`
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/session.py
##########
@@ -641,38 +727,51 @@ def table(self, tableName):
         return DataFrame(self._jsparkSession.table(tableName), self._wrapped)
 
     @property
-    @since(2.0)
     def read(self):
         """
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        :return: :class:`DataFrameReader`
+        .. versionadded:: 2.0
+
+        Returns
+        -------
+        :class:`DataFrameReader`
         """
         return DataFrameReader(self._wrapped)
 
     @property
-    @since(2.0)
     def readStream(self):
         """
         Returns a :class:`DataStreamReader` that can be used to read data streams
         as a streaming :class:`DataFrame`.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -34,9 +34,11 @@ class StreamingQuery(object):
     A handle to a query that is executing continuously in the background as new data arrives.
     All these methods are thread-safe.
 
-    .. note:: Evolving
-
     .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -191,19 +209,24 @@ def exception(self):
 class StreamingQueryManager(object):
     """A class to manage all the :class:`StreamingQuery` StreamingQueries active.
 
-    .. note:: Evolving
-
     .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -881,39 +1048,56 @@ def options(self, **options):
                 ambiguous. If it isn't set, the current value of the SQL config
                 ``spark.sql.session.timeZone`` is used by default.
 
-       .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -858,12 +1022,15 @@ def option(self, key, value):
                 ambiguous. If it isn't set, the current value of the SQL config
                 ``spark.sql.session.timeZone`` is used by default.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -881,39 +1048,56 @@ def options(self, **options):
                 ambiguous. If it isn't set, the current value of the SQL config
                 ``spark.sql.session.timeZone`` is used by default.
 
-       .. note:: Evolving.
+        .. versionadded:: 2.0
+
+        Notes
+        -----
+        This API is evolving.
         """
         for k in options:
             self._jwrite = self._jwrite.option(k, to_str(options[k]))
         return self
 
-    @since(2.0)
     def partitionBy(self, *cols):
         """Partitions the output by the given columns on the file system.
 
         If specified, the output is laid out on the file system similar
         to Hive's partitioning scheme.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -654,111 +768,147 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         ``inferSchema`` is enabled. To avoid going through the entire data once, disable
         ``inferSchema`` option or specify the schema explicitly using ``schema``.
 
-        .. note:: Evolving.
-
-        :param path: string, or list of strings, for input path(s).
-        :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema
-                       or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
-        :param sep: sets a separator (one or more characters) for each field and value. If None is
-                    set, it uses the default value, ``,``.
-        :param encoding: decodes the CSV files by the given encoding type. If None is set,
-                         it uses the default value, ``UTF-8``.
-        :param quote: sets a single character used for escaping quoted values where the
-                      separator can be part of the value. If None is set, it uses the default
-                      value, ``"``. If you would like to turn off quotations, you need to set an
-                      empty string.
-        :param escape: sets a single character used for escaping quotes inside an already
-                       quoted value. If None is set, it uses the default value, ``\``.
-        :param comment: sets a single character used for skipping lines beginning with this
-                        character. By default (None), it is disabled.
-        :param header: uses the first line as names of columns. If None is set, it uses the
-                       default value, ``false``.
-        :param inferSchema: infers the input schema automatically from data. It requires one extra
-                       pass over the data. If None is set, it uses the default value, ``false``.
-        :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be
-                              forcibly applied to datasource files, and headers in CSV files will be
-                              ignored. If the option is set to ``false``, the schema will be
-                              validated against all headers in CSV files or the first header in RDD
-                              if the ``header`` option is set to ``true``. Field names in the schema
-                              and column names in CSV headers are checked by their positions
-                              taking into account ``spark.sql.caseSensitive``. If None is set,
-                              ``true`` is used by default. Though the default value is ``true``,
-                              it is recommended to disable the ``enforceSchema`` option
-                              to avoid incorrect results.
-        :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
-                                        values being read should be skipped. If None is set, it
-                                        uses the default value, ``false``.
-        :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from
-                                         values being read should be skipped. If None is set, it
-                                         uses the default value, ``false``.
-        :param nullValue: sets the string representation of a null value. If None is set, it uses
-                          the default value, empty string. Since 2.0.1, this ``nullValue`` param
-                          applies to all supported types including the string type.
-        :param nanValue: sets the string representation of a non-number value. If None is set, it
-                         uses the default value, ``NaN``.
-        :param positiveInf: sets the string representation of a positive infinity value. If None
-                            is set, it uses the default value, ``Inf``.
-        :param negativeInf: sets the string representation of a negative infinity value. If None
-                            is set, it uses the default value, ``Inf``.
-        :param dateFormat: sets the string that indicates a date format. Custom date formats
-                           follow the formats at `datetime pattern`_.
-                           This applies to date type. If None is set, it uses the
-                           default value, ``yyyy-MM-dd``.
-        :param timestampFormat: sets the string that indicates a timestamp format.
-                                Custom date formats follow the formats at `datetime pattern`_.
-                                This applies to timestamp type. If None is set, it uses the
-                                default value, ``yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]``.
-        :param maxColumns: defines a hard limit of how many columns a record can have. If None is
-                           set, it uses the default value, ``20480``.
-        :param maxCharsPerColumn: defines the maximum number of characters allowed for any given
-                                  value being read. If None is set, it uses the default value,
-                                  ``-1`` meaning unlimited length.
-        :param maxMalformedLogPerPartition: this parameter is no longer used since Spark 2.2.0.
-                                            If specified, it is ignored.
-        :param mode: allows a mode for dealing with corrupt records during parsing. If None is
-                     set, it uses the default value, ``PERMISSIVE``.
-
-                * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
-                  into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
-                  fields to ``null``. To keep corrupt records, an user can set a string type \
-                  field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
-                  schema does not have the field, it drops corrupt records during parsing. \
-                  A record with less/more tokens than schema is not a corrupted record to CSV. \
-                  When it meets a record having fewer tokens than the length of the schema, \
-                  sets ``null`` to extra fields. When the record has more tokens than the \
-                  length of the schema, it drops extra tokens.
-                * ``DROPMALFORMED``: ignores the whole corrupted records.
-                * ``FAILFAST``: throws an exception when it meets corrupted records.
-
-        :param columnNameOfCorruptRecord: allows renaming the new field having malformed string
-                                          created by ``PERMISSIVE`` mode. This overrides
-                                          ``spark.sql.columnNameOfCorruptRecord``. If None is set,
-                                          it uses the value specified in
-                                          ``spark.sql.columnNameOfCorruptRecord``.
-        :param multiLine: parse one record, which may span multiple lines. If None is
-                          set, it uses the default value, ``false``.
-        :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for
-                                          the quote character. If None is set, the default value is
-                                          escape character when escape and quote characters are
-                                          different, ``\0`` otherwise..
-        :param emptyValue: sets the string representation of an empty value. If None is set, it uses
-                           the default value, empty string.
-        :param locale: sets a locale as language tag in IETF BCP 47 format. If None is set,
-                       it uses the default value, ``en-US``. For instance, ``locale`` is used while
-                       parsing dates and timestamps.
-        :param lineSep: defines the line separator that should be used for parsing. If None is
-                        set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``.
-                        Maximum length is 1 character.
-        :param pathGlobFilter: an optional glob pattern to only include files with paths matching
-                               the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
-                               It does not change the behavior of `partition discovery`_.
-        :param recursiveFileLookup: recursively scan a directory for files. Using this option
-                                    disables `partition discovery`_.
-
-        .. _partition discovery:
-          https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery
-        .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
-
+        Parameters
+        ----------
+        path : str or list
+            string, or list of strings, for input path(s).
+        schema : :class:`pyspark.sql.types.StructType` or str, optional
+            an optional :class:`pyspark.sql.types.StructType` for the input schema
+            or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+        sep : str, optional
+            sets a separator (one or more characters) for each field and value. If None is
+            set, it uses the default value, ``,``.
+        encoding : str, optional
+            decodes the CSV files by the given encoding type. If None is set,
+            it uses the default value, ``UTF-8``.
+        quote : str, optional sets a single character used for escaping quoted values where the
+            separator can be part of the value. If None is set, it uses the default
+            value, ``"``. If you would like to turn off quotations, you need to set an
+            empty string.
+        escape : str, optional
+            sets a single character used for escaping quotes inside an already
+            quoted value. If None is set, it uses the default value, ``\``.
+        comment : str, optional
+            sets a single character used for skipping lines beginning with this
+            character. By default (None), it is disabled.
+        header : str or bool, optional
+            uses the first line as names of columns. If None is set, it uses the
+            default value, ``false``.
+        inferSchema : str or bool, optional
+            infers the input schema automatically from data. It requires one extra
+            pass over the data. If None is set, it uses the default value, ``false``.
+        enforceSchema : str or bool, optional
+            If it is set to ``true``, the specified or inferred schema will be
+            forcibly applied to datasource files, and headers in CSV files will be
+            ignored. If the option is set to ``false``, the schema will be
+            validated against all headers in CSV files or the first header in RDD
+            if the ``header`` option is set to ``true``. Field names in the schema
+            and column names in CSV headers are checked by their positions
+            taking into account ``spark.sql.caseSensitive``. If None is set,
+            ``true`` is used by default. Though the default value is ``true``,
+            it is recommended to disable the ``enforceSchema`` option
+            to avoid incorrect results.
+        ignoreLeadingWhiteSpace : str or bool, optional
+            a flag indicating whether or not leading whitespaces from
+            values being read should be skipped. If None is set, it
+            uses the default value, ``false``.
+        ignoreTrailingWhiteSpace : str or bool, optional
+            a flag indicating whether or not trailing whitespaces from
+            values being read should be skipped. If None is set, it
+            uses the default value, ``false``.
+        nullValue : str, optional
+            sets the string representation of a null value. If None is set, it uses
+            the default value, empty string. Since 2.0.1, this ``nullValue`` param
+            applies to all supported types including the string type.
+        nanValue : str, optional
+            sets the string representation of a non-number value. If None is set, it
+            uses the default value, ``NaN``.
+        positiveInf : str, optional
+            sets the string representation of a positive infinity value. If None
+            is set, it uses the default value, ``Inf``.
+        negativeInf : str, optional
+            sets the string representation of a negative infinity value. If None
+            is set, it uses the default value, ``Inf``.
+        dateFormat : str, optional
+            sets the string that indicates a date format. Custom date formats
+            follow the formats at
+            `datetime pattern <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`_.  # noqa
+            This applies to date type. If None is set, it uses the
+            default value, ``yyyy-MM-dd``.
+        timestampFormat : str, optional
+            sets the string that indicates a timestamp format.
+            Custom date formats follow the formats at
+            `datetime pattern <https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html>`_.  # noqa
+            This applies to timestamp type. If None is set, it uses the
+            default value, ``yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]``.
+        maxColumns : str or int, optional
+            defines a hard limit of how many columns a record can have. If None is
+            set, it uses the default value, ``20480``.
+        maxCharsPerColumn : str or int, optional
+            defines the maximum number of characters allowed for any given
+            value being read. If None is set, it uses the default value,
+            ``-1`` meaning unlimited length.
+        maxMalformedLogPerPartition : str or int, optional
+            this parameter is no longer used since Spark 2.2.0.
+            If specified, it is ignored.
+        mode : str, optional
+            allows a mode for dealing with corrupt records during parsing. If None is
+            set, it uses the default value, ``PERMISSIVE``.
+
+            * ``PERMISSIVE``: when it meets a corrupted record, puts the malformed string \
+              into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \
+              fields to ``null``. To keep corrupt records, an user can set a string type \
+              field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \
+              schema does not have the field, it drops corrupt records during parsing. \
+              A record with less/more tokens than schema is not a corrupted record to CSV. \
+              When it meets a record having fewer tokens than the length of the schema, \
+              sets ``null`` to extra fields. When the record has more tokens than the \
+              length of the schema, it drops extra tokens.
+            * ``DROPMALFORMED``: ignores the whole corrupted records.
+            * ``FAILFAST``: throws an exception when it meets corrupted records.
+
+        columnNameOfCorruptRecord : str, optional
+            allows renaming the new field having malformed string
+            created by ``PERMISSIVE`` mode. This overrides
+            ``spark.sql.columnNameOfCorruptRecord``. If None is set,
+            it uses the value specified in
+            ``spark.sql.columnNameOfCorruptRecord``.
+        multiLine : str or bool, optional
+            parse one record, which may span multiple lines. If None is
+            set, it uses the default value, ``false``.
+        charToEscapeQuoteEscaping : str, optional
+            sets a single character used for escaping the escape for
+            the quote character. If None is set, the default value is
+            escape character when escape and quote characters are
+            different, ``\0`` otherwise.
+        emptyValue : str, optional
+            sets the string representation of an empty value. If None is set, it uses
+            the default value, empty string.
+        locale : str, optional
+            sets a locale as language tag in IETF BCP 47 format. If None is set,
+            it uses the default value, ``en-US``. For instance, ``locale`` is used while
+            parsing dates and timestamps.
+        lineSep : str, optional
+            defines the line separator that should be used for parsing. If None is
+            set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``.
+            Maximum length is 1 character.
+        pathGlobFilter : str or bool, optional
+            an optional glob pattern to only include files with paths matching
+            the pattern. The syntax follows `org.apache.hadoop.fs.GlobFilter`.
+            It does not change the behavior of
+            `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
+        recursiveFileLookup : str or bool, optional
+            recursively scan a directory for files. Using this option disables
+            `partition discovery <https://spark.apache.org/docs/latest/sql-data-sources-parquet.html#partition-discovery>`_.  # noqa
+
+        .. versionadded:: 2.0

Review comment:
       2.0.0?
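
Given the size of the option list above, a compact usage sketch of the streaming CSV reader may help (the input path is a placeholder; the DDL schema string reuses the example from the docstring):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # Streaming CSV source: schema as a DDL string plus a couple of common options.
    sdf = (spark.readStream
                .csv("/tmp/input-dir",                 # placeholder directory
                     schema="col0 INT, col1 DOUBLE",
                     sep=",",
                     header=True))
    print(sdf.isStreaming)   # True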

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -818,29 +971,40 @@ def outputMode(self, outputMode):
            written to the sink every time there are some updates. If the query doesn't contain
            aggregations, it will be equivalent to `append` mode.
 
-       .. note:: Evolving.
+        Notes
+        -----
+        This API is evolving.
 
+        Examples
+        --------
         >>> writer = sdf.writeStream.outputMode('append')
         """
         if not outputMode or type(outputMode) != str or len(outputMode.strip()) == 0:
             raise ValueError('The output mode must be a non-empty string. Got: %s' % outputMode)
         self._jwrite = self._jwrite.outputMode(outputMode)
         return self
 
-    @since(2.0)
     def format(self, source):
         """Specifies the underlying output data source.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/window.py
##########
@@ -34,19 +34,21 @@ class Window(object):
     """
     Utility functions for defining window in DataFrames.
 
-    For example:
+    .. versionadded:: 1.4

Review comment:
       1.4.0?
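
The class being documented is easiest to grasp from a one-liner; a usage sketch with throwaway data (running sum per key):

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["key", "value"])
    # Per-key running sum, ordered by value, from the partition start to the current row.
    w = (Window.partitionBy("key")
               .orderBy("value")
               .rowsBetween(Window.unboundedPreceding, Window.currentRow))
    df.withColumn("running_sum", F.sum("value").over(w)).show()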

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -1151,8 +1349,14 @@ def foreachBatch(self, func):
         to exactly same for the same batchId (assuming all operations are deterministic in the
         query).
 
-        .. note:: Evolving.
+        .. versionadded:: 2.4

Review comment:
       2.4.0?
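
A minimal foreachBatch sketch: the callback receives each micro-batch as a regular DataFrame plus its batch id (the ``rate`` source and ``show`` sink are stand-ins for real input and output):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.readStream.format("rate").load()

    def write_batch(batch_df, batch_id):
        # batch_df is a static DataFrame; any batch writer (JDBC, files, ...) works here.
        batch_df.show()

    query = sdf.writeStream.foreachBatch(write_batch).start()
    query.stop()   # stop immediately; a real job would awaitTermination()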

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -1177,12 +1380,17 @@ def start(self, path=None, format=None, outputMode=None, partitionBy=None, query
         If ``format`` is not specified, the default data source configured by
         ``spark.sql.sources.default`` will be used.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -881,39 +1048,56 @@ def options(self, **options):
                 ambiguous. If it isn't set, the current value of the SQL config
                 ``spark.sql.session.timeZone`` is used by default.
 
-       .. note:: Evolving.
+        .. versionadded:: 2.0
+
+        Notes
+        -----
+        This API is evolving.
         """
         for k in options:
             self._jwrite = self._jwrite.option(k, to_str(options[k]))
         return self
 
-    @since(2.0)
     def partitionBy(self, *cols):
         """Partitions the output by the given columns on the file system.
 
         If specified, the output is laid out on the file system similar
         to Hive's partitioning scheme.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0
 
-        :param cols: name of columns
+        Parameters
+        ----------
+        cols : str or list
+            name of columns
 
+        Notes
+        -----
+        This API is evolving.
         """
         if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
             cols = cols[0]
         self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
         return self
 
-    @since(2.0)
     def queryName(self, queryName):
         """Specifies the name of the :class:`StreamingQuery` that can be started with
         :func:`start`. This name must be unique among all the currently active queries
         in the associated SparkSession.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.0

Review comment:
       2.0.0?

##########
File path: python/pyspark/sql/streaming.py
##########
@@ -1045,8 +1238,14 @@ def foreach(self, f):
                 returns successfully (irrespective of the return value), except if the Python
                 crashes in the middle.
 
-        .. note:: Evolving.
+        .. versionadded:: 2.4

Review comment:
       2.4.0?
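
For contrast with foreachBatch above, the simplest foreach form takes a plain row-level function; a sketch (again with the ``rate`` source as a stand-in):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.readStream.format("rate").load()

    def process_row(row):
        # Called once per row on the executors; replace the print with real sink logic.
        print(row)

    query = sdf.writeStream.foreach(process_row).start()
    query.stop()   # stop immediately; a real job would awaitTermination()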




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


