Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2022/09/21 05:50:05 UTC

[GitHub] [spark] itholic commented on a diff in pull request #37948: [SPARK-40327][PS][DOCS] Add resampling to API references

itholic commented on code in PR #37948:
URL: https://github.com/apache/spark/pull/37948#discussion_r976058915


##########
python/pyspark/pandas/resample.py:
##########
@@ -412,21 +412,267 @@ def _handle_output(self, psdf: DataFrame) -> FrameLike:
         pass
 
     def min(self) -> FrameLike:
+        """
+        Compute max of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").min().sort_index()
+                           A         B
+        2022-05-01  0.171162  0.338864
+        2022-05-04  0.010527  0.561204
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("min"))
 
     def max(self) -> FrameLike:
+        """
+        Compute max of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").max().sort_index()
+                           A         B
+        2022-05-01  0.420538  0.859182
+        2022-05-04  0.270533  0.691041
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("max"))
 
     def sum(self) -> FrameLike:
+        """
+        Compute sum of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").sum().sort_index()
+                           A         B
+        2022-05-01  0.800160  1.679727
+        2022-05-04  0.281060  1.252245
+        2022-05-07  0.000000  0.000000
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("sum").fillna(0.0))
 
     def mean(self) -> FrameLike:
+        """
+        Compute mean of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").mean().sort_index()
+                           A         B
+        2022-05-01  0.266720  0.559909
+        2022-05-04  0.140530  0.626123
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("mean"))
 
     def std(self) -> FrameLike:
+        """
+        Compute mean of resampled values.

Review Comment:
   mean -> std ?
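
  A side note on this hunk while here: sum() alone chains .fillna(0.0) onto the downsample, which is why the empty 2022-05-07 bin shows 0.000000 in the sum() doctest but NaN in the min/max/mean ones. A minimal sketch with hypothetical data (assuming Spark 3.4+, where these Resampler methods exist) showing the difference:

  ```python
  from datetime import datetime

  import pandas as pd
  import pyspark.pandas as ps

  # Two points ten days apart leave empty 3-day bins in between.
  dates = [datetime(2022, 5, 1), datetime(2022, 5, 11)]
  psdf = ps.DataFrame({"A": [1.0, 2.0]}, index=pd.DatetimeIndex(dates))

  print(psdf.resample("3D").mean().sort_index())  # empty bins -> NaN
  print(psdf.resample("3D").sum().sort_index())   # empty bins -> 0.0
  ```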



##########
python/pyspark/pandas/resample.py:
##########
@@ -412,21 +412,267 @@ def _handle_output(self, psdf: DataFrame) -> FrameLike:
         pass
 
     def min(self) -> FrameLike:
+        """
+        Compute max of resampled values.

Review Comment:
   max -> min ?
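
  Since the same copy-paste slip shows up in several of these summary lines (min says "max"; std and var say "mean"), a hypothetical lint sketch that would catch all of them at once, assuming the class is Resampler in python/pyspark/pandas/resample.py:

  ```python
  from pyspark.pandas.resample import Resampler

  # Each aggregation's one-line docstring summary should name the
  # aggregation itself, e.g. "Compute min of resampled values."
  for name in ("min", "max", "sum", "mean", "std", "var"):
      doc = (getattr(Resampler, name).__doc__ or "").strip()
      summary = doc.splitlines()[0] if doc else ""
      if summary != f"Compute {name} of resampled values.":
          print(f"{name}(): summary reads {summary!r}")
  ```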



##########
python/pyspark/pandas/resample.py:
##########
@@ -412,21 +412,267 @@ def _handle_output(self, psdf: DataFrame) -> FrameLike:
         pass
 
     def min(self) -> FrameLike:
+        """
+        Compute max of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").min().sort_index()
+                           A         B
+        2022-05-01  0.171162  0.338864
+        2022-05-04  0.010527  0.561204
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("min"))
 
     def max(self) -> FrameLike:
+        """
+        Compute max of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").max().sort_index()
+                           A         B
+        2022-05-01  0.420538  0.859182
+        2022-05-04  0.270533  0.691041
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("max"))
 
     def sum(self) -> FrameLike:
+        """
+        Compute sum of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").sum().sort_index()
+                           A         B
+        2022-05-01  0.800160  1.679727
+        2022-05-04  0.281060  1.252245
+        2022-05-07  0.000000  0.000000
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("sum").fillna(0.0))
 
     def mean(self) -> FrameLike:
+        """
+        Compute mean of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").mean().sort_index()
+                           A         B
+        2022-05-01  0.266720  0.559909
+        2022-05-04  0.140530  0.626123
+        2022-05-07       NaN       NaN
+        2022-05-10  0.813726  0.745100
+        """
         return self._handle_output(self._downsample("mean"))
 
     def std(self) -> FrameLike:
+        """
+        Compute mean of resampled values.
+
+        .. versionadded:: 3.4.0
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+
+        Examples
+        --------
+        >>> np.random.seed(22)
+        >>> dates = [
+        ...    datetime(2022, 5, 1, 4, 5, 6),
+        ...    datetime(2022, 5, 3),
+        ...    datetime(2022, 5, 3, 23, 59, 59),
+        ...    datetime(2022, 5, 4),
+        ...    pd.NaT,
+        ...    datetime(2022, 5, 4, 0, 0, 1),
+        ...    datetime(2022, 5, 11),
+        ... ]
+        >>> df = ps.DataFrame(
+        ...    np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=["A", "B"]
+        ... )
+        >>> df
+                                    A         B
+        2022-05-01 04:05:06  0.208461  0.481681
+        2022-05-03 00:00:00  0.420538  0.859182
+        2022-05-03 23:59:59  0.171162  0.338864
+        2022-05-04 00:00:00  0.270533  0.691041
+        NaT                  0.220405  0.811951
+        2022-05-04 00:00:01  0.010527  0.561204
+        2022-05-11 00:00:00  0.813726  0.745100
+        >>> df.resample("3D").std().sort_index()
+                           A         B
+        2022-05-01  0.134509  0.268835
+        2022-05-04  0.183852  0.091809
+        2022-05-07       NaN       NaN
+        2022-05-10       NaN       NaN
+        """
         return self._handle_output(self._downsample("std"))
 
     def var(self) -> FrameLike:
+        """
+        Compute mean of resampled values.

Review Comment:
   mean -> var ?
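
  Beyond the summary wording, std() and var() can be cross-checked quickly: per bin, var should be std squared (both defaulting to ddof=1, as in pandas), so only the summary lines look copy-pasted, not the wiring. A minimal sketch with hypothetical data, assuming Spark 3.4+:

  ```python
  import numpy as np
  import pandas as pd
  import pyspark.pandas as ps

  idx = pd.date_range("2022-05-01", periods=6, freq="12H")
  psdf = ps.DataFrame({"A": np.arange(6, dtype=float)}, index=idx)

  std_pdf = psdf.resample("3D").std().sort_index().to_pandas()
  var_pdf = psdf.resample("3D").var().sort_index().to_pandas()
  # The difference should be ~0.0 in every bin.
  print((std_pdf ** 2 - var_pdf).abs().max())
  ```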



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

