Posted to commits@spark.apache.org by gu...@apache.org on 2023/09/25 06:22:44 UTC

[spark] branch master updated: [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new fb2bee37c96 [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0
fb2bee37c96 is described below

commit fb2bee37c964bf2164fc89a0a55085dd0c840b56
Author: zhyhimont <zh...@gmail.com>
AuthorDate: Mon Sep 25 15:22:32 2023 +0900

    [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0
    
    ### What changes were proposed in this pull request?
    
    Support `isocalendar` from pandas 2.0.0
    
    ### Why are the changes needed?
    
    Now that pandas 2.0.0 has been released, the pandas API on Spark should match its behavior.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Added the new method `DatetimeIndex.isocalendar` and removed the two deprecated properties `DatetimeIndex.week` and `DatetimeIndex.weekofyear`
    ```
    dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series())
    dfs.dt.isocalendar()
                        year  week  day
            2019-12-29  2019    52    7
            2019-12-30  2020     1    1
            2019-12-31  2020     1    2
            2020-01-01  2020     1    3
    dfs.dt.isocalendar().week
            2019-12-29    52
            2019-12-30     1
            2019-12-31     1
            2020-01-01     1
    ```
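    
    For comparison, a minimal sketch of the same call in plain pandas 2.x, which
    illustrates the dtype difference noted in the docstrings below: pandas returns
    UInt32 columns, while pandas API on Spark returns int64 because Spark has no
    unsigned 32-bit integer type.
    ```
    import pandas as pd

    ser = pd.date_range(start='2019-12-29', freq='D', periods=4).to_series()
    iso = ser.dt.isocalendar()
    print(iso.dtypes)  # year, week, day are all UInt32 in pandas 2.x
    print(iso.week)    # same ISO weeks as in the pandas-on-Spark output above
    ```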
    
    ### How was this patch tested?
    
    Unit tests were updated.
    
    Closes #40420 from dzhigimont/SPARK-42617_ZH.
    
    Lead-authored-by: zhyhimont <zh...@gmail.com>
    Co-authored-by: Zhyhimont Dmitry <zh...@profitero.com>
    Co-authored-by: Dmitry Zhyhimont <dz...@mail.ru>
    Co-authored-by: Zhyhimont Dmitry <dz...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../source/reference/pyspark.pandas/indexing.rst   |  3 +-
 .../source/reference/pyspark.pandas/series.rst     |  3 +-
 python/pyspark/pandas/datetimes.py                 | 70 ++++++++++++++++------
 python/pyspark/pandas/indexes/base.py              |  4 +-
 python/pyspark/pandas/indexes/datetimes.py         | 49 +++++++++------
 python/pyspark/pandas/namespace.py                 |  3 +-
 .../pyspark/pandas/tests/indexes/test_datetime.py  | 28 ++-------
 .../pandas/tests/indexes/test_datetime_property.py | 19 +-----
 .../pyspark/pandas/tests/test_series_datetime.py   | 17 +-----
 9 files changed, 100 insertions(+), 96 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 70d463c052a..d6be57ee9c8 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -338,8 +338,7 @@ Time/date components
    DatetimeIndex.minute
    DatetimeIndex.second
    DatetimeIndex.microsecond
-   DatetimeIndex.week
-   DatetimeIndex.weekofyear
+   DatetimeIndex.isocalendar
    DatetimeIndex.dayofweek
    DatetimeIndex.day_of_week
    DatetimeIndex.weekday
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 552acec096f..7b658d45d4b 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -313,8 +313,7 @@ Datetime Properties
    Series.dt.minute
    Series.dt.second
    Series.dt.microsecond
-   Series.dt.week
-   Series.dt.weekofyear
+   Series.dt.isocalendar
    Series.dt.dayofweek
    Series.dt.weekday
    Series.dt.dayofyear
diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py
index b0649cf5761..4b6e23fae7a 100644
--- a/python/pyspark/pandas/datetimes.py
+++ b/python/pyspark/pandas/datetimes.py
@@ -18,7 +18,6 @@
 """
 Date/Time related functions on pandas-on-Spark Series
 """
-import warnings
 from typing import Any, Optional, Union, no_type_check
 
 import numpy as np
@@ -27,7 +26,9 @@ from pandas.tseries.offsets import DateOffset
 
 import pyspark.pandas as ps
 import pyspark.sql.functions as F
-from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType, IntegerType
+from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, IntegerType
+from pyspark.pandas import DataFrame
+from pyspark.pandas.config import option_context
 
 
 class DatetimeMethods:
@@ -116,26 +117,59 @@ class DatetimeMethods:
     def nanosecond(self) -> "ps.Series":
         raise NotImplementedError()
 
-    # TODO(SPARK-42617): Support isocalendar.week and replace it.
-    # See also https://github.com/pandas-dev/pandas/pull/33595.
-    @property
-    def week(self) -> "ps.Series":
+    def isocalendar(self) -> "ps.DataFrame":
         """
-        The week ordinal of the year.
+        Calculate year, week, and day according to the ISO 8601 standard.
 
-        .. deprecated:: 3.4.0
-        """
-        warnings.warn(
-            "weekofyear and week have been deprecated.",
-            FutureWarning,
-        )
-        return self._data.spark.transform(lambda c: F.weekofyear(c).cast(LongType()))
+            .. versionadded:: 4.0.0
 
-    @property
-    def weekofyear(self) -> "ps.Series":
-        return self.week
+        Returns
+        -------
+        DataFrame
+            With columns year, week and day.
 
-    weekofyear.__doc__ = week.__doc__
+        .. note:: The returned columns are of type int64 rather than pandas' UInt32,
+            because UInt32 is not supported by Spark.
+
+        Examples
+        --------
+        >>> dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series())
+        >>> dfs.dt.isocalendar()
+                    year  week  day
+        2019-12-29  2019    52    7
+        2019-12-30  2020     1    1
+        2019-12-31  2020     1    2
+        2020-01-01  2020     1    3
+
+        >>> dfs.dt.isocalendar().week
+        2019-12-29    52
+        2019-12-30     1
+        2019-12-31     1
+        2020-01-01     1
+        Name: week, dtype: int64
+        """
+
+        return_types = [self._data.index.dtype, int, int, int]
+
+        def pandas_isocalendar(  # type: ignore[no-untyped-def]
+            pdf,
+        ) -> ps.DataFrame[return_types]:  # type: ignore[valid-type]
+            # cast to int64 because UInt32 is not supported by Spark
+            return pdf[pdf.columns[0]].dt.isocalendar().astype(np.int64).reset_index()
+
+        with option_context("compute.default_index_type", "distributed"):
+            psdf = self._data.to_frame().pandas_on_spark.apply_batch(pandas_isocalendar)
+
+        return DataFrame(
+            psdf._internal.copy(
+                spark_frame=psdf._internal.spark_frame,
+                index_spark_columns=psdf._internal.data_spark_columns[:1],
+                index_fields=psdf._internal.data_fields[:1],
+                data_spark_columns=psdf._internal.data_spark_columns[1:],
+                data_fields=psdf._internal.data_fields[1:],
+                column_labels=[("year",), ("week",), ("day",)],
+            )
+        )
 
     @property
     def dayofweek(self) -> "ps.Series":
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 48ce22b6e51..c020e918d37 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -2007,7 +2007,7 @@ class Index(IndexOpsMixin):
 
         if isinstance(self, MultiIndex) and level is not None:
             self_names = self.names
-            self_names[level] = names  # type: ignore[index]
+            self_names[level] = names
             names = self_names
         return self.rename(name=names, inplace=inplace)
 
@@ -2077,7 +2077,7 @@ class Index(IndexOpsMixin):
                 [isinstance(item, tuple) for item in other]
             )
             if is_other_list_of_tuples:
-                other = MultiIndex.from_tuples(other)  # type: ignore[arg-type]
+                other = MultiIndex.from_tuples(other)
             else:
                 raise TypeError("other must be a MultiIndex or a list of tuples")
 
diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 2c208974167..5a2a347d1ba 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -25,6 +25,7 @@ from pandas.tseries.offsets import DateOffset
 from pyspark._globals import _NoValue
 
 from pyspark import pandas as ps
+from pyspark.pandas import DataFrame
 from pyspark.pandas.indexes.base import Index
 from pyspark.pandas.missing.indexes import MissingPandasLikeDatetimeIndex
 from pyspark.pandas.series import Series, first_series
@@ -232,28 +233,40 @@ class DatetimeIndex(Index):
         )
         return Index(self.to_series().dt.microsecond)
 
-    @property
-    def week(self) -> Index:
+    def isocalendar(self) -> DataFrame:
         """
-        The week ordinal of the year.
+        Calculate year, week, and day according to the ISO 8601 standard.
 
-        .. deprecated:: 3.5.0
-        """
-        warnings.warn(
-            "`week` is deprecated in 3.5.0 and will be removed in 4.0.0.",
-            FutureWarning,
-        )
-        return Index(self.to_series().dt.week)
+            .. versionadded:: 4.0.0
 
-    @property
-    def weekofyear(self) -> Index:
-        warnings.warn(
-            "`weekofyear` is deprecated in 3.5.0 and will be removed in 4.0.0.",
-            FutureWarning,
-        )
-        return Index(self.to_series().dt.weekofyear)
+        Returns
+        -------
+        DataFrame
+            With columns year, week and day.
 
-    weekofyear.__doc__ = week.__doc__
+        .. note:: The returned columns are of type int64 rather than pandas' UInt32,
+            because UInt32 is not supported by Spark.
+
+        Examples
+        --------
+        >>> psidxs = ps.from_pandas(
+        ...     pd.DatetimeIndex(["2019-12-29", "2019-12-30", "2019-12-31", "2020-01-01"])
+        ... )
+        >>> psidxs.isocalendar()
+                    year  week  day
+        2019-12-29  2019    52    7
+        2019-12-30  2020     1    1
+        2019-12-31  2020     1    2
+        2020-01-01  2020     1    3
+
+        >>> psidxs.isocalendar().week
+        2019-12-29    52
+        2019-12-30     1
+        2019-12-31     1
+        2020-01-01     1
+        Name: week, dtype: int64
+        """
+        return self.to_series().dt.isocalendar()
 
     @property
     def dayofweek(self) -> Index:
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index f7c07b37c16..a700a243e5d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -158,7 +158,8 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series,
         raise TypeError("Unknown data type: {}".format(type(pobj).__name__))
 
 
-_range = range  # built-in range
+# built-in range
+_range: Type[range] = range  # type: ignore[assignment]
 
 
 def range(
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py
index e93ab76186a..4eaefb514d9 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -19,6 +19,7 @@ import datetime
 
 from distutils.version import LooseVersion
 
+import numpy as np
 import pandas as pd
 
 import pyspark.pandas as ps
@@ -98,28 +99,6 @@ class DatetimeIndexTestsMixin:
             self.assert_eq(psidx.day_of_year, pidx.day_of_year)
             self.assert_eq(psidx.day_of_week, pidx.day_of_week)
 
-        if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-            # TODO(SPARK-42617): Support isocalendar.week and replace it.
-            expected_results = [
-                ps.Index([1]),
-                ps.Index([1, 1, 13]),
-                ps.Index([52, 52, 1]),
-                ps.Index([52, 52, 52]),
-                ps.Index([52, 52, 52]),
-                ps.Index([52, 52, 52]),
-                ps.Index([52, 52, 52]),
-                ps.Index([52, 52, 52]),
-                ps.Index([52, 1, 2]),
-                ps.Index([13, 26, 39]),
-            ]
-            for psidx, expected_result in zip(self.psidxs, expected_results):
-                self.assert_eq(psidx.week, expected_result)
-                self.assert_eq(psidx.weekofyear, expected_result)
-        else:
-            for psidx, pidx in self.idx_pairs:
-                self.assert_eq(psidx.week, pidx.week)
-                self.assert_eq(psidx.weekofyear, pidx.weekofyear)
-
     def test_ceil(self):
         for psidx, pidx in self.idx_pairs:
             for freq in self.fixed_freqs:
@@ -267,6 +246,11 @@ class DatetimeIndexTestsMixin:
         mapper_pser = pd.Series([1, 2, 3], index=pidx)
         self.assert_eq(psidx.map(mapper_pser), pidx.map(mapper_pser))
 
+    def test_isocalendar(self):
+        for psidx, pidx in self.idx_pairs:
+            self.assert_eq(psidx.isocalendar().astype(int), pidx.isocalendar().astype(int))
+            self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64))
+
 
 class DatetimeIndexTests(DatetimeIndexTestsMixin, PandasOnSparkTestCase, TestUtils):
     pass
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime_property.py b/python/pyspark/pandas/tests/indexes/test_datetime_property.py
index 523b8bdda4a..0ab17664b9f 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime_property.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime_property.py
@@ -18,6 +18,7 @@
 import datetime
 import unittest
 
+import numpy as np
 import pandas as pd
 
 import pyspark.pandas as ps
@@ -83,23 +84,7 @@ class DatetimeIndexPropertyTestsMixin:
             self.assert_eq(psidx.is_leap_year, pd.Index(pidx.is_leap_year))
             self.assert_eq(psidx.day_of_year, pidx.day_of_year)
             self.assert_eq(psidx.day_of_week, pidx.day_of_week)
-
-        # TODO(SPARK-42617): Support isocalendar.week and replace it.
-        expected_results = [
-            ps.Index([1]),
-            ps.Index([1, 1, 13]),
-            ps.Index([52, 52, 1]),
-            ps.Index([52, 52, 52]),
-            ps.Index([52, 52, 52]),
-            ps.Index([52, 52, 52]),
-            ps.Index([52, 52, 52]),
-            ps.Index([52, 52, 52]),
-            ps.Index([52, 1, 2]),
-            ps.Index([13, 26, 39]),
-        ]
-        for psidx, expected_result in zip(self.psidxs, expected_results):
-            self.assert_eq(psidx.week, expected_result)
-            self.assert_eq(psidx.weekofyear, expected_result)
+            self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64))
 
 
 class DatetimeIndexPropertyTests(DatetimeIndexPropertyTestsMixin, PandasOnSparkTestCase, TestUtils):
diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py
index 7e05364ca5f..c7ffc0675c6 100644
--- a/python/pyspark/pandas/tests/test_series_datetime.py
+++ b/python/pyspark/pandas/tests/test_series_datetime.py
@@ -197,23 +197,12 @@ class SeriesDateTimeTestsMixin:
         with self.assertRaises(NotImplementedError):
             self.check_func(lambda x: x.dt.nanosecond)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-42617): Support `isocalendar`",
-    )
-    def test_week(self):
-        self.check_func(lambda x: x.dt.week)
-
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-42617): Support `isocalendar`",
-    )
-    def test_weekofyear(self):
-        self.check_func(lambda x: x.dt.weekofyear)
-
     def test_dayofweek(self):
         self.check_func(lambda x: x.dt.dayofweek)
 
+    def test_isocalendar(self):
+        self.check_func(lambda x: x.dt.isocalendar().astype(np.int64))
+
     def test_weekday(self):
         self.check_func(lambda x: x.dt.weekday)
 


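For users picking up this change, here is a minimal migration sketch (assuming a
Spark build that includes this commit; the new API is tagged `versionadded` 4.0.0):

```
import pandas as pd
import pyspark.pandas as ps

psser = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series())

# Removed by this commit (deprecated since Spark 3.4/3.5):
# psser.dt.week
# psser.dt.weekofyear

# Replacement: select the ISO week from the isocalendar() DataFrame.
weeks = psser.dt.isocalendar().week  # int64, not pandas' UInt32
print(weeks.to_pandas())
```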
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org