You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2023/09/25 06:22:44 UTC
[spark] branch master updated: [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fb2bee37c96 [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0
fb2bee37c96 is described below
commit fb2bee37c964bf2164fc89a0a55085dd0c840b56
Author: zhyhimont <zh...@gmail.com>
AuthorDate: Mon Sep 25 15:22:32 2023 +0900
[SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0
### What changes were proposed in this pull request?
Support `isocalendar` from the pandas 2.0.0
### Why are the changes needed?
When pandas 2.0.0 is released, we should match the behavior in pandas API on Spark.
### Does this PR introduce _any_ user-facing change?
Added the new method `DatetimeIndex.isocalendar` and removed the two deprecated properties `DatetimeIndex.week` and `DatetimeIndex.weekofyear`.
```
dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series())
dfs.dt.isocalendar()
year week day
2019-12-29 2019 52 7
2019-12-30 2020 1 1
2019-12-31 2020 1 2
2020-01-01 2020 1 3
dfs.dt.isocalendar().week
2019-12-29 52
2019-12-30 1
2019-12-31 1
2020-01-01 1
```
### How was this patch tested?
UT was updated
Closes #40420 from dzhigimont/SPARK-42617_ZH.
Lead-authored-by: zhyhimont <zh...@gmail.com>
Co-authored-by: Zhyhimont Dmitry <zh...@profitero.com>
Co-authored-by: Dmitry Zhyhimont <dz...@mail.ru>
Co-authored-by: Zhyhimont Dmitry <dz...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../source/reference/pyspark.pandas/indexing.rst | 3 +-
.../source/reference/pyspark.pandas/series.rst | 3 +-
python/pyspark/pandas/datetimes.py | 70 ++++++++++++++++------
python/pyspark/pandas/indexes/base.py | 4 +-
python/pyspark/pandas/indexes/datetimes.py | 49 +++++++++------
python/pyspark/pandas/namespace.py | 3 +-
.../pyspark/pandas/tests/indexes/test_datetime.py | 28 ++-------
.../pandas/tests/indexes/test_datetime_property.py | 19 +-----
.../pyspark/pandas/tests/test_series_datetime.py | 17 +-----
9 files changed, 100 insertions(+), 96 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 70d463c052a..d6be57ee9c8 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -338,8 +338,7 @@ Time/date components
DatetimeIndex.minute
DatetimeIndex.second
DatetimeIndex.microsecond
- DatetimeIndex.week
- DatetimeIndex.weekofyear
+ DatetimeIndex.isocalendar
DatetimeIndex.dayofweek
DatetimeIndex.day_of_week
DatetimeIndex.weekday
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 552acec096f..7b658d45d4b 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -313,8 +313,7 @@ Datetime Properties
Series.dt.minute
Series.dt.second
Series.dt.microsecond
- Series.dt.week
- Series.dt.weekofyear
+ Series.dt.isocalendar
Series.dt.dayofweek
Series.dt.weekday
Series.dt.dayofyear
diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py
index b0649cf5761..4b6e23fae7a 100644
--- a/python/pyspark/pandas/datetimes.py
+++ b/python/pyspark/pandas/datetimes.py
@@ -18,7 +18,6 @@
"""
Date/Time related functions on pandas-on-Spark Series
"""
-import warnings
from typing import Any, Optional, Union, no_type_check
import numpy as np
@@ -27,7 +26,9 @@ from pandas.tseries.offsets import DateOffset
import pyspark.pandas as ps
import pyspark.sql.functions as F
-from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType, IntegerType
+from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, IntegerType
+from pyspark.pandas import DataFrame
+from pyspark.pandas.config import option_context
class DatetimeMethods:
@@ -116,26 +117,59 @@ class DatetimeMethods:
def nanosecond(self) -> "ps.Series":
raise NotImplementedError()
- # TODO(SPARK-42617): Support isocalendar.week and replace it.
- # See also https://github.com/pandas-dev/pandas/pull/33595.
- @property
- def week(self) -> "ps.Series":
+ def isocalendar(self) -> "ps.DataFrame":
"""
- The week ordinal of the year.
+ Calculate year, week, and day according to the ISO 8601 standard.
- .. deprecated:: 3.4.0
- """
- warnings.warn(
- "weekofyear and week have been deprecated.",
- FutureWarning,
- )
- return self._data.spark.transform(lambda c: F.weekofyear(c).cast(LongType()))
+ .. versionadded:: 4.0.0
- @property
- def weekofyear(self) -> "ps.Series":
- return self.week
+ Returns
+ -------
+ DataFrame
+ With columns year, week and day.
- weekofyear.__doc__ = week.__doc__
+ .. note:: The returned columns have int64 dtype instead of UInt32 as in pandas, because
+ UInt32 is not supported by Spark.
+
+ Examples
+ --------
+ >>> dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series())
+ >>> dfs.dt.isocalendar()
+ year week day
+ 2019-12-29 2019 52 7
+ 2019-12-30 2020 1 1
+ 2019-12-31 2020 1 2
+ 2020-01-01 2020 1 3
+
+ >>> dfs.dt.isocalendar().week
+ 2019-12-29 52
+ 2019-12-30 1
+ 2019-12-31 1
+ 2020-01-01 1
+ Name: week, dtype: int64
+ """
+
+ return_types = [self._data.index.dtype, int, int, int]
+
+ def pandas_isocalendar( # type: ignore[no-untyped-def]
+ pdf,
+ ) -> ps.DataFrame[return_types]: # type: ignore[valid-type]
+ # cast to int64 due to UInt32 is not supported by spark
+ return pdf[pdf.columns[0]].dt.isocalendar().astype(np.int64).reset_index()
+
+ with option_context("compute.default_index_type", "distributed"):
+ psdf = self._data.to_frame().pandas_on_spark.apply_batch(pandas_isocalendar)
+
+ return DataFrame(
+ psdf._internal.copy(
+ spark_frame=psdf._internal.spark_frame,
+ index_spark_columns=psdf._internal.data_spark_columns[:1],
+ index_fields=psdf._internal.data_fields[:1],
+ data_spark_columns=psdf._internal.data_spark_columns[1:],
+ data_fields=psdf._internal.data_fields[1:],
+ column_labels=[("year",), ("week",), ("day",)],
+ )
+ )
@property
def dayofweek(self) -> "ps.Series":
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 48ce22b6e51..c020e918d37 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -2007,7 +2007,7 @@ class Index(IndexOpsMixin):
if isinstance(self, MultiIndex) and level is not None:
self_names = self.names
- self_names[level] = names # type: ignore[index]
+ self_names[level] = names
names = self_names
return self.rename(name=names, inplace=inplace)
@@ -2077,7 +2077,7 @@ class Index(IndexOpsMixin):
[isinstance(item, tuple) for item in other]
)
if is_other_list_of_tuples:
- other = MultiIndex.from_tuples(other) # type: ignore[arg-type]
+ other = MultiIndex.from_tuples(other)
else:
raise TypeError("other must be a MultiIndex or a list of tuples")
diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 2c208974167..5a2a347d1ba 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -25,6 +25,7 @@ from pandas.tseries.offsets import DateOffset
from pyspark._globals import _NoValue
from pyspark import pandas as ps
+from pyspark.pandas import DataFrame
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.missing.indexes import MissingPandasLikeDatetimeIndex
from pyspark.pandas.series import Series, first_series
@@ -232,28 +233,40 @@ class DatetimeIndex(Index):
)
return Index(self.to_series().dt.microsecond)
- @property
- def week(self) -> Index:
+ def isocalendar(self) -> DataFrame:
"""
- The week ordinal of the year.
+ Calculate year, week, and day according to the ISO 8601 standard.
- .. deprecated:: 3.5.0
- """
- warnings.warn(
- "`week` is deprecated in 3.5.0 and will be removed in 4.0.0.",
- FutureWarning,
- )
- return Index(self.to_series().dt.week)
+ .. versionadded:: 4.0.0
- @property
- def weekofyear(self) -> Index:
- warnings.warn(
- "`weekofyear` is deprecated in 3.5.0 and will be removed in 4.0.0.",
- FutureWarning,
- )
- return Index(self.to_series().dt.weekofyear)
+ Returns
+ -------
+ DataFrame
+ With columns year, week and day.
- weekofyear.__doc__ = week.__doc__
+ .. note:: The returned columns have int64 dtype instead of UInt32 as in pandas, because
+ UInt32 is not supported by Spark.
+
+ Examples
+ --------
+ >>> psidxs = ps.from_pandas(
+ ... pd.DatetimeIndex(["2019-12-29", "2019-12-30", "2019-12-31", "2020-01-01"])
+ ... )
+ >>> psidxs.isocalendar()
+ year week day
+ 2019-12-29 2019 52 7
+ 2019-12-30 2020 1 1
+ 2019-12-31 2020 1 2
+ 2020-01-01 2020 1 3
+
+ >>> psidxs.isocalendar().week
+ 2019-12-29 52
+ 2019-12-30 1
+ 2019-12-31 1
+ 2020-01-01 1
+ Name: week, dtype: int64
+ """
+ return self.to_series().dt.isocalendar()
@property
def dayofweek(self) -> Index:
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index f7c07b37c16..a700a243e5d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -158,7 +158,8 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series,
raise TypeError("Unknown data type: {}".format(type(pobj).__name__))
-_range = range # built-in range
+# built-in range
+_range: Type[range] = range # type: ignore[assignment]
def range(
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py
index e93ab76186a..4eaefb514d9 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -19,6 +19,7 @@ import datetime
from distutils.version import LooseVersion
+import numpy as np
import pandas as pd
import pyspark.pandas as ps
@@ -98,28 +99,6 @@ class DatetimeIndexTestsMixin:
self.assert_eq(psidx.day_of_year, pidx.day_of_year)
self.assert_eq(psidx.day_of_week, pidx.day_of_week)
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
- # TODO(SPARK-42617): Support isocalendar.week and replace it.
- expected_results = [
- ps.Index([1]),
- ps.Index([1, 1, 13]),
- ps.Index([52, 52, 1]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 1, 2]),
- ps.Index([13, 26, 39]),
- ]
- for psidx, expected_result in zip(self.psidxs, expected_results):
- self.assert_eq(psidx.week, expected_result)
- self.assert_eq(psidx.weekofyear, expected_result)
- else:
- for psidx, pidx in self.idx_pairs:
- self.assert_eq(psidx.week, pidx.week)
- self.assert_eq(psidx.weekofyear, pidx.weekofyear)
-
def test_ceil(self):
for psidx, pidx in self.idx_pairs:
for freq in self.fixed_freqs:
@@ -267,6 +246,11 @@ class DatetimeIndexTestsMixin:
mapper_pser = pd.Series([1, 2, 3], index=pidx)
self.assert_eq(psidx.map(mapper_pser), pidx.map(mapper_pser))
+ def test_isocalendar(self):
+ for psidx, pidx in self.idx_pairs:
+ self.assert_eq(psidx.isocalendar().astype(int), pidx.isocalendar().astype(int))
+ self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64))
+
class DatetimeIndexTests(DatetimeIndexTestsMixin, PandasOnSparkTestCase, TestUtils):
pass
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime_property.py b/python/pyspark/pandas/tests/indexes/test_datetime_property.py
index 523b8bdda4a..0ab17664b9f 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime_property.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime_property.py
@@ -18,6 +18,7 @@
import datetime
import unittest
+import numpy as np
import pandas as pd
import pyspark.pandas as ps
@@ -83,23 +84,7 @@ class DatetimeIndexPropertyTestsMixin:
self.assert_eq(psidx.is_leap_year, pd.Index(pidx.is_leap_year))
self.assert_eq(psidx.day_of_year, pidx.day_of_year)
self.assert_eq(psidx.day_of_week, pidx.day_of_week)
-
- # TODO(SPARK-42617): Support isocalendar.week and replace it.
- expected_results = [
- ps.Index([1]),
- ps.Index([1, 1, 13]),
- ps.Index([52, 52, 1]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 52, 52]),
- ps.Index([52, 1, 2]),
- ps.Index([13, 26, 39]),
- ]
- for psidx, expected_result in zip(self.psidxs, expected_results):
- self.assert_eq(psidx.week, expected_result)
- self.assert_eq(psidx.weekofyear, expected_result)
+ self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64))
class DatetimeIndexPropertyTests(DatetimeIndexPropertyTestsMixin, PandasOnSparkTestCase, TestUtils):
diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py
index 7e05364ca5f..c7ffc0675c6 100644
--- a/python/pyspark/pandas/tests/test_series_datetime.py
+++ b/python/pyspark/pandas/tests/test_series_datetime.py
@@ -197,23 +197,12 @@ class SeriesDateTimeTestsMixin:
with self.assertRaises(NotImplementedError):
self.check_func(lambda x: x.dt.nanosecond)
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-42617): Support `isocalendar`",
- )
- def test_week(self):
- self.check_func(lambda x: x.dt.week)
-
- @unittest.skipIf(
- LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
- "TODO(SPARK-42617): Support `isocalendar`",
- )
- def test_weekofyear(self):
- self.check_func(lambda x: x.dt.weekofyear)
-
def test_dayofweek(self):
self.check_func(lambda x: x.dt.dayofweek)
+ def test_isocalendar(self):
+ self.check_func(lambda x: x.dt.isocalendar().astype(np.int64))
+
def test_weekday(self):
self.check_func(lambda x: x.dt.weekday)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org