You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2021/08/17 17:30:29 UTC
[spark] branch master updated: [SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new c0441bb  [SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string
c0441bb is described below

commit c0441bb7e83e83e3240bf7e2991de34b01a182f5
Author: itholic <ha...@databricks.com>
AuthorDate: Tue Aug 17 10:29:16 2021 -0700

    [SPARK-36387][PYTHON] Fix Series.astype from datetime to nullable string
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to fix `Series.astype` when converting datetime type to StringDtype, to match the behavior of pandas 1.3.
    
    In pandas < 1.3,
    ```python
    >>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
    0    2020-10-27 00:00:01
    1                    NaT
    Name: datetime, dtype: string
    ```
    
    This is changed to
    
    ```python
    >>> pd.Series(["2020-10-27 00:00:01", None], name="datetime").astype("string")
    0    2020-10-27 00:00:01
    1                   <NA>
    Name: datetime, dtype: string
    ```
    
    in pandas >= 1.3, so we follow the behavior of the latest pandas.
    
    ### Why are the changes needed?
    
    Because pandas-on-Spark always follows the behavior of the latest pandas.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. When converting datetime to nullable string (StringDtype), the behavior now follows the latest pandas: null values are shown as `<NA>` instead of `NaT`.
    
    ### How was this patch tested?
    
    The unit test in `python/pyspark/pandas/tests/test_series.py` was updated, and the unit tests passed.
    
    Closes #33735 from itholic/SPARK-36387.
    
    Authored-by: itholic <ha...@databricks.com>
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
 python/pyspark/pandas/data_type_ops/base.py         |  2 +-
 python/pyspark/pandas/data_type_ops/datetime_ops.py | 19 ++++---------------
 python/pyspark/pandas/tests/test_series.py          |  8 +++++---
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py
index c69715f..b4c8c3e 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -155,7 +155,7 @@ def _as_string_type(
     index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
 ) -> IndexOpsLike:
     """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
-    representing null Spark column.
+    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
     """
     spark_type = StringType()
     if isinstance(dtype, extension_dtypes):
diff --git a/python/pyspark/pandas/data_type_ops/datetime_ops.py b/python/pyspark/pandas/data_type_ops/datetime_ops.py
index 071c22e..63d817b 100644
--- a/python/pyspark/pandas/data_type_ops/datetime_ops.py
+++ b/python/pyspark/pandas/data_type_ops/datetime_ops.py
@@ -23,7 +23,7 @@ import numpy as np
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 
-from pyspark.sql import functions as F, Column
+from pyspark.sql import Column
 from pyspark.sql.types import BooleanType, LongType, StringType, TimestampType
 
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
@@ -33,10 +33,11 @@ from pyspark.pandas.data_type_ops.base import (
     _as_bool_type,
     _as_categorical_type,
     _as_other_type,
+    _as_string_type,
     _sanitize_list_like,
 )
 from pyspark.pandas.spark import functions as SF
-from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef import pandas_on_spark_type
 
 
 class DatetimeOps(DataTypeOps):
@@ -133,18 +134,6 @@ class DatetimeOps(DataTypeOps):
         elif isinstance(spark_type, BooleanType):
             return _as_bool_type(index_ops, dtype)
         elif isinstance(spark_type, StringType):
-            if isinstance(dtype, extension_dtypes):
-                # seems like a pandas' bug?
-                scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
-                    index_ops.spark.column.cast(spark_type)
-                )
-            else:
-                null_str = str(pd.NaT)
-                casted = index_ops.spark.column.cast(spark_type)
-                scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
-            return index_ops._with_new_scol(
-                scol.alias(index_ops._internal.data_spark_column_names[0]),
-                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
-            )
+            return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
         else:
             return _as_other_type(index_ops, dtype, spark_type)
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index d9ba3c76..58c87ed 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -1556,16 +1556,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         if extension_object_dtypes_available:
             from pandas import StringDtype
 
+            # The behavior of casting datetime to nullable string is changed from pandas 1.3.
             if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-                pass
-            else:
                 self._check_extension(
                     psser.astype("M").astype("string"), pser.astype("M").astype("string")
                 )
                 self._check_extension(
                     psser.astype("M").astype(StringDtype()), pser.astype("M").astype(StringDtype())
                 )
+            else:
+                expected = ps.Series(["2020-10-27 00:00:01", None], name="x", dtype="string")
+                self._check_extension(psser.astype("M").astype("string"), expected)
+                self._check_extension(psser.astype("M").astype(StringDtype()), expected)
 
         with self.assertRaisesRegex(TypeError, "not understood"):
             psser.astype("int63")

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org