You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/08/01 01:37:58 UTC
[spark] branch master updated: [SPARK-35976][PYTHON] Adjust astype method for ExtensionDtype in pandas API on Spark

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new f04e991  [SPARK-35976][PYTHON] Adjust astype method for ExtensionDtype in pandas API on Spark
f04e991 is described below

commit f04e991e6a3bc70e92a18a9de7a1d9d6427df4ab
Author: Yikun Jiang <yi...@gmail.com>
AuthorDate: Sun Aug 1 10:37:25 2021 +0900

    [SPARK-35976][PYTHON] Adjust astype method for ExtensionDtype in pandas API on Spark
    
    ### What changes were proposed in this pull request?
    This patch makes `astype(str)` render null values as `<NA>` (i.e. `str(pd.NA)`) in BooleanExtensionOps and StringExtensionOps.
    
    ### Why are the changes needed?
    The pandas behavior:
    ```python
    >>> pd.Series([True, False, None], dtype="boolean").astype(str).tolist()
    ['True', 'False', '<NA>']
    >>> pd.Series(['s1', 's2', None], dtype="string").astype(str).tolist()
    ['s1', 's2', '<NA>']
    ```
    
    The pandas API on Spark behavior:
    ```python
    >>> import pandas as pd
    >>> from pyspark import pandas as ps
    
    # Before
    >>> ps.from_pandas(pd.Series([True, False, None], dtype="boolean")).astype(str).tolist()
    ['True', 'False', 'None']
    >>> ps.from_pandas(pd.Series(['s1', 's2', None], dtype="string")).astype(str).tolist()
    ['s1', 's2', 'None']
    
    # After
    >>> ps.from_pandas(pd.Series([True, False, None], dtype="boolean")).astype(str).tolist()
    ['True', 'False', '<NA>']
    >>> ps.from_pandas(pd.Series(['s1', 's2', None], dtype="string")).astype(str).tolist()
    ['s1', 's2', '<NA>']
    ```
    
    See more in [SPARK-35976](https://issues.apache.org/jira/browse/SPARK-35976)
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. For extension dtypes, `astype(str)` now returns `'<NA>'` instead of `'None'` for missing values, to follow the pandas behavior.
    
    ### How was this patch tested?
    Updated the existing unit tests to cover this scenario.
    
    Closes #33585 from Yikun/SPARK-35976.
    
    Authored-by: Yikun Jiang <yi...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/pandas/data_type_ops/boolean_ops.py            | 2 +-
 python/pyspark/pandas/data_type_ops/string_ops.py             | 3 ++-
 python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py | 3 +--
 python/pyspark/pandas/tests/data_type_ops/test_string_ops.py  | 3 +--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index 1f708ca..6b257e0 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -261,7 +261,7 @@ class BooleanOps(DataTypeOps):
                 )
                 nullable = index_ops.spark.nullable
             else:
-                null_str = str(None)
+                null_str = str(pd.NA) if isinstance(self, BooleanExtensionOps) else str(None)
                 casted = F.when(index_ops.spark.column, "True").otherwise("False")
                 scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
                 nullable = False
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py
index b2c4259..bfe36e1 100644
--- a/python/pyspark/pandas/data_type_ops/string_ops.py
+++ b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -136,7 +136,8 @@ class StringOps(DataTypeOps):
                 field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
             )
         elif isinstance(spark_type, StringType):
-            return _as_string_type(index_ops, dtype)
+            null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
+            return _as_string_type(index_ops, dtype, null_str=null_str)
         else:
             return _as_other_type(index_ops, dtype, spark_type)
 
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index a2fbbf1..b43d025 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -699,8 +699,7 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
     def test_astype(self):
         pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
 
-        # TODO(SPARK-35976): [True, False, <NA>] is returned in pandas
-        self.assert_eq(["True", "False", "None"], psser.astype(str).tolist())
+        self.assert_eq(pser.astype(str).tolist(), psser.astype(str).tolist())
 
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=[False, True])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
index 96c24af..f7c45cc 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
@@ -286,8 +286,7 @@ class StringExtensionOpsTest(StringOpsTest, PandasOnSparkTestCase, TestCasesUtil
         pser = self.pser
         psser = self.psser
 
-        # TODO(SPARK-35976): [x, y, z, <NA>] is returned in pandas
-        self.assert_eq(["x", "y", "z", "None"], self.psser.astype(str).tolist())
+        self.assert_eq(pser.astype(str).tolist(), psser.astype(str).tolist())
 
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=["x", "y"])

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org