You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/08/01 01:37:58 UTC
[spark] branch master updated: [SPARK-35976][PYTHON] Adjust astype
method for ExtensionDtype in pandas API on Spark
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f04e991 [SPARK-35976][PYTHON] Adjust astype method for ExtensionDtype in pandas API on Spark
f04e991 is described below
commit f04e991e6a3bc70e92a18a9de7a1d9d6427df4ab
Author: Yikun Jiang <yi...@gmail.com>
AuthorDate: Sun Aug 1 10:37:25 2021 +0900
[SPARK-35976][PYTHON] Adjust astype method for ExtensionDtype in pandas API on Spark
### What changes were proposed in this pull request?
This patch sets the string representation of missing values to `<NA>` (`str(pd.NA)`) when calling `astype(str)` in BooleanExtensionOps and StringExtensionOps.
### Why are the changes needed?
The pandas behavior:
```python
>>> pd.Series([True, False, None], dtype="boolean").astype(str).tolist()
['True', 'False', '<NA>']
>>> pd.Series(['s1', 's2', None], dtype="string").astype(str).tolist()
['s1', 's2', '<NA>']
```
pandas on spark
```python
>>> import pandas as pd
>>> from pyspark import pandas as ps
# Before
>>> ps.from_pandas(pd.Series([True, False, None], dtype="boolean")).astype(str).tolist()
['True', 'False', 'None']
>>> ps.from_pandas(pd.Series(['s1', 's2', None], dtype="string")).astype(str).tolist()
['s1', 's2', 'None']
# After
>>> ps.from_pandas(pd.Series([True, False, None], dtype="boolean")).astype(str).tolist()
['True', 'False', '<NA>']
>>> ps.from_pandas(pd.Series(['s1', 's2', None], dtype="string")).astype(str).tolist()
['s1', 's2', '<NA>']
```
See more in [SPARK-35976](https://issues.apache.org/jira/browse/SPARK-35976)
### Does this PR introduce _any_ user-facing change?
Yes, `astype(str)` on extension dtypes now returns `<NA>` instead of `None` for missing values, following the pandas behavior.
### How was this patch tested?
Changed the unit tests to cover this scenario.
Closes #33585 from Yikun/SPARK-35976.
Authored-by: Yikun Jiang <yi...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/data_type_ops/boolean_ops.py | 2 +-
python/pyspark/pandas/data_type_ops/string_ops.py | 3 ++-
python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py | 3 +--
python/pyspark/pandas/tests/data_type_ops/test_string_ops.py | 3 +--
4 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index 1f708ca..6b257e0 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -261,7 +261,7 @@ class BooleanOps(DataTypeOps):
)
nullable = index_ops.spark.nullable
else:
- null_str = str(None)
+ null_str = str(pd.NA) if isinstance(self, BooleanExtensionOps) else str(None)
casted = F.when(index_ops.spark.column, "True").otherwise("False")
scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
nullable = False
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py
index b2c4259..bfe36e1 100644
--- a/python/pyspark/pandas/data_type_ops/string_ops.py
+++ b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -136,7 +136,8 @@ class StringOps(DataTypeOps):
field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
)
elif isinstance(spark_type, StringType):
- return _as_string_type(index_ops, dtype)
+ null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
+ return _as_string_type(index_ops, dtype, null_str=null_str)
else:
return _as_other_type(index_ops, dtype, spark_type)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index a2fbbf1..b43d025 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -699,8 +699,7 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
def test_astype(self):
pser, psser = self.boolean_pdf["this"], self.boolean_psdf["this"]
- # TODO(SPARK-35976): [True, False, <NA>] is returned in pandas
- self.assert_eq(["True", "False", "None"], psser.astype(str).tolist())
+ self.assert_eq(pser.astype(str).tolist(), psser.astype(str).tolist())
self.assert_eq(pser.astype("category"), psser.astype("category"))
cat_type = CategoricalDtype(categories=[False, True])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
index 96c24af..f7c45cc 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
@@ -286,8 +286,7 @@ class StringExtensionOpsTest(StringOpsTest, PandasOnSparkTestCase, TestCasesUtil
pser = self.pser
psser = self.psser
- # TODO(SPARK-35976): [x, y, z, <NA>] is returned in pandas
- self.assert_eq(["x", "y", "z", "None"], self.psser.astype(str).tolist())
+ self.assert_eq(pser.astype(str).tolist(), psser.astype(str).tolist())
self.assert_eq(pser.astype("category"), psser.astype("category"))
cat_type = CategoricalDtype(categories=["x", "y"])
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org