You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/03/01 01:42:34 UTC
[spark] branch branch-3.4 updated: [SPARK-41868][CONNECT][PYTHON] Fix createDataFrame to support durations
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new e67d39f28f9 [SPARK-41868][CONNECT][PYTHON] Fix createDataFrame to support durations
e67d39f28f9 is described below
commit e67d39f28f931885d4204596389d245586d9863f
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Wed Mar 1 09:41:42 2023 +0800
[SPARK-41868][CONNECT][PYTHON] Fix createDataFrame to support durations
### What changes were proposed in this pull request?
Fixes `createDataFrame` to support durations.
### Why are the changes needed?
Currently the following command:
```py
spark.createDataFrame(pd.DataFrame({"a": [timedelta(microseconds=123)]}))
```
raises an error:
```
[UNSUPPORTED_ARROWTYPE] Unsupported arrow type Duration(NANOSECOND).
```
because Arrow infers a different type for `timedelta` objects than the one Spark expects.
### Does this PR introduce _any_ user-facing change?
`timedelta` objects will be properly converted to `DayTimeIntervalType`.
### How was this patch tested?
Enabled the related test.
Closes #40226 from ueshin/issues/SPARK-41868/duration.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
(cherry picked from commit b43b1337066a2d48d6de1df9f3955cddc105f57a)
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
python/pyspark/sql/connect/session.py | 4 ++++
python/pyspark/sql/tests/connect/test_parity_dataframe.py | 5 -----
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index d6ba9a36f96..d82dbcb2db0 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -43,6 +43,7 @@ import pyarrow as pa
from pandas.api.types import ( # type: ignore[attr-defined]
is_datetime64_dtype,
is_datetime64tz_dtype,
+ is_timedelta64_dtype,
)
from pyspark import SparkContext, SparkConf, __version__
@@ -60,6 +61,7 @@ from pyspark.sql.types import (
_merge_type,
Row,
DataType,
+ DayTimeIntervalType,
StructType,
AtomicType,
TimestampType,
@@ -245,6 +247,8 @@ class SparkSession:
arrow_types = [
to_arrow_type(TimestampType())
if is_datetime64_dtype(t) or is_datetime64tz_dtype(t)
+ else to_arrow_type(DayTimeIntervalType())
+ if is_timedelta64_dtype(t)
else None
for t in data.dtypes
]
diff --git a/python/pyspark/sql/tests/connect/test_parity_dataframe.py b/python/pyspark/sql/tests/connect/test_parity_dataframe.py
index da1172086cc..c5972ac02ae 100644
--- a/python/pyspark/sql/tests/connect/test_parity_dataframe.py
+++ b/python/pyspark/sql/tests/connect/test_parity_dataframe.py
@@ -22,11 +22,6 @@ from pyspark.testing.connectutils import ReusedConnectTestCase
class DataFrameParityTests(DataFrameTestsMixin, ReusedConnectTestCase):
- # TODO(SPARK-41868): Support data type Duration(NANOSECOND)
- @unittest.skip("Fails in Spark Connect, should enable.")
- def test_create_dataframe_from_pandas_with_day_time_interval(self):
- super().test_create_dataframe_from_pandas_with_day_time_interval()
-
# TODO(SPARK-41834): Implement SparkSession.conf
@unittest.skip("Fails in Spark Connect, should enable.")
def test_create_dataframe_from_pandas_with_dst(self):
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org