Posted to commits@spark.apache.org by gu...@apache.org on 2023/06/21 23:47:39 UTC
[spark] branch master updated: [SPARK-43710][PS][CONNECT] Support `functions.date_part` for Spark Connect
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new bc20e85b0e1 [SPARK-43710][PS][CONNECT] Support `functions.date_part` for Spark Connect
bc20e85b0e1 is described below
commit bc20e85b0e1da510cc091dbd03f210ef9fc56b25
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Thu Jun 22 08:47:27 2023 +0900
[SPARK-43710][PS][CONNECT] Support `functions.date_part` for Spark Connect
### What changes were proposed in this pull request?
Switch to the [newly added `date_part` function](https://github.com/apache/spark/commit/8dc02863b926b9e0780b994f9ee6c5c1058d49a0) and remove the JVM-based `date_part` helper from `pyspark/pandas/spark/functions.py`.
### Why are the changes needed?
To support Spark Connect.
### Does this PR introduce _any_ user-facing change?
Yes.
### How was this patch tested?
Existing unit tests.
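For reference, a minimal usage sketch of the built-in replacement (not part of this commit; per the PySpark docs, the `field` argument of `date_part` is passed as a literal column):

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.sql("SELECT TIMESTAMP '2023-06-22 08:47:27' AS ts")

# Extract the hour field from the timestamp column; yields 8 here.
df.select(F.date_part(F.lit("HOUR"), "ts").alias("hour")).show()
```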
Closes #41691 from zhengruifeng/ps_date_part.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/indexes/timedelta.py | 6 +++---
python/pyspark/pandas/spark/functions.py | 32 ++----------------------------
2 files changed, 5 insertions(+), 33 deletions(-)
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index 564c484d968..e46d602e985 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -150,9 +150,9 @@ class TimedeltaIndex(Index):
@no_type_check
def get_seconds(scol):
- hour_scol = SF.date_part("HOUR", scol)
- minute_scol = SF.date_part("MINUTE", scol)
- second_scol = SF.date_part("SECOND", scol)
+ hour_scol = F.date_part("HOUR", scol)
+ minute_scol = F.date_part("MINUTE", scol)
+ second_scol = F.date_part("SECOND", scol)
return (
F.when(
hour_scol < 0,
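For context, `get_seconds` in the hunk above combines the HOUR, MINUTE, and SECOND fields of a timedelta into seconds within the day. A standalone sketch of that pattern (the helper name is illustrative, and the negative-interval `F.when(...)` handling shown in the hunk is elided):

```python
import pyspark.sql.functions as F

def seconds_within_day(scol):
    # Combine the HOUR/MINUTE/SECOND parts of a day-time interval
    # column into total seconds within the day.
    hour = F.date_part(F.lit("HOUR"), scol)
    minute = F.date_part(F.lit("MINUTE"), scol)
    second = F.date_part(F.lit("SECOND"), scol)
    return hour * 3600 + minute * 60 + second
```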
diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py
index a904071aee7..b33705263c7 100644
--- a/python/pyspark/pandas/spark/functions.py
+++ b/python/pyspark/pandas/spark/functions.py
@@ -17,15 +17,11 @@
"""
Additional Spark functions used in pandas-on-Spark.
"""
-from typing import Union, no_type_check
+from typing import Union
from pyspark import SparkContext
import pyspark.sql.functions as F
-from pyspark.sql.column import (
- Column,
- _to_java_column,
- _create_column_from_literal,
-)
+from pyspark.sql.column import Column
# For supporting Spark Connect
from pyspark.sql.utils import is_remote
@@ -145,27 +141,3 @@ def repeat(col: Column, n: Union[int, Column]) -> Column:
"""
_n = F.lit(n) if isinstance(n, int) else n
return F.call_udf("repeat", col, _n)
-
-
-def date_part(field: Union[str, Column], source: Column) -> Column:
- """
- Extracts a part of the date/timestamp or interval source.
- """
- sc = SparkContext._active_spark_context
- field = (
- _to_java_column(field) if isinstance(field, Column) else _create_column_from_literal(field)
- )
- return _call_udf(sc, "date_part", field, _to_java_column(source))
-
-
-@no_type_check
-def _call_udf(sc, name, *cols):
- return Column(sc._jvm.functions.callUDF(name, _make_arguments(sc, *cols)))
-
-
-@no_type_check
-def _make_arguments(sc, *cols):
- java_arr = sc._gateway.new_array(sc._jvm.Column, len(cols))
- for i, col in enumerate(cols):
- java_arr[i] = col
- return java_arr
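The removed helper depended on `SparkContext._active_spark_context` and the Py4J gateway (`sc._jvm`), neither of which is available in a Spark Connect client; the built-in `F.date_part` constructs the expression on the client side instead. A hedged sketch of calling it through a Connect session (the endpoint URL is a placeholder):

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Placeholder Spark Connect endpoint; adjust for your deployment.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = spark.sql("SELECT TIMESTAMP '2023-06-22 08:47:27' AS ts")
# Works under Connect because no driver-side JVM access is required.
df.select(F.date_part(F.lit("MINUTE"), "ts")).show()
```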