You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/06/20 13:47:20 UTC
[spark] branch master updated: [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8dc02863b92 [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API
8dc02863b92 is described below
commit 8dc02863b926b9e0780b994f9ee6c5c1058d49a0
Author: Jiaan Geng <be...@163.com>
AuthorDate: Tue Jun 20 21:47:05 2023 +0800
[SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API
### What changes were proposed in this pull request?
This PR follows up https://github.com/apache/spark/pull/41636 and https://github.com/apache/spark/pull/41651 and add extract, date_part, datepart to Scala, Python and Connect API.
### Why are the changes needed?
Add extract, date_part, datepart to Scala, Python and Connect API
### Does this PR introduce _any_ user-facing change?
'No'.
New feature.
### How was this patch tested?
New test cases.
Closes #41667 from beliefer/datetime_functions_followup.
Authored-by: Jiaan Geng <be...@163.com>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
.../scala/org/apache/spark/sql/functions.scala | 50 ++++++++++
.../apache/spark/sql/PlanGenerationTestSuite.scala | 12 +++
.../explain-results/function_date_part.explain | 2 +
.../explain-results/function_datepart.explain | 2 +
.../explain-results/function_extract.explain | 2 +
.../query-tests/queries/function_date_part.json | 29 ++++++
.../queries/function_date_part.proto.bin | Bin 0 -> 133 bytes
.../query-tests/queries/function_datepart.json | 29 ++++++
.../queries/function_datepart.proto.bin | Bin 0 -> 132 bytes
.../query-tests/queries/function_extract.json | 29 ++++++
.../query-tests/queries/function_extract.proto.bin | Bin 0 -> 131 bytes
.../source/reference/pyspark.sql/functions.rst | 3 +
python/pyspark/sql/connect/functions.py | 21 ++++
python/pyspark/sql/functions.py | 110 +++++++++++++++++++++
.../scala/org/apache/spark/sql/functions.scala | 41 ++++++++
.../org/apache/spark/sql/DateFunctionsSuite.scala | 68 +++++++++++++
16 files changed, 398 insertions(+)
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 2ac20bd5911..a3f4a273661 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4438,6 +4438,56 @@ object functions {
*/
def hour(e: Column): Column = Column.fn("hour", e)
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field
+ * selects which part of the source should be extracted.
+ * @param source
+ * a date/timestamp or interval column from where `field` should be extracted.
+ * @return
+ * a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def extract(field: Column, source: Column): Column = {
+ Column.fn("extract", field, source)
+ }
+
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field
+ * selects which part of the source should be extracted, and supported string values are as
+ * same as the fields of the equivalent function `extract`.
+ * @param source
+ * a date/timestamp or interval column from where `field` should be extracted.
+ * @return
+ * a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def date_part(field: Column, source: Column): Column = {
+ Column.fn("date_part", field, source)
+ }
+
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field
+ * selects which part of the source should be extracted, and supported string values are as
+ * same as the fields of the equivalent function `extract`.
+ * @param source
+ * a date/timestamp or interval column from where `field` should be extracted.
+ * @return
+ * a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def datepart(field: Column, source: Column): Column = {
+ Column.fn("datepart", field, source)
+ }
+
/**
* Returns the last day of the month which the given date belongs to. For example, input
* "2015-07-27" returns "2015-07-31" since July 31 is the last day of the month in July 2015.
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index e212720b84e..2dae9a99146 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2021,6 +2021,18 @@ class PlanGenerationTestSuite
fn.weekofyear(fn.col("d"))
}
+ temporalFunctionTest("extract") {
+ fn.extract(lit("year"), fn.col("d"))
+ }
+
+ temporalFunctionTest("date_part") {
+ fn.date_part(lit("year"), fn.col("d"))
+ }
+
+ temporalFunctionTest("datepart") {
+ fn.datepart(lit("year"), fn.col("d"))
+ }
+
temporalFunctionTest("from_unixtime") {
fn.from_unixtime(lit(1L))
}
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain
new file mode 100644
index 00000000000..dda95c323c6
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS date_part(year, d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain
new file mode 100644
index 00000000000..1434b1c6db4
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS datepart(year FROM d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain
new file mode 100644
index 00000000000..07f30cadb2e
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS extract(year FROM d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json
new file mode 100644
index 00000000000..5e8d075c4e2
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json
@@ -0,0 +1,29 @@
+{
+ "common": {
+ "planId": "1"
+ },
+ "project": {
+ "input": {
+ "common": {
+ "planId": "0"
+ },
+ "localRelation": {
+ "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+ }
+ },
+ "expressions": [{
+ "unresolvedFunction": {
+ "functionName": "date_part",
+ "arguments": [{
+ "literal": {
+ "string": "year"
+ }
+ }, {
+ "unresolvedAttribute": {
+ "unparsedIdentifier": "d"
+ }
+ }]
+ }
+ }]
+ }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin
new file mode 100644
index 00000000000..368ecb676c1
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin differ
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json
new file mode 100644
index 00000000000..cc4dca86742
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json
@@ -0,0 +1,29 @@
+{
+ "common": {
+ "planId": "1"
+ },
+ "project": {
+ "input": {
+ "common": {
+ "planId": "0"
+ },
+ "localRelation": {
+ "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+ }
+ },
+ "expressions": [{
+ "unresolvedFunction": {
+ "functionName": "datepart",
+ "arguments": [{
+ "literal": {
+ "string": "year"
+ }
+ }, {
+ "unresolvedAttribute": {
+ "unparsedIdentifier": "d"
+ }
+ }]
+ }
+ }]
+ }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin
new file mode 100644
index 00000000000..9d58fd3474d
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin differ
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json
new file mode 100644
index 00000000000..6ccdb2f9d75
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json
@@ -0,0 +1,29 @@
+{
+ "common": {
+ "planId": "1"
+ },
+ "project": {
+ "input": {
+ "common": {
+ "planId": "0"
+ },
+ "localRelation": {
+ "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+ }
+ },
+ "expressions": [{
+ "unresolvedFunction": {
+ "functionName": "extract",
+ "arguments": [{
+ "literal": {
+ "string": "year"
+ }
+ }, {
+ "unresolvedAttribute": {
+ "unparsedIdentifier": "d"
+ }
+ }]
+ }
+ }]
+ }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin
new file mode 100644
index 00000000000..91553c3b94b
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin differ
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst
index f41443b8274..8195f06bea8 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -130,9 +130,12 @@ Datetime Functions
dateadd
datediff
day
+ date_part
+ datepart
dayofmonth
dayofweek
dayofyear
+ extract
second
weekofyear
year
diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index 90800106cf7..de43d59773e 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -2740,6 +2740,27 @@ def weekday(col: "ColumnOrName") -> Column:
weekday.__doc__ = pysparkfuncs.weekday.__doc__
+def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ return _invoke_function_over_columns("extract", field, source)
+
+
+extract.__doc__ = pysparkfuncs.extract.__doc__
+
+
+def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ return _invoke_function_over_columns("date_part", field, source)
+
+
+extract.__doc__ = pysparkfuncs.extract.__doc__
+
+
+def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ return _invoke_function_over_columns("datepart", field, source)
+
+
+extract.__doc__ = pysparkfuncs.extract.__doc__
+
+
def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column:
return _invoke_function_over_columns("make_date", year, month, day)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 0cfc19615be..267c7e433bd 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -6029,6 +6029,116 @@ def weekday(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("weekday", col)
+@try_remote_functions
+def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ """
+ Extracts a part of the date/timestamp or interval source.
+
+ .. versionadded:: 3.5.0
+
+ Parameters
+ ----------
+ field : :class:`~pyspark.sql.Column` or str
+ selects which part of the source should be extracted.
+ source : :class:`~pyspark.sql.Column` or str
+ a date/timestamp or interval column from where `field` should be extracted.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ a part of the date/timestamp or interval source.
+
+ Examples
+ --------
+ >>> import datetime
+ >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+ >>> df.select(
+ ... extract(lit('YEAR'), 'ts').alias('year'),
+ ... extract(lit('month'), 'ts').alias('month'),
+ ... extract(lit('WEEK'), 'ts').alias('week'),
+ ... extract(lit('D'), 'ts').alias('day'),
+ ... extract(lit('M'), 'ts').alias('minute'),
+ ... extract(lit('S'), 'ts').alias('second')
+ ... ).collect()
+ [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+ """
+ return _invoke_function_over_columns("extract", field, source)
+
+
+@try_remote_functions
+def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ """
+ Extracts a part of the date/timestamp or interval source.
+
+ .. versionadded:: 3.5.0
+
+ Parameters
+ ----------
+ field : :class:`~pyspark.sql.Column` or str
+ selects which part of the source should be extracted, and supported string values
+ are as same as the fields of the equivalent function `extract`.
+ source : :class:`~pyspark.sql.Column` or str
+ a date/timestamp or interval column from where `field` should be extracted.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ a part of the date/timestamp or interval source.
+
+ Examples
+ --------
+ >>> import datetime
+ >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+ >>> df.select(
+ ... date_part(lit('YEAR'), 'ts').alias('year'),
+ ... date_part(lit('month'), 'ts').alias('month'),
+ ... date_part(lit('WEEK'), 'ts').alias('week'),
+ ... date_part(lit('D'), 'ts').alias('day'),
+ ... date_part(lit('M'), 'ts').alias('minute'),
+ ... date_part(lit('S'), 'ts').alias('second')
+ ... ).collect()
+ [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+ """
+ return _invoke_function_over_columns("date_part", field, source)
+
+
+@try_remote_functions
+def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+ """
+ Extracts a part of the date/timestamp or interval source.
+
+ .. versionadded:: 3.5.0
+
+ Parameters
+ ----------
+ field : :class:`~pyspark.sql.Column` or str
+ selects which part of the source should be extracted, and supported string values
+ are as same as the fields of the equivalent function `extract`.
+ source : :class:`~pyspark.sql.Column` or str
+ a date/timestamp or interval column from where `field` should be extracted.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ a part of the date/timestamp or interval source.
+
+ Examples
+ --------
+ >>> import datetime
+ >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+ >>> df.select(
+ ... datepart(lit('YEAR'), 'ts').alias('year'),
+ ... datepart(lit('month'), 'ts').alias('month'),
+ ... datepart(lit('WEEK'), 'ts').alias('week'),
+ ... datepart(lit('D'), 'ts').alias('day'),
+ ... datepart(lit('M'), 'ts').alias('minute'),
+ ... datepart(lit('S'), 'ts').alias('second')
+ ... ).collect()
+ [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+ """
+ return _invoke_function_over_columns("datepart", field, source)
+
+
@try_remote_functions
def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column:
"""
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 7c3f65e2495..6c1b1262c53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4547,6 +4547,47 @@ object functions {
*/
def hour(e: Column): Column = withExpr { Hour(e.expr) }
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field selects which part of the source should be extracted.
+ * @param source a date/timestamp or interval column from where `field` should be extracted.
+ * @return a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def extract(field: Column, source: Column): Column = withExpr {
+ UnresolvedFunction("extract", Seq(field.expr, source.expr), isDistinct = false)
+ }
+
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field selects which part of the source should be extracted, and supported string values
+ * are as same as the fields of the equivalent function `extract`.
+ * @param source a date/timestamp or interval column from where `field` should be extracted.
+ * @return a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def date_part(field: Column, source: Column): Column = withExpr {
+ UnresolvedFunction("date_part", Seq(field.expr, source.expr), isDistinct = false)
+ }
+
+ /**
+ * Extracts a part of the date/timestamp or interval source.
+ *
+ * @param field selects which part of the source should be extracted, and supported string values
+ * are as same as the fields of the equivalent function `EXTRACT`.
+ * @param source a date/timestamp or interval column from where `field` should be extracted.
+ * @return a part of the date/timestamp or interval source
+ * @group datetime_funcs
+ * @since 3.5.0
+ */
+ def datepart(field: Column, source: Column): Column = withExpr {
+ UnresolvedFunction("datepart", Seq(field.expr, source.expr), isDistinct = false)
+ }
+
/**
* Returns the last day of the month which the given date belongs to.
* For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 7b13af8bf7d..d5d2fe8a5d3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -267,6 +267,74 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
Row(2, 2, 0))
}
+ test("extract") {
+ val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+ checkAnswer(
+ df.select(
+ extract(lit("YEAR"), $"a"),
+ extract(lit("MONTH"), $"a"),
+ extract(lit("week"), $"b"),
+ extract(lit("day"), $"b"),
+ extract(lit("MINUTE"), $"c"),
+ extract(lit("SECONDS"), $"c")),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+
+ checkAnswer(
+ df.selectExpr(
+ "extract(YEAR FROM a)",
+ "extract(MONTH FROM a)",
+ "extract(week FROM b)",
+ "extract(day FROM b)",
+ "extract(MINUTE FROM c)",
+ "extract(SECONDS FROM c)"),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+ }
+
+ test("date_part & datepart") {
+ val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+ checkAnswer(
+ df.select(
+ date_part(lit("YEAR"), $"a"),
+ date_part(lit("MONTH"), $"a"),
+ date_part(lit("week"), $"b"),
+ date_part(lit("day"), $"b"),
+ date_part(lit("MINUTE"), $"c"),
+ date_part(lit("SECONDS"), $"c")),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+
+ checkAnswer(
+ df.select(
+ datepart(lit("YEAR"), $"a"),
+ datepart(lit("MONTH"), $"a"),
+ datepart(lit("week"), $"b"),
+ datepart(lit("day"), $"b"),
+ datepart(lit("MINUTE"), $"c"),
+ datepart(lit("SECONDS"), $"c")),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+
+ checkAnswer(
+ df.selectExpr(
+ "date_part('YEAR', a)",
+ "date_part('MONTH', a)",
+ "date_part('week', b)",
+ "date_part('day', b)",
+ "date_part('MINUTE', c)",
+ "date_part('SECONDS', c)"),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+
+ checkAnswer(
+ df.selectExpr(
+ "datepart('YEAR', a)",
+ "datepart('MONTH', a)",
+ "datepart('week', b)",
+ "datepart('day', b)",
+ "datepart('MINUTE', c)",
+ "datepart('SECONDS', c)"),
+ Row(2015, 4, 15, 8, 10, 15.000000))
+ }
+
test("function date_add & dateadd") {
val st1 = "2015-06-01 12:34:56"
val st2 = "2015-06-02 12:34:56"
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org