Posted to commits@spark.apache.org by ru...@apache.org on 2023/06/20 13:47:20 UTC

[spark] branch master updated: [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8dc02863b92 [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API
8dc02863b92 is described below

commit 8dc02863b926b9e0780b994f9ee6c5c1058d49a0
Author: Jiaan Geng <be...@163.com>
AuthorDate: Tue Jun 20 21:47:05 2023 +0800

    [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API
    
    ### What changes were proposed in this pull request?
    This PR follows up https://github.com/apache/spark/pull/41636 and https://github.com/apache/spark/pull/41651, adding extract, date_part and datepart to the Scala, Python and Connect APIs.
    
    ### Why are the changes needed?
    The SQL functions extract, date_part and datepart are not yet exposed through the Scala, Python and Connect APIs; adding them keeps those APIs in parity with SQL.
    
    ### Does this PR introduce _any_ user-facing change?
    'No'. This change only adds new functions; existing user-facing behavior is unchanged.
    
    ### How was this patch tested?
    New test cases.
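    
    For illustration, a minimal Scala usage sketch of the new functions (the
    DataFrame and column below are hypothetical, not part of this patch):
    
        import org.apache.spark.sql.functions._
        import spark.implicits._  // assumes an active SparkSession named `spark`
    
        val df = Seq(java.sql.Date.valueOf("2015-04-08")).toDF("d")
        df.select(
          extract(lit("YEAR"), $"d"),     // extract(YEAR FROM d)
          date_part(lit("MONTH"), $"d"),  // date_part('MONTH', d)
          datepart(lit("DAY"), $"d")      // datepart('DAY', d)
        ).show()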
    
    Closes #41667 from beliefer/datetime_functions_followup.
    
    Authored-by: Jiaan Geng <be...@163.com>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 .../scala/org/apache/spark/sql/functions.scala     |  50 ++++++++++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |  12 +++
 .../explain-results/function_date_part.explain     |   2 +
 .../explain-results/function_datepart.explain      |   2 +
 .../explain-results/function_extract.explain       |   2 +
 .../query-tests/queries/function_date_part.json    |  29 ++++++
 .../queries/function_date_part.proto.bin           | Bin 0 -> 133 bytes
 .../query-tests/queries/function_datepart.json     |  29 ++++++
 .../queries/function_datepart.proto.bin            | Bin 0 -> 132 bytes
 .../query-tests/queries/function_extract.json      |  29 ++++++
 .../query-tests/queries/function_extract.proto.bin | Bin 0 -> 131 bytes
 .../source/reference/pyspark.sql/functions.rst     |   3 +
 python/pyspark/sql/connect/functions.py            |  21 ++++
 python/pyspark/sql/functions.py                    | 110 +++++++++++++++++++++
 .../scala/org/apache/spark/sql/functions.scala     |  41 ++++++++
 .../org/apache/spark/sql/DateFunctionsSuite.scala  |  68 +++++++++++++
 16 files changed, 398 insertions(+)

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 2ac20bd5911..a3f4a273661 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4438,6 +4438,56 @@ object functions {
    */
   def hour(e: Column): Column = Column.fn("hour", e)
 
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field
+   *   selects which part of the source should be extracted.
+   * @param source
+   *   a date/timestamp or interval column from where `field` should be extracted.
+   * @return
+   *   a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def extract(field: Column, source: Column): Column = {
+    Column.fn("extract", field, source)
+  }
+
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field
+   *   selects which part of the source should be extracted, and supported string values are
+   *   the same as the fields of the equivalent function `extract`.
+   * @param source
+   *   a date/timestamp or interval column from where `field` should be extracted.
+   * @return
+   *   a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def date_part(field: Column, source: Column): Column = {
+    Column.fn("date_part", field, source)
+  }
+
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field
+   *   selects which part of the source should be extracted, and supported string values are
+   *   the same as the fields of the equivalent function `extract`.
+   * @param source
+   *   a date/timestamp or interval column from where `field` should be extracted.
+   * @return
+   *   a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def datepart(field: Column, source: Column): Column = {
+    Column.fn("datepart", field, source)
+  }
+
   /**
    * Returns the last day of the month which the given date belongs to. For example, input
    * "2015-07-27" returns "2015-07-31" since July 31 is the last day of the month in July 2015.
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index e212720b84e..2dae9a99146 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2021,6 +2021,18 @@ class PlanGenerationTestSuite
     fn.weekofyear(fn.col("d"))
   }
 
+  temporalFunctionTest("extract") {
+    fn.extract(lit("year"), fn.col("d"))
+  }
+
+  temporalFunctionTest("date_part") {
+    fn.date_part(lit("year"), fn.col("d"))
+  }
+
+  temporalFunctionTest("datepart") {
+    fn.datepart(lit("year"), fn.col("d"))
+  }
+
   temporalFunctionTest("from_unixtime") {
     fn.from_unixtime(lit(1L))
   }
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain
new file mode 100644
index 00000000000..dda95c323c6
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS date_part(year, d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain
new file mode 100644
index 00000000000..1434b1c6db4
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS datepart(year FROM d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain
new file mode 100644
index 00000000000..07f30cadb2e
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain
@@ -0,0 +1,2 @@
+Project [year(d#0) AS extract(year FROM d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
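
Note that all three variants resolve to the same underlying year(d) expression;
only the result alias differs, as the explain results above show. A hedged
sketch of reproducing one of these plans, assuming a DataFrame `df` with a date
column `d`:

    import org.apache.spark.sql.functions._
    df.select(extract(lit("year"), col("d"))).explain(true)
    // Compare with function_extract.explain above:
    //   Project [year(d#0) AS extract(year FROM d)#0]
    //   +- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
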
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json
new file mode 100644
index 00000000000..5e8d075c4e2
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json
@@ -0,0 +1,29 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "date_part",
+        "arguments": [{
+          "literal": {
+            "string": "year"
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "d"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin
new file mode 100644
index 00000000000..368ecb676c1
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin differ
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json
new file mode 100644
index 00000000000..cc4dca86742
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json
@@ -0,0 +1,29 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "datepart",
+        "arguments": [{
+          "literal": {
+            "string": "year"
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "d"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin
new file mode 100644
index 00000000000..9d58fd3474d
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin differ
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json
new file mode 100644
index 00000000000..6ccdb2f9d75
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json
@@ -0,0 +1,29 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "extract",
+        "arguments": [{
+          "literal": {
+            "string": "year"
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "d"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin
new file mode 100644
index 00000000000..91553c3b94b
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin differ
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst
index f41443b8274..8195f06bea8 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -130,9 +130,12 @@ Datetime Functions
     dateadd
     datediff
     day
+    date_part
+    datepart
     dayofmonth
     dayofweek
     dayofyear
+    extract
     second
     weekofyear
     year
diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index 90800106cf7..de43d59773e 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -2740,6 +2740,27 @@ def weekday(col: "ColumnOrName") -> Column:
 weekday.__doc__ = pysparkfuncs.weekday.__doc__
 
 
+def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    return _invoke_function_over_columns("extract", field, source)
+
+
+extract.__doc__ = pysparkfuncs.extract.__doc__
+
+
+def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    return _invoke_function_over_columns("date_part", field, source)
+
+
+date_part.__doc__ = pysparkfuncs.date_part.__doc__
+
+
+def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    return _invoke_function_over_columns("datepart", field, source)
+
+
+datepart.__doc__ = pysparkfuncs.datepart.__doc__
+
+
 def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("make_date", year, month, day)
 
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 0cfc19615be..267c7e433bd 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -6029,6 +6029,116 @@ def weekday(col: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("weekday", col)
 
 
+@try_remote_functions
+def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    """
+    Extracts a part of the date/timestamp or interval source.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    field : :class:`~pyspark.sql.Column` or str
+        selects which part of the source should be extracted.
+    source : :class:`~pyspark.sql.Column` or str
+        a date/timestamp or interval column from where `field` should be extracted.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        a part of the date/timestamp or interval source.
+
+    Examples
+    --------
+    >>> import datetime
+    >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+    >>> df.select(
+    ...     extract(lit('YEAR'), 'ts').alias('year'),
+    ...     extract(lit('month'), 'ts').alias('month'),
+    ...     extract(lit('WEEK'), 'ts').alias('week'),
+    ...     extract(lit('D'), 'ts').alias('day'),
+    ...     extract(lit('M'), 'ts').alias('minute'),
+    ...     extract(lit('S'), 'ts').alias('second')
+    ... ).collect()
+    [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+    """
+    return _invoke_function_over_columns("extract", field, source)
+
+
+@try_remote_functions
+def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    """
+    Extracts a part of the date/timestamp or interval source.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    field : :class:`~pyspark.sql.Column` or str
+        selects which part of the source should be extracted, and supported string values
+        are the same as the fields of the equivalent function `extract`.
+    source : :class:`~pyspark.sql.Column` or str
+        a date/timestamp or interval column from where `field` should be extracted.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        a part of the date/timestamp or interval source.
+
+    Examples
+    --------
+    >>> import datetime
+    >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+    >>> df.select(
+    ...     date_part(lit('YEAR'), 'ts').alias('year'),
+    ...     date_part(lit('month'), 'ts').alias('month'),
+    ...     date_part(lit('WEEK'), 'ts').alias('week'),
+    ...     date_part(lit('D'), 'ts').alias('day'),
+    ...     date_part(lit('M'), 'ts').alias('minute'),
+    ...     date_part(lit('S'), 'ts').alias('second')
+    ... ).collect()
+    [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+    """
+    return _invoke_function_over_columns("date_part", field, source)
+
+
+@try_remote_functions
+def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column:
+    """
+    Extracts a part of the date/timestamp or interval source.
+
+    .. versionadded:: 3.5.0
+
+    Parameters
+    ----------
+    field : :class:`~pyspark.sql.Column` or str
+        selects which part of the source should be extracted, and supported string values
+        are the same as the fields of the equivalent function `extract`.
+    source : :class:`~pyspark.sql.Column` or str
+        a date/timestamp or interval column from where `field` should be extracted.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        a part of the date/timestamp or interval source.
+
+    Examples
+    --------
+    >>> import datetime
+    >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])
+    >>> df.select(
+    ...     datepart(lit('YEAR'), 'ts').alias('year'),
+    ...     datepart(lit('month'), 'ts').alias('month'),
+    ...     datepart(lit('WEEK'), 'ts').alias('week'),
+    ...     datepart(lit('D'), 'ts').alias('day'),
+    ...     datepart(lit('M'), 'ts').alias('minute'),
+    ...     datepart(lit('S'), 'ts').alias('second')
+    ... ).collect()
+    [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]
+    """
+    return _invoke_function_over_columns("datepart", field, source)
+
+
 @try_remote_functions
 def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column:
     """
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 7c3f65e2495..6c1b1262c53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4547,6 +4547,47 @@ object functions {
    */
   def hour(e: Column): Column = withExpr { Hour(e.expr) }
 
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field selects which part of the source should be extracted.
+   * @param source a date/timestamp or interval column from where `field` should be extracted.
+   * @return a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def extract(field: Column, source: Column): Column = withExpr {
+    UnresolvedFunction("extract", Seq(field.expr, source.expr), isDistinct = false)
+  }
+
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field selects which part of the source should be extracted, and supported string values
+   *              are the same as the fields of the equivalent function `extract`.
+   * @param source a date/timestamp or interval column from where `field` should be extracted.
+   * @return a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def date_part(field: Column, source: Column): Column = withExpr {
+    UnresolvedFunction("date_part", Seq(field.expr, source.expr), isDistinct = false)
+  }
+
+  /**
+   * Extracts a part of the date/timestamp or interval source.
+   *
+   * @param field selects which part of the source should be extracted, and supported string values
+   *              are the same as the fields of the equivalent function `extract`.
+   * @param source a date/timestamp or interval column from where `field` should be extracted.
+   * @return a part of the date/timestamp or interval source
+   * @group datetime_funcs
+   * @since 3.5.0
+   */
+  def datepart(field: Column, source: Column): Column = withExpr {
+    UnresolvedFunction("datepart", Seq(field.expr, source.expr), isDistinct = false)
+  }
+
   /**
    * Returns the last day of the month which the given date belongs to.
    * For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the
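
As the tests below verify, the new DataFrame API calls are interchangeable with
the equivalent SQL expression syntax. A hedged equivalence sketch, assuming a
DataFrame `df` with a timestamp column `c`:

    import org.apache.spark.sql.functions._
    df.select(date_part(lit("MINUTE"), col("c")))  // new DataFrame API
    df.selectExpr("date_part('MINUTE', c)")        // equivalent SQL expression
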
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 7b13af8bf7d..d5d2fe8a5d3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -267,6 +267,74 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
       Row(2, 2, 0))
   }
 
+  test("extract") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(
+        extract(lit("YEAR"), $"a"),
+        extract(lit("MONTH"), $"a"),
+        extract(lit("week"), $"b"),
+        extract(lit("day"), $"b"),
+        extract(lit("MINUTE"), $"c"),
+        extract(lit("SECONDS"), $"c")),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+
+    checkAnswer(
+      df.selectExpr(
+        "extract(YEAR FROM a)",
+        "extract(MONTH FROM a)",
+        "extract(week FROM b)",
+        "extract(day FROM b)",
+        "extract(MINUTE FROM c)",
+        "extract(SECONDS FROM c)"),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+  }
+
+  test("date_part & datepart") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(
+        date_part(lit("YEAR"), $"a"),
+        date_part(lit("MONTH"), $"a"),
+        date_part(lit("week"), $"b"),
+        date_part(lit("day"), $"b"),
+        date_part(lit("MINUTE"), $"c"),
+        date_part(lit("SECONDS"), $"c")),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+
+    checkAnswer(
+      df.select(
+        datepart(lit("YEAR"), $"a"),
+        datepart(lit("MONTH"), $"a"),
+        datepart(lit("week"), $"b"),
+        datepart(lit("day"), $"b"),
+        datepart(lit("MINUTE"), $"c"),
+        datepart(lit("SECONDS"), $"c")),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+
+    checkAnswer(
+      df.selectExpr(
+        "date_part('YEAR', a)",
+        "date_part('MONTH', a)",
+        "date_part('week', b)",
+        "date_part('day', b)",
+        "date_part('MINUTE', c)",
+        "date_part('SECONDS', c)"),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+
+    checkAnswer(
+      df.selectExpr(
+        "datepart('YEAR', a)",
+        "datepart('MONTH', a)",
+        "datepart('week', b)",
+        "datepart('day', b)",
+        "datepart('MINUTE', c)",
+        "datepart('SECONDS', c)"),
+      Row(2015, 4, 15, 8, 10, 15.000000))
+  }
+
   test("function date_add & dateadd") {
     val st1 = "2015-06-01 12:34:56"
     val st2 = "2015-06-02 12:34:56"

