You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2024/01/05 20:21:05 UTC

(spark) branch master updated: [SPARK-46515] Add MONTHNAME function

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e8dfcd3081ab [SPARK-46515] Add MONTHNAME function
e8dfcd3081ab is described below

commit e8dfcd3081abe16b2115bb2944a2b1cb547eca8e
Author: Stefan Kandic <st...@databricks.com>
AuthorDate: Fri Jan 5 23:20:51 2024 +0300

    [SPARK-46515] Add MONTHNAME function
    
    ### What changes were proposed in this pull request?
    
    Added MONTHNAME function which returns three-letter abbreviated month name for a given date to:
    - Scala API
    - Python API
    - R API
    - Spark Connect Scala Client
    - Spark Connect Python Client
    
    ### Why are the changes needed?
    
    for parity with Snowflake
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, new MONTHNAME function
    
    ### How was this patch tested?
    
    With newly added unit tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #44483 from stefankandic/monthname-function.
    
    Authored-by: Stefan Kandic <st...@databricks.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 R/pkg/R/functions.R                                |  14 ++++++++++
 R/pkg/R/generics.R                                 |   4 +++
 R/pkg/tests/fulltests/test_sparkSQL.R              |   1 +
 .../scala/org/apache/spark/sql/functions.scala     |   9 +++++++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   4 +++
 .../explain-results/function_monthname.explain     |   2 ++
 .../query-tests/queries/function_monthname.json    |  25 +++++++++++++++++
 .../queries/function_monthname.proto.bin           | Bin 0 -> 123 bytes
 .../source/reference/pyspark.sql/functions.rst     |   1 +
 python/pyspark/sql/connect/functions/builtin.py    |   7 +++++
 python/pyspark/sql/functions/builtin.py            |  30 +++++++++++++++++++++
 python/pyspark/sql/tests/test_functions.py         |   6 +++++
 .../sql/catalyst/analysis/FunctionRegistry.scala   |   1 +
 .../catalyst/expressions/datetimeExpressions.scala |  17 ++++++++++++
 .../spark/sql/catalyst/util/DateTimeUtils.scala    |  12 +++++++++
 .../expressions/DateExpressionsSuite.scala         |  11 ++++++++
 .../scala/org/apache/spark/sql/functions.scala     |   9 +++++++
 .../sql-functions/sql-expression-schema.md         |   1 +
 .../org/apache/spark/sql/DateFunctionsSuite.scala  |  12 +++++++++
 19 files changed, 166 insertions(+)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 14b4cb8cbdaa..0db25ad8739f 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1091,6 +1091,20 @@ setMethod("dayofyear",
             column(jc)
           })
 
+#' @details
+#' \code{monthname}: Extracts the three-letter abbreviated month name from a
+#' given date/timestamp/string.
+#'
+#' @rdname column_datetime_functions
+#' @aliases monthname monthname,Column-method
+#' @note monthname since 4.0.0
+setMethod("monthname",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "monthname", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{decode}: Computes the first argument into a string from a binary using the provided
 #' character set.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 2f46f63ba38a..92febc02710d 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1020,6 +1020,10 @@ setGeneric("dayofweek", function(x) { standardGeneric("dayofweek") })
 #' @name NULL
 setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
 
+#' @rdname column_datetime_functions
+#' @name NULL
+setGeneric("monthname", function(x) { standardGeneric("monthname") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("decode", function(x, charset) { standardGeneric("decode") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index c1a5292195af..118c853a00df 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2062,6 +2062,7 @@ test_that("date functions on a DataFrame", {
   expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51))
   expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014))
   expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12))
+  expect_equal(collect(select(df, monthname(df$b)))[, 1], c("Dec", "Dec", "Dec"))
   expect_equal(collect(select(df, last_day(df$b)))[, 1],
                c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31")))
   expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1],
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 061fca276a3d..9191633171f7 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -5944,6 +5944,15 @@ object functions {
   def to_unix_timestamp(timeExp: Column): Column =
     Column.fn("to_unix_timestamp", timeExp)
 
+  /**
+   * Extracts the three-letter abbreviated month name from a given date/timestamp/string.
+   *
+   * @group datetime_funcs
+   * @since 4.0.0
+   */
+  def monthname(timeExp: Column): Column =
+    Column.fn("monthname", timeExp)
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Collection functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 0a7768aa488b..3e6e4b7a3a02 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2125,6 +2125,10 @@ class PlanGenerationTestSuite
     fn.months_between(fn.current_date(), fn.col("d"), roundOff = true)
   }
 
+  temporalFunctionTest("monthname") {
+    fn.monthname(fn.col("d"))
+  }
+
   temporalFunctionTest("next_day") {
     fn.next_day(fn.col("d"), "Mon")
   }
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_monthname.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_monthname.explain
new file mode 100644
index 000000000000..672d076ef8bf
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_monthname.explain
@@ -0,0 +1,2 @@
+Project [monthname(d#0) AS monthname(d)#0]
++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.json b/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.json
new file mode 100644
index 000000000000..c5ad3485252f
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.json
@@ -0,0 +1,25 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "monthname",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "d"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin
new file mode 100644
index 000000000000..4518bb8d7425
Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_monthname.proto.bin differ
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst
index d1dba5f2bede..cbdadd9cd2bd 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -270,6 +270,7 @@ Date and Timestamp Functions
     make_ym_interval
     minute
     month
+    monthname
     months_between
     next_day
     now
diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index 461694362612..c9bf5fadd91c 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2980,6 +2980,13 @@ def weekday(col: "ColumnOrName") -> Column:
 weekday.__doc__ = pysparkfuncs.weekday.__doc__
 
 
+def monthname(col: "ColumnOrName") -> Column:
+    return _invoke_function_over_columns("monthname", col)
+
+
+monthname.__doc__ = pysparkfuncs.monthname.__doc__
+
+
 def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("extract", field, source)
 
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index bd151050e77f..bac59808cd3f 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -7273,6 +7273,36 @@ def weekday(col: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("weekday", col)
 
 
+@_try_remote_functions
+def monthname(col: "ColumnOrName") -> Column:
+    """
+    Returns the three-letter abbreviated month name from the given date.
+
+    .. versionadded:: 4.0.0
+
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        target date/timestamp column to work on.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        the three-letter abbreviation of month name for date/timestamp (Jan, Feb, Mar...)
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df.select(monthname('dt').alias('month')).show()
+    +-----+
+    |month|
+    +-----+
+    |  Apr|
+    +-----+
+    """
+    return _invoke_function_over_columns("monthname", col)
+
+
 @_try_remote_functions
 def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column:
     """
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index aaf58136508a..c67ddcde44ea 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -415,6 +415,12 @@ class FunctionsTestsMixin:
         row = df.select(F.dayofweek(df.date)).first()
         self.assertEqual(row[0], 2)
 
+    def test_monthname(self):
+        dt = datetime.datetime(2017, 11, 6)
+        df = self.spark.createDataFrame([Row(date=dt)])
+        row = df.select(F.monthname(df.date)).first()
+        self.assertEqual(row[0], "Nov")
+
     # Test added for SPARK-37738; change Python API to accept both col & int as input
     def test_date_add_function(self):
         dt = datetime.date(2021, 12, 27)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 3f61dfc5f2ed..a9b1178a8dba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -649,6 +649,7 @@ object FunctionRegistry {
     expression[WindowTime]("window_time"),
     expression[MakeDate]("make_date"),
     expression[MakeTimestamp]("make_timestamp"),
+    expression[MonthName]("monthname"),
     // We keep the 2 expression builders below to have different function docs.
     expressionBuilder("make_timestamp_ntz", MakeTimestampNTZExpressionBuilder, setAlias = true),
     expressionBuilder("make_timestamp_ltz", MakeTimestampLTZExpressionBuilder, setAlias = true),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 50a9dbf7f899..9be260a9f3da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -911,6 +911,23 @@ case class WeekOfYear(child: Expression) extends GetDateField {
     copy(child = newChild)
 }
 
+@ExpressionDescription(
+  usage = "_FUNC_(date) - Returns the three-letter abbreviated month name from the given date.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_('2008-02-20');
+       Feb
+  """,
+  group = "datetime_funcs",
+  since = "4.0.0")
+case class MonthName(child: Expression) extends GetDateField {
+  override val func = DateTimeUtils.getMonthName
+  override val funcName = "getMonthName"
+  override def dataType: DataType = StringType
+  override protected def withNewChildInternal(newChild: Expression): MonthName =
+    copy(child = newChild)
+}
+
 // scalastyle:off line.size.limit
 @ExpressionDescription(
   usage = "_FUNC_(timestamp, fmt) - Converts `timestamp` to a value of string in the format specified by the date format `fmt`.",
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 128582f71d11..cb93814e90e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.util
 
 import java.time._
+import java.time.format.TextStyle
 import java.time.temporal.{ChronoField, ChronoUnit, IsoFields, Temporal}
 import java.util.Locale
 import java.util.concurrent.TimeUnit._
@@ -196,6 +197,17 @@ object DateTimeUtils extends SparkDateTimeUtils {
     localDateToDays(daysToLocalDate(days).plusMonths(months))
   }
 
+  /**
+   * Returns the three-letter abbreviated month name for the given number of days since 1970-01-01.
+   */
+  def getMonthName(days: Int): UTF8String = {
+    val monthName = Month
+      .of(getMonth(days))
+      .getDisplayName(TextStyle.SHORT, DateFormatter.defaultLocale)
+
+    UTF8String.fromString(monthName)
+  }
+
   /**
    * Adds months to a timestamp at the given time zone. It converts the input timestamp to a local
    * timestamp at the given time zone, adds months, and converts the resulted local timestamp
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index d20101026903..09c2b6f5cc9b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -285,6 +285,17 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkConsistencyBetweenInterpretedAndCodegen(WeekOfYear, DateType)
   }
 
+  test("MonthName") {
+    checkEvaluation(MonthName(Literal.create(null, DateType)), null)
+    checkEvaluation(MonthName(Literal(d)), "Apr")
+    checkEvaluation(MonthName(Cast(Literal(date), DateType, UTC_OPT)), "Apr")
+    checkEvaluation(MonthName(Cast(Literal(ts), DateType, UTC_OPT)), "Nov")
+    checkEvaluation(MonthName(Cast(Literal("2011-05-06"), DateType, UTC_OPT)), "May")
+    checkEvaluation(MonthName(Literal(new Date(toMillis("2017-01-27 13:10:15")))), "Jan")
+    checkEvaluation(MonthName(Literal(new Date(toMillis("1582-12-15 13:10:15")))), "Dec")
+    checkConsistencyBetweenInterpretedAndCodegen(MonthName, DateType)
+  }
+
   test("DateFormat") {
     Seq("legacy", "corrected").foreach { legacyParserPolicy =>
       withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index d8b5a4b416c9..97963c4ed924 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -5738,6 +5738,15 @@ object functions {
   def to_unix_timestamp(timeExp: Column): Column =
     Column.fn("to_unix_timestamp", timeExp)
 
+  /**
+   * Extracts the three-letter abbreviated month name from a given date/timestamp/string.
+   *
+   * @group datetime_funcs
+   * @since 4.0.0
+   */
+  def monthname(timeExp: Column): Column =
+    Column.fn("monthname", timeExp)
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Collection functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 1a04fd57090d..cff7921db0e5 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -221,6 +221,7 @@
 | org.apache.spark.sql.catalyst.expressions.Minute | minute | SELECT minute('2009-07-30 12:58:59') | struct<minute(2009-07-30 12:58:59):int> |
 | org.apache.spark.sql.catalyst.expressions.MonotonicallyIncreasingID | monotonically_increasing_id | SELECT monotonically_increasing_id() | struct<monotonically_increasing_id():bigint> |
 | org.apache.spark.sql.catalyst.expressions.Month | month | SELECT month('2016-07-30') | struct<month(2016-07-30):int> |
+| org.apache.spark.sql.catalyst.expressions.MonthName | monthname | SELECT monthname('2008-02-20') | struct<monthname(2008-02-20):string> |
 | org.apache.spark.sql.catalyst.expressions.MonthsBetween | months_between | SELECT months_between('1997-02-28 10:30:00', '1996-10-30') | struct<months_between(1997-02-28 10:30:00, 1996-10-30, true):double> |
 | org.apache.spark.sql.catalyst.expressions.Multiply | * | SELECT 2 * 3 | struct<(2 * 3):int> |
 | org.apache.spark.sql.catalyst.expressions.Murmur3Hash | hash | SELECT hash('Spark', array(123), 2) | struct<hash(Spark, array(123), 2):int> |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index a8c304ff66d0..7d608f4a08c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -267,6 +267,18 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
       Row(2, 2, 0))
   }
 
+  test("monthname") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(monthname($"a"), monthname($"b"), monthname($"c")),
+      Row("Apr", "Apr", "Apr"))
+
+    checkAnswer(
+      df.selectExpr("monthname(a)", "monthname(b)", "monthname(c)"),
+      Row("Apr", "Apr", "Apr"))
+  }
+
   test("extract") {
     val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org