Posted to commits@spark.apache.org by ru...@apache.org on 2023/06/20 06:09:16 UTC

[spark] branch master updated: [SPARK-43944][SQL][CONNECT][PYTHON][FOLLOW-UP] Make `startswith` & `endswith` support binary type

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 6b36a9368d6 [SPARK-43944][SQL][CONNECT][PYTHON][FOLLOW-UP] Make `startswith` & `endswith` support binary type
6b36a9368d6 is described below

commit 6b36a9368d6e97f7f1f94c4ca7f6ee76dcd0015f
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Tue Jun 20 14:08:56 2023 +0800

    [SPARK-43944][SQL][CONNECT][PYTHON][FOLLOW-UP] Make `startswith` & `endswith` support binary type
    
    ### What changes were proposed in this pull request?
    Make `startswith` and `endswith` support the binary type:
    1. In the Connect API, `startswith` and `endswith` already support the binary type.
    2. In the vanilla API, add binary support by routing through `call_udf` (see the sketch below).
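    
    A minimal sketch of the resulting behavior, assuming an active `spark` session
    (the column names and data are illustrative, not taken from this patch):
    
        import org.apache.spark.sql.functions._
        import spark.implicits._  // for .toDF on local Seqs
    
        val df = Seq((Array[Byte](1, 2, 3), Array[Byte](1, 2))).toDF("s", "p")
        df.select(startswith(col("s"), col("p"))).show()  // byte-wise prefix match: true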
    
    ### Why are the changes needed?
    For parity between the Connect API and the vanilla API.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, the vanilla `startswith` and `endswith` functions now accept BINARY inputs in addition to STRING.
    
    ### How was this patch tested?
    Added unit tests.
    
    Closes #41659 from zhengruifeng/sql_func_sw.
    
    Lead-authored-by: Ruifeng Zheng <ru...@apache.org>
    Co-authored-by: Ruifeng Zheng <ru...@foxmail.com>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 .../scala/org/apache/spark/sql/functions.scala     | 14 +++------
 python/pyspark/sql/functions.py                    | 36 ++++++++++++++++------
 .../scala/org/apache/spark/sql/functions.scala     | 24 ++++++---------
 .../apache/spark/sql/StringFunctionsSuite.scala    | 14 +++++++--
 4 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 93cf8f521b2..2ac20bd5911 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3945,11 +3945,8 @@ object functions {
 
   /**
    * Returns a boolean. The value is True if str ends with suffix. Returns NULL if either input
-   * expression is NULL. Otherwise, returns False. Both str or suffix must be of STRING type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `endswith` in SQL supports both
-   *   STRING and BINARY.
+   * expression is NULL. Otherwise, returns False. Both str and suffix must be of STRING or
+   * BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
@@ -3959,11 +3956,8 @@ object functions {
 
   /**
    * Returns a boolean. The value is True if str starts with prefix. Returns NULL if either input
-   * expression is NULL. Otherwise, returns False. Both str or prefix must be of STRING type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `startswith` in SQL supports both
-   *   STRING and BINARY.
+   * expression is NULL. Otherwise, returns False. Both str and prefix must be of STRING or
+   * BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3eaccdc1ea1..0cfc19615be 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -9660,11 +9660,6 @@ def endswith(str: "ColumnOrName", suffix: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.5.0
 
-    Notes
-    -----
-    Only STRING type is supported in this function,
-    while `startswith` in SQL supports both STRING and BINARY.
-
     Parameters
     ----------
     str : :class:`~pyspark.sql.Column` or str
@@ -9677,6 +9672,19 @@ def endswith(str: "ColumnOrName", suffix: "ColumnOrName") -> Column:
     >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
     >>> df.select(endswith(df.a, df.b).alias('r')).collect()
     [Row(r=False)]
+
+    >>> df = spark.createDataFrame([("414243", "4243",)], ["e", "f"])
+    >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
+    >>> df.printSchema()
+    root
+     |-- e: binary (nullable = true)
+     |-- f: binary (nullable = true)
+    >>> df.select(endswith("e", "f"), endswith("f", "e")).show()
+    +--------------+--------------+
+    |endswith(e, f)|endswith(f, e)|
+    +--------------+--------------+
+    |          true|         false|
+    +--------------+--------------+
     """
     return _invoke_function_over_columns("endswith", str, suffix)
 
@@ -9690,11 +9698,6 @@ def startswith(str: "ColumnOrName", prefix: "ColumnOrName") -> Column:
 
     .. versionadded:: 3.5.0
 
-    Notes
-    -----
-    Only STRING type is supported in this function,
-    while `startswith` in SQL supports both STRING and BINARY.
-
     Parameters
     ----------
     str : :class:`~pyspark.sql.Column` or str
@@ -9707,6 +9710,19 @@ def startswith(str: "ColumnOrName", prefix: "ColumnOrName") -> Column:
     >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])
     >>> df.select(startswith(df.a, df.b).alias('r')).collect()
     [Row(r=True)]
+
+    >>> df = spark.createDataFrame([("414243", "4142",)], ["e", "f"])
+    >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))
+    >>> df.printSchema()
+    root
+     |-- e: binary (nullable = true)
+     |-- f: binary (nullable = true)
+    >>> df.select(startswith("e", "f"), startswith("f", "e")).show()
+    +----------------+----------------+
+    |startswith(e, f)|startswith(f, e)|
+    +----------------+----------------+
+    |            true|           false|
+    +----------------+----------------+
     """
     return _invoke_function_over_columns("startswith", str, prefix)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index a18c6969d47..68b81810da4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4046,33 +4046,29 @@ object functions {
   /**
    * Returns a boolean. The value is True if str ends with suffix.
    * Returns NULL if either input expression is NULL. Otherwise, returns False.
-   * Both str or suffix must be of STRING type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `endswith` in SQL supports both
-   *   STRING and BINARY.
+   * Both str and suffix must be of STRING or BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
    */
-  def endswith(str: Column, suffix: Column): Column = withExpr {
-    EndsWith(str.expr, suffix.expr)
+  def endswith(str: Column, suffix: Column): Column = {
+    // The 'EndsWith' expression only supports StringType;
+    // use 'call_udf' to support both StringType and BinaryType.
+    call_udf("endswith", str, suffix)
   }
 
   /**
    * Returns a boolean. The value is True if str starts with prefix.
    * Returns NULL if either input expression is NULL. Otherwise, returns False.
-   * Both str or prefix must be of STRING type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `endswith` in SQL supports both
-   *   STRING and BINARY.
+   * Both str and prefix must be of STRING or BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
    */
-  def startswith(str: Column, prefix: Column): Column = withExpr {
-    StartsWith(str.expr, prefix.expr)
+  def startswith(str: Column, prefix: Column): Column = {
+    // The 'StartsWith' expression only supports StringType;
+    // use 'call_udf' to support both StringType and BinaryType.
+    call_udf("startswith", str, prefix)
   }
 
   /**
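
Routing through `call_udf` resolves the function name against the session's function
registry at analysis time, so these methods now pick up the registered SQL implementations
of `startswith`/`endswith`, which accept both STRING and BINARY. A rough sketch of the
equivalence, assuming an active `spark` session (column names illustrative):

    import org.apache.spark.sql.functions._
    import spark.implicits._

    val df = Seq((Array[Byte](1, 2, 3, 4), Array[Byte](3, 4))).toDF("c", "d")

    // Both forms resolve to the same registered SQL function:
    df.select(endswith(col("c"), col("d"))).show()  // via call_udf("endswith", ...)
    df.selectExpr("endswith(c, d)").show()          // via the SQL parser; both show true
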
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index bdacd8a914f..f64b5bc316e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -1116,16 +1116,26 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
   }
 
   test("endswith") {
-    val df = Seq(("Spark SQL", "Spark")).toDF("a", "b")
+    val df = Seq(("Spark SQL", "Spark", Array[Byte](1, 2, 3, 4), Array[Byte](3, 4)))
+      .toDF("a", "b", "c", "d")
 
     checkAnswer(df.selectExpr("endswith(a, b)"), Row(false))
     checkAnswer(df.select(endswith(col("a"), col("b"))), Row(false))
+
+    // test binary
+    checkAnswer(df.selectExpr("endswith(c, d)"), Row(true))
+    checkAnswer(df.select(endswith(col("c"), col("d"))), Row(true))
   }
 
   test("startswith") {
-    val df = Seq(("Spark SQL", "Spark")).toDF("a", "b")
+    val df = Seq(("Spark SQL", "Spark", Array[Byte](1, 2, 3, 4), Array[Byte](1, 2)))
+      .toDF("a", "b", "c", "d")
 
     checkAnswer(df.selectExpr("startswith(a, b)"), Row(true))
     checkAnswer(df.select(startswith(col("a"), col("b"))), Row(true))
+
+    // test binary
+    checkAnswer(df.selectExpr("startswith(c, d)"), Row(true))
+    checkAnswer(df.select(startswith(col("c"), col("d"))), Row(true))
   }
 }
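
The new cases exercise the binary path. The documented NULL semantics (a NULL on either
side yields NULL) could be checked in the same style; the following is a sketch under the
same test setup, not part of this commit:

    // Hypothetical extra case: NULL propagation.
    val dfNull = Seq(("Spark SQL", null.asInstanceOf[String])).toDF("a", "b")
    checkAnswer(dfNull.select(startswith(col("a"), col("b"))), Row(null))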


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org