You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/06/20 03:45:19 UTC

[spark] branch master updated: [SPARK-43942][SQL][CONNECT][PYTHON][FOLLOW-UP] Make contains support binary type

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d6380ad4d02 [SPARK-43942][SQL][CONNECT][PYTHON][FOLLOW-UP] Make contains support binary type
d6380ad4d02 is described below

commit d6380ad4d02cb3b04ccd83c40f3d32e063627735
Author: panbingkun <pb...@gmail.com>
AuthorDate: Tue Jun 20 11:44:55 2023 +0800

    [SPARK-43942][SQL][CONNECT][PYTHON][FOLLOW-UP] Make contains support binary type
    
    What changes were proposed in this pull request?
    Make `contains` support binary type:
    - in the Connect API, `contains` already supports binary type;
    - in the vanilla API, binary type is now supported via `call_udf`.
    
    Why are the changes needed?
    For feature parity between the Connect API and the vanilla API.
    
    Does this PR introduce any user-facing change?
    yes
    
    How was this patch tested?
    Added a unit test.
    
    Closes #41665 from panbingkun/SPARK-43942_FOLLOWUP.
    
    Authored-by: panbingkun <pb...@gmail.com>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 .../main/scala/org/apache/spark/sql/functions.scala  |  8 ++------
 python/pyspark/sql/functions.py                      | 20 ++++++++++++++------
 .../main/scala/org/apache/spark/sql/functions.scala  | 12 +++++-------
 .../org/apache/spark/sql/StringFunctionsSuite.scala  |  8 +++++++-
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index c12bb23f850..ccd46c2d267 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4025,12 +4025,8 @@ object functions {
 
   /**
    * Returns a boolean. The value is True if right is found inside left. Returns NULL if either
-   * input expression is NULL. Otherwise, returns False. Both left or right must be of STRING
-   * type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `contains` in SQL supports both
-   *   STRING and BINARY.
+   * input expression is NULL. Otherwise, returns False. Both left or right must be of STRING or
+   * BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index adef14de454..1a5633e3c5e 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -9711,15 +9711,10 @@ def contains(left: "ColumnOrName", right: "ColumnOrName") -> Column:
     """
     Returns a boolean. The value is True if right is found inside left.
     Returns NULL if either input expression is NULL. Otherwise, returns False.
-    Both left or right must be of STRING.
+    Both left or right must be of STRING or BINARY type.
 
     .. versionadded:: 3.5.0
 
-    Notes
-    -----
-    Only STRING type is supported in this function,
-    while `contains` in SQL supports both STRING and BINARY.
-
     Parameters
     ----------
     left : :class:`~pyspark.sql.Column` or str
@@ -9732,6 +9727,19 @@ def contains(left: "ColumnOrName", right: "ColumnOrName") -> Column:
     >>> df = spark.createDataFrame([("Spark SQL", "Spark")], ['a', 'b'])
     >>> df.select(contains(df.a, df.b).alias('r')).collect()
     [Row(r=True)]
+
+    >>> df = spark.createDataFrame([("414243", "4243",)], ["c", "d"])
+    >>> df = df.select(to_binary("c").alias("c"), to_binary("d").alias("d"))
+    >>> df.printSchema()
+    root
+     |-- c: binary (nullable = true)
+     |-- d: binary (nullable = true)
+    >>> df.select(contains("c", "d"), contains("d", "c")).show()
+    +--------------+--------------+
+    |contains(c, d)|contains(d, c)|
+    +--------------+--------------+
+    |          true|         false|
+    +--------------+--------------+
     """
     return _invoke_function_over_columns("contains", left, right)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e7e14e30477..582e3b9e363 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4144,17 +4144,15 @@ object functions {
   /**
    * Returns a boolean. The value is True if right is found inside left.
    * Returns NULL if either input expression is NULL. Otherwise, returns False.
-   * Both left or right must be of STRING type.
-   *
-   * @note
-   *   Only STRING type is supported in this function, while `contains` in SQL supports both
-   *   STRING and BINARY.
+   * Both left or right must be of STRING or BINARY type.
    *
    * @group string_funcs
    * @since 3.5.0
    */
-  def contains(left: Column, right: Column): Column = withExpr {
-    Contains(left.expr, right.expr)
+  def contains(left: Column, right: Column): Column = {
+    // 'Contains' expression only supports StringType
+    // use 'call_udf' to support both StringType and BinaryType.
+    call_udf("contains", left, right)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index d68b6da2957..bdacd8a914f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -902,9 +902,15 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
   }
 
   test("contains function") {
-    val df = Seq(("Spark SQL", "Spark")).toDF("a", "b")
+    val df = Seq(("Spark SQL", "Spark", Array[Byte](1, 2, 3, 4), Array[Byte](1, 2))).
+      toDF("a", "b", "c", "d")
+
     checkAnswer(df.selectExpr("contains(a, b)"), Seq(Row(true)))
     checkAnswer(df.select(contains(col("a"), col("b"))), Seq(Row(true)))
+
+    // test binary
+    checkAnswer(df.selectExpr("contains(c, d)"), Seq(Row(true)))
+    checkAnswer(df.select(contains(col("c"), col("d"))), Seq(Row(true)))
   }
 
   test("elt function") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org