You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2022/08/31 03:37:52 UTC

[spark] branch master updated: [SPARK-40271][PYTHON] Support list type for `pyspark.sql.functions.lit`

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 65d89f8e897 [SPARK-40271][PYTHON] Support list type for `pyspark.sql.functions.lit`
65d89f8e897 is described below

commit 65d89f8e897449f7f026144a76328ff545fecde2
Author: itholic <ha...@databricks.com>
AuthorDate: Wed Aug 31 11:37:20 2022 +0800

    [SPARK-40271][PYTHON] Support list type for `pyspark.sql.functions.lit`
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to support `list` type for `pyspark.sql.functions.lit`.
    
    ### Why are the changes needed?
    
    To improve the API usability.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, now the `list` type is available for `pyspark.sql.functions.lit` as below:
    
    - Before
    ```python
    >>> spark.range(3).withColumn("c", lit([1,2,3])).show()
    Traceback (most recent call last):
    ...
    : org.apache.spark.SparkRuntimeException: [UNSUPPORTED_FEATURE.LITERAL_TYPE] The feature is not supported: Literal for '[1, 2, 3]' of class java.util.ArrayList.
            at org.apache.spark.sql.errors.QueryExecutionErrors$.literalTypeUnsupportedError(QueryExecutionErrors.scala:302)
            at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:100)
            at org.apache.spark.sql.functions$.lit(functions.scala:125)
            at org.apache.spark.sql.functions.lit(functions.scala)
            at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
            at java.base/java.lang.reflect.Method.invoke(Method.java:577)
            at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
            at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
            at py4j.Gateway.invoke(Gateway.java:282)
            at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
            at py4j.commands.CallCommand.execute(CallCommand.java:79)
            at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
            at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
            at java.base/java.lang.Thread.run(Thread.java:833)
    ```
    
    - After
    ```python
    >>> spark.range(3).withColumn("c", lit([1,2,3])).show()
    +---+---------+
    | id|        c|
    +---+---------+
    |  0|[1, 2, 3]|
    |  1|[1, 2, 3]|
    |  2|[1, 2, 3]|
    +---+---------+
    ```
    
    ### How was this patch tested?
    
    Added doctest & unit test.
    
    Closes #37722 from itholic/SPARK-40271.
    
    Authored-by: itholic <ha...@databricks.com>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 python/pyspark/sql/functions.py            | 23 +++++++++++++++++++++--
 python/pyspark/sql/tests/test_functions.py | 26 ++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 03c16db602f..e7a7a1b37f3 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -131,10 +131,13 @@ def lit(col: Any) -> Column:
 
     Parameters
     ----------
-    col : :class:`~pyspark.sql.Column` or Python primitive type.
+    col : :class:`~pyspark.sql.Column`, str, int, float, bool or list.
         the value to make it as a PySpark literal. If a column is passed,
         it returns the column as is.
 
+        .. versionchanged:: 3.4.0
+            Since 3.4.0, it supports the list type.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
@@ -149,8 +152,24 @@ def lit(col: Any) -> Column:
     +------+---+
     |     5|  0|
     +------+---+
+
+    Create a literal from a list.
+
+    >>> spark.range(1).select(lit([1, 2, 3])).show()
+    +--------------+
+    |array(1, 2, 3)|
+    +--------------+
+    |     [1, 2, 3]|
+    +--------------+
     """
-    return col if isinstance(col, Column) else _invoke_function("lit", col)
+    if isinstance(col, Column):
+        return col
+    elif isinstance(col, list):
+        if any(isinstance(c, Column) for c in col):
+            raise ValueError("lit does not allow for list of Columns")
+        return array(*[lit(item) for item in col])
+    else:
+        return _invoke_function("lit", col)
 
 
 def col(col: str) -> Column:
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 102ebef8317..1d02a540558 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -962,6 +962,32 @@ class FunctionsTests(ReusedSQLTestCase):
         actual = self.spark.range(1).select(lit(td)).first()[0]
         self.assertEqual(actual, td)
 
+    def test_lit_list(self):
+        # SPARK-40271: added list type support
+        test_list = [1, 2, 3]
+        expected = [1, 2, 3]
+        actual = self.spark.range(1).select(lit(test_list)).first()[0]
+        self.assertEqual(actual, expected)
+
+        test_list = [[1, 2, 3], [3, 4]]
+        expected = [[1, 2, 3], [3, 4]]
+        actual = self.spark.range(1).select(lit(test_list)).first()[0]
+        self.assertEqual(actual, expected)
+
+        test_list = ["a", 1, None, 1.0]
+        expected = ["a", "1", None, "1.0"]
+        actual = self.spark.range(1).select(lit(test_list)).first()[0]
+        self.assertEqual(actual, expected)
+
+        test_list = [["a", 1, None, 1.0], [1, None, "b"]]
+        expected = [["a", "1", None, "1.0"], ["1", None, "b"]]
+        actual = self.spark.range(1).select(lit(test_list)).first()[0]
+        self.assertEqual(actual, expected)
+
+        df = self.spark.range(10)
+        with self.assertRaisesRegex(ValueError, "lit does not allow for list of Columns"):
+            lit([df.id, df.id])
+
     # Test added for SPARK-39832; change Python API to accept both col & str as input
     def test_regexp_replace(self):
         df = self.spark.createDataFrame(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org