You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/02/08 06:43:50 UTC

[spark] branch branch-3.4 updated: [SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new c4156d49c4b [SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`
c4156d49c4b is described below

commit c4156d49c4b4bacb95ca84f09babf5873693b5ef
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Wed Feb 8 14:43:00 2023 +0800

    [SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`
    
    ### What changes were proposed in this pull request?
    Make `DataFrame.select` support `a.*`
    
    ### Why are the changes needed?
    bugfix:
    ```
    Traceback (most recent call last):
      File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/tests/connect/test_connect_basic.py", line 1377, in test_select_star
        cdf.select("a", "b.*").collect(),
      File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/dataframe.py", line 1305, in collect
        table = self._session.client.to_table(query)
      File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 445, in to_table
        table, _ = self._execute_and_fetch(req)
      File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 639, in _execute_and_fetch
        self._handle_error(rpc_error)
      File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 675, in _handle_error
        raise SparkConnectAnalysisException(
    pyspark.errors.exceptions.SparkConnectAnalysisException: [FIELD_NOT_FOUND] No such struct field `*` in `c`, `d`.
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    yes
    
    ### How was this patch tested?
    added ut
    
    Closes #39934 from zhengruifeng/connect_select_star.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
    (cherry picked from commit dbc4c621679318e0652a00aec54927b5659d65cd)
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 python/pyspark/sql/connect/plan.py                 |  9 +++----
 .../sql/tests/connect/test_connect_basic.py        | 29 ++++++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index d1d41b6a690..39b32f065ea 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -375,17 +375,16 @@ class Project(LogicalPlan):
                 )
 
     def plan(self, session: "SparkConnectClient") -> proto.Relation:
+        from pyspark.sql.connect.functions import col
+
         assert self._child is not None
+
         proj_exprs = []
         for c in self._columns:
             if isinstance(c, Column):
                 proj_exprs.append(c.to_plan(session))
-            elif c == "*":
-                exp = proto.Expression()
-                exp.unresolved_star.SetInParent()
-                proj_exprs.append(exp)
             else:
-                proj_exprs.append(self.unresolved_attr(c))
+                proj_exprs.append(col(c).to_plan(session))
 
         plan = proto.Relation()
         plan.project.input.CopyFrom(self._child.plan(session))
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index 63b58dc1b56..9068d6f5635 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -1349,6 +1349,35 @@ class SparkConnectBasicTests(SparkConnectSQLTestCase):
             .toPandas(),
         )
 
+    def test_select_star(self):
+        data = [Row(a=1, b=Row(c=2, d=Row(e=3)))]
+
+        # +---+--------+
+        # |  a|       b|
+        # +---+--------+
+        # |  1|{2, {3}}|
+        # +---+--------+
+
+        cdf = self.connect.createDataFrame(data=data)
+        sdf = self.spark.createDataFrame(data=data)
+
+        self.assertEqual(
+            cdf.select("*").collect(),
+            sdf.select("*").collect(),
+        )
+        self.assertEqual(
+            cdf.select("a", "*").collect(),
+            sdf.select("a", "*").collect(),
+        )
+        self.assertEqual(
+            cdf.select("a", "b").collect(),
+            sdf.select("a", "b").collect(),
+        )
+        self.assertEqual(
+            cdf.select("a", "b.*").collect(),
+            sdf.select("a", "b.*").collect(),
+        )
+
     def test_fill_na(self):
         # SPARK-41128: Test fill na
         query = """


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org