You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2023/02/08 06:43:50 UTC
[spark] branch branch-3.4 updated: [SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new c4156d49c4b [SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`
c4156d49c4b is described below
commit c4156d49c4b4bacb95ca84f09babf5873693b5ef
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Wed Feb 8 14:43:00 2023 +0800
[SPARK-42378][CONNECT][PYTHON] Make `DataFrame.select` support `a.*`
### What changes were proposed in this pull request?
Make `DataFrame.select` support `a.*`
### Why are the changes needed?
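Bug fix: in Spark Connect, selecting a nested star expression such as `b.*` via `DataFrame.select` fails with the following error: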
```
Traceback (most recent call last):
File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/tests/connect/test_connect_basic.py", line 1377, in test_select_star
cdf.select("a", "b.*").collect(),
File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/dataframe.py", line 1305, in collect
table = self._session.client.to_table(query)
File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 445, in to_table
table, _ = self._execute_and_fetch(req)
File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 639, in _execute_and_fetch
self._handle_error(rpc_error)
File "/Users/ruifeng.zheng/Dev/spark/python/pyspark/sql/connect/client.py", line 675, in _handle_error
raise SparkConnectAnalysisException(
pyspark.errors.exceptions.SparkConnectAnalysisException: [FIELD_NOT_FOUND] No such struct field `*` in `c`, `d`.
```
### Does this PR introduce _any_ user-facing change?
Yes. In Spark Connect, `DataFrame.select` now accepts nested star expressions such as `b.*`.
### How was this patch tested?
Added a unit test (`test_select_star`).
Closes #39934 from zhengruifeng/connect_select_star.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
(cherry picked from commit dbc4c621679318e0652a00aec54927b5659d65cd)
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
python/pyspark/sql/connect/plan.py | 9 +++----
.../sql/tests/connect/test_connect_basic.py | 29 ++++++++++++++++++++++
2 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index d1d41b6a690..39b32f065ea 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -375,17 +375,16 @@ class Project(LogicalPlan):
)
def plan(self, session: "SparkConnectClient") -> proto.Relation:
+ from pyspark.sql.connect.functions import col
+
assert self._child is not None
+
proj_exprs = []
for c in self._columns:
if isinstance(c, Column):
proj_exprs.append(c.to_plan(session))
- elif c == "*":
- exp = proto.Expression()
- exp.unresolved_star.SetInParent()
- proj_exprs.append(exp)
else:
- proj_exprs.append(self.unresolved_attr(c))
+ proj_exprs.append(col(c).to_plan(session))
plan = proto.Relation()
plan.project.input.CopyFrom(self._child.plan(session))
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index 63b58dc1b56..9068d6f5635 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -1349,6 +1349,35 @@ class SparkConnectBasicTests(SparkConnectSQLTestCase):
.toPandas(),
)
+ def test_select_star(self):
+ data = [Row(a=1, b=Row(c=2, d=Row(e=3)))]
+
+ # +---+--------+
+ # | a| b|
+ # +---+--------+
+ # | 1|{2, {3}}|
+ # +---+--------+
+
+ cdf = self.connect.createDataFrame(data=data)
+ sdf = self.spark.createDataFrame(data=data)
+
+ self.assertEqual(
+ cdf.select("*").collect(),
+ sdf.select("*").collect(),
+ )
+ self.assertEqual(
+ cdf.select("a", "*").collect(),
+ sdf.select("a", "*").collect(),
+ )
+ self.assertEqual(
+ cdf.select("a", "b").collect(),
+ sdf.select("a", "b").collect(),
+ )
+ self.assertEqual(
+ cdf.select("a", "b.*").collect(),
+ sdf.select("a", "b.*").collect(),
+ )
+
def test_fill_na(self):
# SPARK-41128: Test fill na
query = """
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org