Posted to commits@spark.apache.org by hv...@apache.org on 2022/11/28 15:06:56 UTC

[spark] branch master updated: [SPARK-41300][CONNECT] Unset schema is interpreted as Schema

This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 40a5751d101 [SPARK-41300][CONNECT] Unset schema is interpreted as Schema
40a5751d101 is described below

commit 40a5751d1010e7d811f670131c29a4f40acd7ad2
Author: Martin Grund <ma...@databricks.com>
AuthorDate: Mon Nov 28 11:06:06 2022 -0400

    [SPARK-41300][CONNECT] Unset schema is interpreted as Schema
    
    ### What changes were proposed in this pull request?
    When a query reads from a DataSource using Spark Connect and the user does not specify a schema, the scalar string `schema` field of the protobuf message defaults to the empty string. During processing, the planner treated this empty string as an explicitly set schema and failed the query, because no schema can be parsed from an empty string.
    
    This patch fixes this issue and adds the relevant test for it.
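    
    For context, here is a minimal Python sketch of the proto3 behavior behind the bug. It uses the standard `StringValue` wrapper message purely as a stand-in for the Connect `DataSource` message (an illustrative assumption, not the actual Connect proto):
    
        # Proto3 scalar strings carry no presence information: an unset
        # field reads back as "" rather than None, so "unset" and
        # "explicitly empty" are indistinguishable without an emptiness check.
        from google.protobuf.wrappers_pb2 import StringValue
    
        msg = StringValue()
        assert msg.value == ""  # unset scalar string is "", never None
    
    This is why the planner must check `!schema.isEmpty` in addition to the null check before treating the schema as user-provided.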
    
    ### Why are the changes needed?
    Bugfix
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    A unit test, `test_simple_read_without_schema`, added in `python/pyspark/sql/tests/connect/test_connect_basic.py`.
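    
    For reference, a hedged reproduction sketch of the failing scenario, mirroring the new test. It assumes a running Spark Connect server and the `SparkSession.builder.remote(...)` client API; the endpoint URL is illustrative:
    
        import shutil
        import tempfile
    
        from pyspark.sql import SparkSession
    
        # Endpoint is an assumption; point it at a running Connect server.
        spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    
        # Write a small CSV dataset to a fresh temporary path.
        path = tempfile.mkdtemp()
        shutil.rmtree(path)  # write.csv() needs a non-existent target path
        spark.range(3).write.csv(path, header=True)
    
        # Before this patch, load() without .schema(...) failed on the server:
        # the unset proto schema field arrived as "" and was parsed as a schema.
        df = spark.read.format("csv").option("header", True).load(path)
        df.show()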
    
    Closes #38821 from grundprinzip/SPARK-41300.
    
    Authored-by: Martin Grund <ma...@databricks.com>
    Signed-off-by: Herman van Hovell <he...@databricks.com>
---
 .../spark/sql/connect/planner/SparkConnectPlanner.scala      |  2 +-
 python/pyspark/sql/tests/connect/test_connect_basic.py       | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
index fa5a0068c68..b4eaa03df5d 100644
--- a/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
+++ b/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -300,7 +300,7 @@ class SparkConnectPlanner(session: SparkSession) {
         val reader = session.read
         reader.format(rel.getDataSource.getFormat)
         localMap.foreach { case (key, value) => reader.option(key, value) }
-        if (rel.getDataSource.getSchema != null) {
+        if (rel.getDataSource.getSchema != null && !rel.getDataSource.getSchema.isEmpty) {
           reader.schema(rel.getDataSource.getSchema)
         }
         reader.load().queryExecution.analyzed
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index 97ba34d8269..e0f5f23fdb4 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -552,6 +552,18 @@ class SparkConnectTests(SparkConnectSQLTestCase):
             actualResult = pandasResult.values.tolist()
             self.assertEqual(len(expectResult), len(actualResult))
 
+    def test_simple_read_without_schema(self) -> None:
+        """SPARK-41300: Schema not set when reading CSV."""
+        writeDf = self.df_text
+        tmpPath = tempfile.mkdtemp()
+        shutil.rmtree(tmpPath)
+        writeDf.write.csv(tmpPath, header=True)
+
+        readDf = self.connect.read.format("csv").option("header", True).load(path=tmpPath)
+        expectResult = set(writeDf.collect())
+        pandasResult = set(readDf.collect())
+        self.assertEqual(expectResult, pandasResult)
+
     def test_simple_transform(self) -> None:
         """SPARK-41203: Support DF.transform"""
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org