Posted to commits@spark.apache.org by gu...@apache.org on 2023/03/17 02:13:36 UTC

[spark] branch branch-3.4 updated: [SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new ca75340d607 [SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes
ca75340d607 is described below

commit ca75340d607f13932c9a49081ae9effcee5ac3f7
Author: itholic <ha...@databricks.com>
AuthorDate: Fri Mar 17 11:13:00 2023 +0900

    [SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes
    
    ### What changes were proposed in this pull request?
    
    This pull request improves the error message raised when accessing a JVM attribute that is not supported in Spark Connect. The new message clearly names the unsupported attribute and explains that it is unavailable because Spark Connect does not depend on the JVM.
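
    A minimal sketch of the pattern the patch applies (illustrative only; the class name `ExampleConnectObject` is hypothetical, while `PySparkAttributeError`, `JVM_ATTRIBUTE_NOT_SUPPORTED`, and the `message_parameters` shape all come from the diff below):

    ```python
    from pyspark.errors import PySparkAttributeError

    class ExampleConnectObject:
        @property
        def _jsc(self) -> None:
            # Fail fast with a descriptive, error-class-based exception instead
            # of a bare AttributeError when a JVM-backed attribute is accessed.
            raise PySparkAttributeError(
                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
                message_parameters={"attr_name": "_jsc"},
            )
    ```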
    
    ### Why are the changes needed?
    
    Currently, when attempting to access an unsupported JVM attribute in Spark Connect, the error message is not very clear, making it difficult for users to understand the root cause of the issue. This improvement gives users the information they need to diagnose the problem, as shown below:
    
    **Before**
    ```python
    >>> spark._jsc
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    AttributeError: 'SparkSession' object has no attribute '_jsc'
    ```
    
    **After**
    ```python
    >>> spark._jsc
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/Users/haejoon.lee/Desktop/git_store/spark/python/pyspark/sql/connect/session.py", line 490, in _jsc
        raise PySparkAttributeError(
    pyspark.errors.exceptions.base.PySparkAttributeError: [JVM_ATTRIBUTE_NOT_SUPPORTED] Attribute `_jsc` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session.
    ```
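
    Because the new `PySparkAttributeError` subclasses both `PySparkException` and the built-in `AttributeError` (see the `base.py` change below), existing `except AttributeError` handlers keep working. A hedged sketch of programmatic handling, assuming the standard `PySparkException` accessors and the Spark Connect `spark` session from the example above:

    ```python
    from pyspark.errors import PySparkAttributeError

    try:
        spark._jsc
    except PySparkAttributeError as e:
        # Still an AttributeError, so hasattr()/getattr() fallbacks behave
        # as before; the error class enables programmatic handling.
        assert isinstance(e, AttributeError)
        print(e.getErrorClass())  # JVM_ATTRIBUTE_NOT_SUPPORTED
    ```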
    
    ### Does this PR introduce _any_ user-facing change?
    
    This PR does not change any functionality. It does change the error message users see when accessing an unsupported JVM attribute, which should improve the user experience.
    
    ### How was this patch tested?
    
    New unit tests were added that specifically target the error messages for unsupported JVM attributes. The tests were run locally in a development environment.
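
    For reference, a sketch of one way to run only the new test locally (this assumes a PySpark development environment with the Spark Connect test prerequisites installed; the test name is taken from the diff below):

    ```python
    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName(
        "pyspark.sql.tests.connect.test_connect_basic."
        "SparkConnectBasicTests.test_unsupported_jvm_attribute"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)
    ```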
    
    Closes #40458 from itholic/SPARK-42824.
    
    Authored-by: itholic <ha...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit deac481304489f9b8ecd24ec6f3aed1e0c0d75eb)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/errors/__init__.py                  |  2 +
 python/pyspark/errors/error_classes.py             |  5 +++
 python/pyspark/errors/exceptions/base.py           |  6 +++
 python/pyspark/sql/connect/column.py               | 12 +++++-
 python/pyspark/sql/connect/dataframe.py            |  6 ++-
 python/pyspark/sql/connect/readwriter.py           |  7 ++++
 python/pyspark/sql/connect/session.py              | 26 ++++++++++++
 .../sql/tests/connect/test_connect_basic.py        | 49 +++++++++++++++++++++-
 8 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/errors/__init__.py b/python/pyspark/errors/__init__.py
index 95da7ca2aa8..94117fc5160 100644
--- a/python/pyspark/errors/__init__.py
+++ b/python/pyspark/errors/__init__.py
@@ -31,6 +31,7 @@ from pyspark.errors.exceptions.base import (  # noqa: F401
     SparkUpgradeException,
     PySparkTypeError,
     PySparkValueError,
+    PySparkAttributeError,
 )
 
 
@@ -47,4 +48,5 @@ __all__ = [
     "SparkUpgradeException",
     "PySparkTypeError",
     "PySparkValueError",
+    "PySparkAttributeError",
 ]
diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index 8c0f79f7d5a..dda1f5a1f84 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -39,6 +39,11 @@ ERROR_CLASSES_JSON = """
       "Function `<func_name>` should return Column, got <return_type>."
     ]
   },
+  "JVM_ATTRIBUTE_NOT_SUPPORTED" : {
+    "message" : [
+      "Attribute `<attr_name>` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session."
+    ]
+  },
   "NOT_BOOL" : {
     "message" : [
       "Argument `<arg_name>` should be a bool, got <arg_type>."
diff --git a/python/pyspark/errors/exceptions/base.py b/python/pyspark/errors/exceptions/base.py
index 6e67039374d..fa66b80ac3a 100644
--- a/python/pyspark/errors/exceptions/base.py
+++ b/python/pyspark/errors/exceptions/base.py
@@ -160,3 +160,9 @@ class PySparkTypeError(PySparkException, TypeError):
     """
     Wrapper class for TypeError to support error classes.
     """
+
+
+class PySparkAttributeError(PySparkException, AttributeError):
+    """
+    Wrapper class for AttributeError to support error classes.
+    """
diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
index d2be32b905e..f30a5f258f2 100644
--- a/python/pyspark/sql/connect/column.py
+++ b/python/pyspark/sql/connect/column.py
@@ -31,7 +31,7 @@ from typing import (
     Optional,
 )
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkAttributeError
 from pyspark.sql.types import DataType
 from pyspark.sql.column import Column as PySparkColumn
 
@@ -433,6 +433,10 @@ class Column:
     dropFields.__doc__ = PySparkColumn.dropFields.__doc__
 
     def __getattr__(self, item: Any) -> "Column":
+        if item == "_jc":
+            raise PySparkAttributeError(
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"}
+            )
         if item.startswith("__"):
             raise AttributeError(item)
         return self[item]
@@ -459,6 +463,12 @@ class Column:
 
     __bool__ = __nonzero__
 
+    @property
+    def _jc(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"}
+        )
+
 
 Column.__doc__ = PySparkColumn.__doc__
 
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 0887294ddcf..f1968bc0ad9 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -50,7 +50,7 @@ from pyspark.sql.dataframe import (
     DataFrameStatFunctions as PySparkDataFrameStatFunctions,
 )
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkAttributeError
 from pyspark.errors.exceptions.connect import SparkConnectException
 from pyspark.rdd import PythonEvalType
 import pyspark.sql.connect.plan as plan
@@ -1304,6 +1304,10 @@ class DataFrame:
         return None
 
     def __getattr__(self, name: str) -> "Column":
+        if name in ["_jseq", "_jdf", "_jmap", "_jcols"]:
+            raise PySparkAttributeError(
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name}
+            )
         return self[name]
 
     @overload
diff --git a/python/pyspark/sql/connect/readwriter.py b/python/pyspark/sql/connect/readwriter.py
index 1b58c54b38e..192ec68b92a 100644
--- a/python/pyspark/sql/connect/readwriter.py
+++ b/python/pyspark/sql/connect/readwriter.py
@@ -30,6 +30,7 @@ from pyspark.sql.readwriter import (
     DataFrameReader as PySparkDataFrameReader,
     DataFrameWriterV2 as PySparkDataFrameWriterV2,
 )
+from pyspark.errors import PySparkAttributeError
 
 if TYPE_CHECKING:
     from pyspark.sql.connect.dataframe import DataFrame
@@ -417,6 +418,12 @@ class DataFrameReader(OptionUtils):
 
     jdbc.__doc__ = PySparkDataFrameReader.jdbc.__doc__
 
+    @property
+    def _jreader(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jreader"}
+        )
+
 
 DataFrameReader.__doc__ = PySparkDataFrameReader.__doc__
 
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index 5e7c8361d80..b75cb63c4de 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -67,6 +67,7 @@ from pyspark.sql.types import (
     TimestampType,
 )
 from pyspark.sql.utils import to_str
+from pyspark.errors import PySparkAttributeError
 
 if TYPE_CHECKING:
     from pyspark.sql.connect._typing import OptionalPrimitiveType
@@ -473,6 +474,31 @@ class SparkSession:
     def readStream(self) -> Any:
         raise NotImplementedError("readStream() is not implemented.")
 
+    @property
+    def _jsc(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jsc"}
+        )
+
+    @property
+    def _jconf(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jconf"}
+        )
+
+    @property
+    def _jvm(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jvm"}
+        )
+
+    @property
+    def _jsparkSession(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jsparkSession"},
+        )
+
     @property
     def udf(self) -> "UDFRegistration":
         from pyspark.sql.connect.udf import UDFRegistration
diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py
index 9da3285d07e..a8e161a42a6 100644
--- a/python/pyspark/sql/tests/connect/test_connect_basic.py
+++ b/python/pyspark/sql/tests/connect/test_connect_basic.py
@@ -23,7 +23,7 @@ import shutil
 import tempfile
 from collections import defaultdict
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkAttributeError, PySparkTypeError
 from pyspark.sql import SparkSession as PySparkSession, Row
 from pyspark.sql.types import (
     StructType,
@@ -2936,6 +2936,53 @@ class SparkConnectBasicTests(SparkConnectSQLTestCase):
         self.assertEqual(cdf2.schema, sdf2.schema)
         self.assertEqual(cdf2.collect(), sdf2.collect())
 
+    def test_unsupported_jvm_attribute(self):
+        # Unsupported jvm attributes for Spark session.
+        unsupported_attrs = ["_jsc", "_jconf", "_jvm", "_jsparkSession"]
+        spark_session = self.connect
+        for attr in unsupported_attrs:
+            with self.assertRaises(PySparkAttributeError) as pe:
+                getattr(spark_session, attr)
+
+            self.check_error(
+                exception=pe.exception,
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+                message_parameters={"attr_name": attr},
+            )
+
+        # Unsupported jvm attributes for DataFrame.
+        unsupported_attrs = ["_jseq", "_jdf", "_jmap", "_jcols"]
+        cdf = self.connect.range(10)
+        for attr in unsupported_attrs:
+            with self.assertRaises(PySparkAttributeError) as pe:
+                getattr(cdf, attr)
+
+            self.check_error(
+                exception=pe.exception,
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+                message_parameters={"attr_name": attr},
+            )
+
+        # Unsupported jvm attributes for Column.
+        with self.assertRaises(PySparkAttributeError) as pe:
+            getattr(cdf.id, "_jc")
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jc"},
+        )
+
+        # Unsupported jvm attributes for DataFrameReader.
+        with self.assertRaises(PySparkAttributeError) as pe:
+            getattr(spark_session.read, "_jreader")
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jreader"},
+        )
+
 
 @unittest.skipIf(not should_test_connect, connect_requirement_message)
 class ClientTests(unittest.TestCase):

