Posted to commits@spark.apache.org by do...@apache.org on 2023/12/04 06:49:42 UTC

(spark) branch master updated: [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new b23ae15da019 [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
b23ae15da019 is described below

commit b23ae15da019082891d71853682329c2d24c2e9e
Author: Haejoon Lee <ha...@databricks.com>
AuthorDate: Sun Dec 3 22:49:30 2023 -0800

    [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to migrate all remaining `ValueError` exceptions raised from `pyspark/sql/*` into the PySpark error framework as `PySparkValueError`, with a dedicated error class assigned to each.
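    
    For illustration, the migration pattern looks like the sketch below. The `check_group` helper is hypothetical; the error class name and the `error_class`/`message_parameters` keyword arguments are the ones used in this diff:
    
    ```python
    from pyspark.errors import PySparkValueError

    def check_group(dataframes_in_group: int) -> None:
        # Hypothetical guard showing the pattern applied across this PR.
        if dataframes_in_group not in (0, 2):
            # Before: raise ValueError("Invalid number of dataframes in group ...")
            # After: route through the error framework; the message template
            # lives once in error_classes.py under a dedicated error class.
            raise PySparkValueError(
                error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
                message_parameters={"dataframes_in_group": str(dataframes_in_group)},
            )
    ```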
    
    ### Why are the changes needed?
    
    To improve the error handling in PySpark.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No API changes, but the user-facing error messages will be improved.
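    
    As a minimal sketch of what that improvement looks like to a caller (the `_demo` helper is hypothetical; `getErrorClass` and `getMessageParameters` are the existing `PySparkException` accessors):
    
    ```python
    from pyspark.errors import PySparkValueError

    def _demo() -> None:
        # Hypothetical stand-in for one of the migrated call sites.
        raise PySparkValueError(
            error_class="UNKNOWN_VALUE_FOR",
            message_parameters={"var": "struct_in_pandas"},
        )

    try:
        _demo()
    except PySparkValueError as e:
        # Unlike a bare ValueError, the framework exposes the error class
        # and message parameters programmatically.
        print(e.getErrorClass())         # "UNKNOWN_VALUE_FOR"
        print(e.getMessageParameters())  # {"var": "struct_in_pandas"}
    ```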
    
    ### How was this patch tested?
    
    The existing CI should pass.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #44149 from itholic/migrate_value_error.
    
    Authored-by: Haejoon Lee <ha...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 python/pyspark/errors/error_classes.py   | 19 +++++++++++++++++--
 python/pyspark/sql/pandas/serializers.py |  5 +++--
 python/pyspark/sql/pandas/typehints.py   | 12 +++++++++---
 python/pyspark/sql/pandas/types.py       |  7 +++++--
 python/pyspark/sql/sql_formatter.py      |  7 ++++---
 5 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index c7199ac938be..d0c0d1c115b0 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -287,6 +287,11 @@ ERROR_CLASSES_JSON = """
       "NumPy array input should be of <dimensions> dimensions."
     ]
   },
+  "INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP" : {
+    "message" : [
+      "Invalid number of dataframes in group <dataframes_in_group>."
+    ]
+  },
   "INVALID_PANDAS_UDF" : {
     "message" : [
       "Invalid function: <detail>"
@@ -803,9 +808,9 @@ ERROR_CLASSES_JSON = """
       "Expected <expected> values for `<item>`, got <actual>."
     ]
   },
-  "TYPE_HINT_REQUIRED" : {
+  "TYPE_HINT_SHOULD_BE_SPECIFIED" : {
     "message" : [
-      "A <arg_type> is required <where>."
+      "Type hints for <target> should be specified; however, got <sig>."
     ]
   },
   "UDF_RETURN_TYPE" : {
@@ -888,6 +893,11 @@ ERROR_CLASSES_JSON = """
       "Unknown response: <response>."
     ]
   },
+  "UNKNOWN_VALUE_FOR" : {
+    "message" : [
+      "Unknown value for `<var>`."
+    ]
+  },
   "UNSUPPORTED_DATA_TYPE" : {
     "message" : [
       "Unsupported DataType `<data_type>`."
@@ -983,6 +993,11 @@ ERROR_CLASSES_JSON = """
       "Value for `<arg_name>` only supports the 'pearson', got '<arg_value>'."
     ]
   },
+  "VALUE_NOT_PLAIN_COLUMN_REFERENCE" : {
+    "message" : [
+      "Value <val> in <field_name> should be a plain column reference such as `df.col` or `col('column')`."
+    ]
+  },
   "VALUE_NOT_POSITIVE" : {
     "message" : [
       "Value for `<arg_name>` must be positive, got '<arg_value>'."
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 8ffb7407714b..6c5bd826a023 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -707,8 +707,9 @@ class CogroupArrowUDFSerializer(ArrowStreamGroupUDFSerializer):
                 yield batches1, batches2
 
             elif dataframes_in_group != 0:
-                raise ValueError(
-                    "Invalid number of dataframes in group {0}".format(dataframes_in_group)
+                raise PySparkValueError(
+                    error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
+                    message_parameters={"dataframes_in_group": str(dataframes_in_group)},
                 )
 
 
diff --git a/python/pyspark/sql/pandas/typehints.py b/python/pyspark/sql/pandas/typehints.py
index f0c13e66a63d..37ba02a94d58 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -18,7 +18,7 @@ from inspect import Signature
 from typing import Any, Callable, Dict, Optional, Union, TYPE_CHECKING
 
 from pyspark.sql.pandas.utils import require_minimum_pandas_version
-from pyspark.errors import PySparkNotImplementedError
+from pyspark.errors import PySparkNotImplementedError, PySparkValueError
 
 if TYPE_CHECKING:
     from pyspark.sql.pandas._typing import (
@@ -51,12 +51,18 @@ def infer_eval_type(
         annotations[parameter] for parameter in sig.parameters if parameter in annotations
     ]
     if len(parameters_sig) != len(sig.parameters):
-        raise ValueError("Type hints for all parameters should be specified; however, got %s" % sig)
+        raise PySparkValueError(
+            error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+            message_parameters={"target": "all parameters", "sig": str(sig)},
+        )
 
     # Check if the return has a type hint
     return_annotation = type_hints.get("return", sig.return_annotation)
     if sig.empty is return_annotation:
-        raise ValueError("Type hint for the return type should be specified; however, got %s" % sig)
+        raise PySparkValueError(
+            error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+            message_parameters={"target": "the return type", "sig": str(sig)},
+        )
 
     # Series, Frame or Union[DataFrame, Series], ... -> Series or Frame
     is_series_or_frame = all(
diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py
index f4005a47357b..36c982eb519c 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -49,7 +49,7 @@ from pyspark.sql.types import (
     UserDefinedType,
     _create_row,
 )
-from pyspark.errors import PySparkTypeError, UnsupportedOperationException
+from pyspark.errors import PySparkTypeError, UnsupportedOperationException, PySparkValueError
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -716,7 +716,10 @@ def _create_converter_to_pandas(
                 return convert_struct_as_dict
 
             else:
-                raise ValueError(f"Unknown value for `struct_in_pandas`: {_struct_in_pandas}")
+                raise PySparkValueError(
+                    error_class="UNKNOWN_VALUE_FOR",
+                    message_parameters={"var": str(_struct_in_pandas)},
+                )
 
         elif isinstance(dt, TimestampType):
             assert timezone is not None
diff --git a/python/pyspark/sql/sql_formatter.py b/python/pyspark/sql/sql_formatter.py
index 5e79b9ff5ea9..a27f7205a2d7 100644
--- a/python/pyspark/sql/sql_formatter.py
+++ b/python/pyspark/sql/sql_formatter.py
@@ -25,6 +25,7 @@ from py4j.java_gateway import is_instance_of
 if typing.TYPE_CHECKING:
     from pyspark.sql import SparkSession, DataFrame
 from pyspark.sql.functions import lit
+from pyspark.errors import PySparkValueError
 
 
 class SQLStringFormatter(string.Formatter):
@@ -61,9 +62,9 @@ class SQLStringFormatter(string.Formatter):
             ):
                 return jexpr.sql()
             else:
-                raise ValueError(
-                    "%s in %s should be a plain column reference such as `df.col` "
-                    "or `col('column')`" % (val, field_name)
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_PLAIN_COLUMN_REFERENCE",
+                    message_parameters={"val": str(val), "field_name": field_name},
                 )
         elif isinstance(val, DataFrame):
             for df, n in self._temp_views:

