You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2023/12/04 06:49:42 UTC
(spark) branch master updated: [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b23ae15da019 [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
b23ae15da019 is described below
commit b23ae15da019082891d71853682329c2d24c2e9e
Author: Haejoon Lee <ha...@databricks.com>
AuthorDate: Sun Dec 3 22:49:30 2023 -0800
[SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
### What changes were proposed in this pull request?
This PR proposes to migrate all remaining `ValueError` usages under `pyspark/sql/*` to the PySpark error framework, replacing each with `PySparkValueError` and assigning it a dedicated error class.
### Why are the changes needed?
To improve the error handling in PySpark.
### Does this PR introduce _any_ user-facing change?
No API changes, but the user-facing error messages will be improved.
### How was this patch tested?
The existing CI should pass.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #44149 from itholic/migrate_value_error.
Authored-by: Haejoon Lee <ha...@databricks.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
python/pyspark/errors/error_classes.py | 19 +++++++++++++++++--
python/pyspark/sql/pandas/serializers.py | 5 +++--
python/pyspark/sql/pandas/typehints.py | 12 +++++++++---
python/pyspark/sql/pandas/types.py | 7 +++++--
python/pyspark/sql/sql_formatter.py | 7 ++++---
5 files changed, 38 insertions(+), 12 deletions(-)
diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index c7199ac938be..d0c0d1c115b0 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -287,6 +287,11 @@ ERROR_CLASSES_JSON = """
"NumPy array input should be of <dimensions> dimensions."
]
},
+ "INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP" : {
+ "message" : [
+ "Invalid number of dataframes in group <dataframes_in_group>."
+ ]
+ },
"INVALID_PANDAS_UDF" : {
"message" : [
"Invalid function: <detail>"
@@ -803,9 +808,9 @@ ERROR_CLASSES_JSON = """
"Expected <expected> values for `<item>`, got <actual>."
]
},
- "TYPE_HINT_REQUIRED" : {
+ "TYPE_HINT_SHOULD_BE_SPECIFIED" : {
"message" : [
- "A <arg_type> is required <where>."
+ "Type hints for <target> should be specified; however, got <sig>."
]
},
"UDF_RETURN_TYPE" : {
@@ -888,6 +893,11 @@ ERROR_CLASSES_JSON = """
"Unknown response: <response>."
]
},
+ "UNKNOWN_VALUE_FOR" : {
+ "message" : [
+ "Unknown value for `<var>`."
+ ]
+ },
"UNSUPPORTED_DATA_TYPE" : {
"message" : [
"Unsupported DataType `<data_type>`."
@@ -983,6 +993,11 @@ ERROR_CLASSES_JSON = """
"Value for `<arg_name>` only supports the 'pearson', got '<arg_value>'."
]
},
+ "VALUE_NOT_PLAIN_COLUMN_REFERENCE" : {
+ "message" : [
+ "Value <val> in <field_name> should be a plain column reference such as `df.col` or `col('column')`."
+ ]
+ },
"VALUE_NOT_POSITIVE" : {
"message" : [
"Value for `<arg_name>` must be positive, got '<arg_value>'."
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 8ffb7407714b..6c5bd826a023 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -707,8 +707,9 @@ class CogroupArrowUDFSerializer(ArrowStreamGroupUDFSerializer):
yield batches1, batches2
elif dataframes_in_group != 0:
- raise ValueError(
- "Invalid number of dataframes in group {0}".format(dataframes_in_group)
+ raise PySparkValueError(
+ error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
+ message_parameters={"dataframes_in_group": str(dataframes_in_group)},
)
diff --git a/python/pyspark/sql/pandas/typehints.py b/python/pyspark/sql/pandas/typehints.py
index f0c13e66a63d..37ba02a94d58 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -18,7 +18,7 @@ from inspect import Signature
from typing import Any, Callable, Dict, Optional, Union, TYPE_CHECKING
from pyspark.sql.pandas.utils import require_minimum_pandas_version
-from pyspark.errors import PySparkNotImplementedError
+from pyspark.errors import PySparkNotImplementedError, PySparkValueError
if TYPE_CHECKING:
from pyspark.sql.pandas._typing import (
@@ -51,12 +51,18 @@ def infer_eval_type(
annotations[parameter] for parameter in sig.parameters if parameter in annotations
]
if len(parameters_sig) != len(sig.parameters):
- raise ValueError("Type hints for all parameters should be specified; however, got %s" % sig)
+ raise PySparkValueError(
+ error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+ message_parameters={"target": "all parameters", "sig": str(sig)},
+ )
# Check if the return has a type hint
return_annotation = type_hints.get("return", sig.return_annotation)
if sig.empty is return_annotation:
- raise ValueError("Type hint for the return type should be specified; however, got %s" % sig)
+ raise PySparkValueError(
+ error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+ message_parameters={"target": "the return type", "sig": str(sig)},
+ )
# Series, Frame or Union[DataFrame, Series], ... -> Series or Frame
is_series_or_frame = all(
diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py
index f4005a47357b..36c982eb519c 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -49,7 +49,7 @@ from pyspark.sql.types import (
UserDefinedType,
_create_row,
)
-from pyspark.errors import PySparkTypeError, UnsupportedOperationException
+from pyspark.errors import PySparkTypeError, UnsupportedOperationException, PySparkValueError
if TYPE_CHECKING:
import pandas as pd
@@ -716,7 +716,10 @@ def _create_converter_to_pandas(
return convert_struct_as_dict
else:
- raise ValueError(f"Unknown value for `struct_in_pandas`: {_struct_in_pandas}")
+ raise PySparkValueError(
+ error_class="UNKNOWN_VALUE_FOR",
+ message_parameters={"var": str(_struct_in_pandas)},
+ )
elif isinstance(dt, TimestampType):
assert timezone is not None
diff --git a/python/pyspark/sql/sql_formatter.py b/python/pyspark/sql/sql_formatter.py
index 5e79b9ff5ea9..a27f7205a2d7 100644
--- a/python/pyspark/sql/sql_formatter.py
+++ b/python/pyspark/sql/sql_formatter.py
@@ -25,6 +25,7 @@ from py4j.java_gateway import is_instance_of
if typing.TYPE_CHECKING:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit
+from pyspark.errors import PySparkValueError
class SQLStringFormatter(string.Formatter):
@@ -61,9 +62,9 @@ class SQLStringFormatter(string.Formatter):
):
return jexpr.sql()
else:
- raise ValueError(
- "%s in %s should be a plain column reference such as `df.col` "
- "or `col('column')`" % (val, field_name)
+ raise PySparkValueError(
+ error_class="VALUE_NOT_PLAIN_COLUMN_REFERENCE",
+ message_parameters={"val": str(val), "field_name": field_name},
)
elif isinstance(val, DataFrame):
for df, n in self._temp_views:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org