You are viewing a plain-text version of this content; the canonical (HTML) version is available at the mailing-list archive. [Original hyperlink lost in plain-text conversion.]
Posted to commits@spark.apache.org by gu...@apache.org on 2020/10/07 00:13:33 UTC
[spark] branch branch-3.0 updated: Revert "[SPARK-33073][PYTHON]
Improve error handling on Pandas to Arrow conversion failures"
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 2076abc Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures"
2076abc is described below
commit 2076abc0d3f1055254d65697c21f26727a502391
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Wed Oct 7 09:11:42 2020 +0900
Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures"
This reverts commit 4f71231af51a3da5d7964a218a878c0cf3037c10.
---
python/pyspark/sql/pandas/serializers.py | 17 +++++++----------
python/pyspark/sql/tests/test_arrow.py | 9 ++++-----
python/pyspark/sql/tests/test_pandas_grouped_map.py | 15 +++++++--------
3 files changed, 18 insertions(+), 23 deletions(-)
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index b164a38..4dd15d1 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -156,16 +156,13 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
s = _check_series_convert_timestamps_internal(s, self._timezone)
try:
array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
- except ValueError as e:
- if self._safecheck:
- error_msg = "Exception thrown when converting pandas.Series (%s) to " + \
- "Arrow Array (%s). It can be caused by overflows or other " + \
- "unsafe conversions warned by Arrow. Arrow safe type check " + \
- "can be disabled by using SQL config " + \
- "`spark.sql.execution.pandas.convertToArrowArraySafely`."
- raise ValueError(error_msg % (s.dtype, t)) from e
- else:
- raise e
+ except pa.ArrowException as e:
+ error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
+ "Array (%s). It can be caused by overflows or other unsafe " + \
+ "conversions warned by Arrow. Arrow safe type check can be " + \
+ "disabled by using SQL config " + \
+ "`spark.sql.execution.pandas.convertToArrowArraySafely`."
+ raise RuntimeError(error_msg % (s.dtype, t), e)
return array
arrs = []
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index 2c6231d..15c5cf1 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -266,12 +266,11 @@ class ArrowTests(ReusedSQLTestCase):
def test_createDataFrame_with_incorrect_schema(self):
pdf = self.create_pandas_data_frame()
fields = list(self.schema)
- fields[5], fields[6] = fields[6], fields[5] # swap decimal with date
+ fields[0], fields[1] = fields[1], fields[0] # swap str with int
wrong_schema = StructType(fields)
- with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
- with QuietTest(self.sc):
- with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"):
- self.spark.createDataFrame(pdf, schema=wrong_schema)
+ with QuietTest(self.sc):
+ with self.assertRaisesRegexp(Exception, "integer.*required"):
+ self.spark.createDataFrame(pdf, schema=wrong_schema)
def test_createDataFrame_with_names(self):
pdf = self.create_pandas_data_frame()
diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index 8e02b29..cc6167e 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -450,16 +450,15 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
def column_name_typo(pdf):
return pd.DataFrame({'iid': pdf.id, 'v': pdf.v})
- @pandas_udf('id long, v decimal', PandasUDFType.GROUPED_MAP)
+ @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP)
def invalid_positional_types(pdf):
- return pd.DataFrame([(1, datetime.date(2020, 10, 5))])
+ return pd.DataFrame([(u'a', 1.2)])
- with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
- with QuietTest(self.sc):
- with self.assertRaisesRegexp(Exception, "KeyError: 'id'"):
- grouped_df.apply(column_name_typo).collect()
- with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"):
- grouped_df.apply(invalid_positional_types).collect()
+ with QuietTest(self.sc):
+ with self.assertRaisesRegexp(Exception, "KeyError: 'id'"):
+ grouped_df.apply(column_name_typo).collect()
+ with self.assertRaisesRegexp(Exception, "an integer is required"):
+ grouped_df.apply(invalid_positional_types).collect()
def test_positional_assignment_conf(self):
with self.sql_conf({
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org