Posted to commits@spark.apache.org by gu...@apache.org on 2020/10/07 00:13:33 UTC

[spark] branch branch-3.0 updated: Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures"

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 2076abc  Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures"
2076abc is described below

commit 2076abc0d3f1055254d65697c21f26727a502391
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Wed Oct 7 09:11:42 2020 +0900

    Revert "[SPARK-33073][PYTHON] Improve error handling on Pandas to Arrow conversion failures"
    
    This reverts commit 4f71231af51a3da5d7964a218a878c0cf3037c10.
---
 python/pyspark/sql/pandas/serializers.py            | 17 +++++++----------
 python/pyspark/sql/tests/test_arrow.py              |  9 ++++-----
 python/pyspark/sql/tests/test_pandas_grouped_map.py | 15 +++++++--------
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index b164a38..4dd15d1 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -156,16 +156,13 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
                 s = _check_series_convert_timestamps_internal(s, self._timezone)
             try:
                 array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)
-            except ValueError as e:
-                if self._safecheck:
-                    error_msg = "Exception thrown when converting pandas.Series (%s) to " + \
-                                "Arrow Array (%s). It can be caused by overflows or other " + \
-                                "unsafe conversions warned by Arrow. Arrow safe type check " + \
-                                "can be disabled by using SQL config " + \
-                                "`spark.sql.execution.pandas.convertToArrowArraySafely`."
-                    raise ValueError(error_msg % (s.dtype, t)) from e
-                else:
-                    raise e
+            except pa.ArrowException as e:
+                error_msg = "Exception thrown when converting pandas.Series (%s) to Arrow " + \
+                            "Array (%s). It can be caused by overflows or other unsafe " + \
+                            "conversions warned by Arrow. Arrow safe type check can be " + \
+                            "disabled by using SQL config " + \
+                            "`spark.sql.execution.pandas.convertToArrowArraySafely`."
+                raise RuntimeError(error_msg % (s.dtype, t), e)
             return array
 
         arrs = []
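
For context on the hunk above: the `safe=self._safecheck` flag passed to `pa.Array.from_pandas` is driven by the SQL config `spark.sql.execution.pandas.convertToArrowArraySafely`. In recent pyarrow releases, an unsafe cast raises `pa.lib.ArrowInvalid`, which subclasses both `pa.ArrowException` and `ValueError`, so the handlers on either side of this revert catch the same failure. A minimal standalone sketch of that failure mode (the Series values are illustrative assumptions, not from Spark's tests):

    import pandas as pd
    import pyarrow as pa

    # A float Series that cannot be cast to int32 without losing data.
    s = pd.Series([1.5, 2.7])

    try:
        # safe=True is what Spark passes when
        # spark.sql.execution.pandas.convertToArrowArraySafely is enabled.
        pa.Array.from_pandas(s, type=pa.int32(), safe=True)
    except (pa.ArrowException, ValueError) as e:
        # pyarrow raises ArrowInvalid ("Float value 1.5 was truncated ..."),
        # which both versions of the serializer's except clause would catch.
        print(type(e).__name__, e)
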
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index 2c6231d..15c5cf1 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -266,12 +266,11 @@ class ArrowTests(ReusedSQLTestCase):
     def test_createDataFrame_with_incorrect_schema(self):
         pdf = self.create_pandas_data_frame()
         fields = list(self.schema)
-        fields[5], fields[6] = fields[6], fields[5]  # swap decimal with date
+        fields[0], fields[1] = fields[1], fields[0]  # swap str with int
         wrong_schema = StructType(fields)
-        with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
-            with QuietTest(self.sc):
-                with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"):
-                    self.spark.createDataFrame(pdf, schema=wrong_schema)
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(Exception, "integer.*required"):
+                self.spark.createDataFrame(pdf, schema=wrong_schema)
 
     def test_createDataFrame_with_names(self):
         pdf = self.create_pandas_data_frame()
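
The swapped schema fields in the test above exercise the serializer path end to end. A hedged, self-contained sketch of the same idea (the session setup, column names, and data here are illustrative assumptions, not taken from the test):

    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Keep the failure on the Arrow path instead of silently falling back.
    spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    pdf = pd.DataFrame({"name": ["a", "b"], "age": [1, 2]})
    # Deliberately declare the string column as int and the int column as string.
    wrong_schema = StructType([
        StructField("name", IntegerType()),
        StructField("age", StringType()),
    ])

    try:
        spark.createDataFrame(pdf, schema=wrong_schema).collect()
    except Exception as e:
        print(e)  # the Arrow conversion error wrapped in serializers.py above
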
diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index 8e02b29..cc6167e 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -450,16 +450,15 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
         def column_name_typo(pdf):
             return pd.DataFrame({'iid': pdf.id, 'v': pdf.v})
 
-        @pandas_udf('id long, v decimal', PandasUDFType.GROUPED_MAP)
+        @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP)
         def invalid_positional_types(pdf):
-            return pd.DataFrame([(1, datetime.date(2020, 10, 5))])
+            return pd.DataFrame([(u'a', 1.2)])
 
-        with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
-            with QuietTest(self.sc):
-                with self.assertRaisesRegexp(Exception, "KeyError: 'id'"):
-                    grouped_df.apply(column_name_typo).collect()
-                with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"):
-                    grouped_df.apply(invalid_positional_types).collect()
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(Exception, "KeyError: 'id'"):
+                grouped_df.apply(column_name_typo).collect()
+            with self.assertRaisesRegexp(Exception, "an integer is required"):
+                grouped_df.apply(invalid_positional_types).collect()
 
     def test_positional_assignment_conf(self):
         with self.sql_conf({
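
The same conversion error surfaces from the workers in grouped-map pandas UDFs, which is what the hunk above asserts on. A minimal sketch along the lines of that test (assuming an active SparkSession named `spark`; the data and names are illustrative):

    import pandas as pd
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))

    @pandas_udf("id long, v int", PandasUDFType.GROUPED_MAP)
    def invalid_positional_types(pdf):
        # Returns a string where the schema declares a long, so the
        # pandas-to-Arrow conversion on the worker fails.
        return pd.DataFrame([("a", 1.2)])

    try:
        df.groupby("id").apply(invalid_positional_types).collect()
    except Exception as e:
        print(e)
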

