Posted to commits@spark.apache.org by li...@apache.org on 2018/02/16 17:41:21 UTC
spark git commit: [SPARK-23446][PYTHON] Explicitly check supported types in toPandas
Repository: spark
Updated Branches:
refs/heads/master 1dc2c1d5e -> c5857e496
[SPARK-23446][PYTHON] Explicitly check supported types in toPandas
## What changes were proposed in this pull request?
This PR explicitly specifies and checks the types we support in `toPandas`. This was a gap: for example, binary type support is not finished on the Python side yet, but it is currently allowed, as shown below:
```python
spark.conf.set("spark.sql.execution.arrow.enabled", "false")
df = spark.createDataFrame([[bytearray(b"a")]])
df.toPandas()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df.toPandas()
```
```
     _1
0  [97]

  _1
0  a
```
This should be disallowed; I think the same applies to nested timestamps as well.
I also added a nicer note about `spark.sql.execution.arrow.enabled` to the error message.
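After this change, the second `toPandas()` call above is expected to fail fast instead of returning a silently mis-converted column. Roughly (a sketch; the exact error text may differ slightly):
```python
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df = spark.createDataFrame([[bytearray(b"a")]])
try:
    df.toPandas()
except Exception as e:
    # Expected to read roughly like:
    #   Unsupported type in conversion to Arrow: BinaryType
    #   Note: toPandas attempted Arrow optimization because
    #   'spark.sql.execution.arrow.enabled' is set to true. Please set it
    #   to false to disable this.
    print(e)
```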
## How was this patch tested?
Manually tested and tests added in `python/pyspark/sql/tests.py`.
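For a local run of just the SQL Python tests, something along these lines should work from the root of a Spark checkout with pandas and pyarrow installed (the command is a suggestion, not part of the patch):
```
./python/run-tests --modules=pyspark-sql
```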
Author: hyukjinkwon <gu...@gmail.com>
Closes #20625 from HyukjinKwon/pandas_convertion_supported_type.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5857e49
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5857e49
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5857e49
Branch: refs/heads/master
Commit: c5857e496ff0d170ed0339f14afc7d36b192da6d
Parents: 1dc2c1d
Author: hyukjinkwon <gu...@gmail.com>
Authored: Fri Feb 16 09:41:17 2018 -0800
Committer: gatorsmile <ga...@gmail.com>
Committed: Fri Feb 16 09:41:17 2018 -0800
----------------------------------------------------------------------
python/pyspark/sql/dataframe.py | 15 +++++++++------
python/pyspark/sql/tests.py | 9 ++++++++-
2 files changed, 17 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/c5857e49/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 5cc8b63..f37777e 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1988,10 +1988,11 @@ class DataFrame(object):
         if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
             try:
                 from pyspark.sql.types import _check_dataframe_convert_date, \
-                    _check_dataframe_localize_timestamps
+                    _check_dataframe_localize_timestamps, to_arrow_schema
                 from pyspark.sql.utils import require_minimum_pyarrow_version
-                import pyarrow
                 require_minimum_pyarrow_version()
+                import pyarrow
+                to_arrow_schema(self.schema)
                 tables = self._collectAsArrow()
                 if tables:
                     table = pyarrow.concat_tables(tables)
@@ -2000,10 +2001,12 @@ class DataFrame(object):
                     return _check_dataframe_localize_timestamps(pdf, timezone)
                 else:
                     return pd.DataFrame.from_records([], columns=self.columns)
-            except ImportError as e:
-                msg = "note: pyarrow must be installed and available on calling Python process " \
-                      "if using spark.sql.execution.arrow.enabled=true"
-                raise ImportError("%s\n%s" % (_exception_message(e), msg))
+            except Exception as e:
+                msg = (
+                    "Note: toPandas attempted Arrow optimization because "
+                    "'spark.sql.execution.arrow.enabled' is set to true. Please set it to false "
+                    "to disable this.")
+                raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
         else:
             pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
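The shape of the change: the Spark schema is converted to an Arrow schema up front via `to_arrow_schema(self.schema)`, so unsupported column types are rejected before any rows are collected, and any failure on the Arrow path is re-raised with a pointer to the config that enabled it. Stripped of Spark internals, the pattern looks roughly like the sketch below; `SUPPORTED`, `check_schema`, `to_pandas_with_hint` and `arrow_convert` are illustrative stand-ins, not PySpark APIs:
```python
from pyspark.sql.types import (
    DoubleType, IntegerType, LongType, StringType, TimestampType)

# Illustrative whitelist only; the real check is to_arrow_schema(self.schema),
# which raises for anything the Arrow conversion does not handle yet.
SUPPORTED = (DoubleType, IntegerType, LongType, StringType, TimestampType)

def check_schema(schema):
    # Validate the whole schema eagerly, before any data is collected.
    for field in schema.fields:
        if not isinstance(field.dataType, SUPPORTED):
            raise TypeError(
                "Unsupported type in conversion to Arrow: %s" % field.dataType)

def to_pandas_with_hint(df):
    try:
        check_schema(df.schema)
        return arrow_convert(df)  # hypothetical stand-in for the Arrow path
    except Exception as e:
        # Re-raise with a note naming the config that enabled this path,
        # so users know how to fall back to the non-Arrow conversion.
        raise RuntimeError(
            "%s\nNote: toPandas attempted Arrow optimization because "
            "'spark.sql.execution.arrow.enabled' is set to true. "
            "Please set it to false to disable this." % e)
```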
http://git-wip-us.apache.org/repos/asf/spark/blob/c5857e49/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 2af218a..1965307 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3497,7 +3497,14 @@ class ArrowTests(ReusedSQLTestCase):
         schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
         df = self.spark.createDataFrame([(None,)], schema=schema)
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, 'Unsupported data type'):
+            with self.assertRaisesRegexp(Exception, 'Unsupported type'):
+                df.toPandas()
+
+        df = self.spark.createDataFrame([(None,)], schema="a binary")
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(
+                    Exception,
+                    'Unsupported type.*\nNote: toPandas attempted Arrow optimization because'):
                 df.toPandas()

     def test_null_conversion(self):