You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/06/27 01:23:04 UTC

[spark] branch master updated: [SPARK-39574][PS] Better error message when `ps.Index` is used for DataFrame/Series creation

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ad8fbca84d [SPARK-39574][PS] Better error message when `ps.Index` is used for DataFrame/Series creation
5ad8fbca84d is described below

commit 5ad8fbca84d8f810954c14d9e404104ff64086bd
Author: Xinrong Meng <xi...@databricks.com>
AuthorDate: Sun Jun 26 18:22:48 2022 -0700

    [SPARK-39574][PS] Better error message when `ps.Index` is used for DataFrame/Series creation
    
    ### What changes were proposed in this pull request?
    Raise a clearer error message when a `ps.Index` is passed as the `index` argument during DataFrame/Series creation.
    
    ### Why are the changes needed?
    This change is part of [SPARK-39581](https://issues.apache.org/jira/browse/SPARK-39581).
    
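    The current error message is confusing: it is raised from inside the pandas library and does not mention that a pandas-on-Spark index was passed. We should raise a clear pandas-on-Spark error instead.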
    
    ### Does this PR introduce _any_ user-facing change?
    Yes.
    **From:**
    ```py
    >>> ps.DataFrame([1, 2], index=ps.Index([1, 2]))
    Traceback (most recent call last):
    ...
    ValueError: The truth value of a Int64Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
    
    >>> ps.Series([1, 2], index=ps.Index([1, 2]))
    Traceback (most recent call last):
    ...
    ValueError: The truth value of a Int64Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
    ```
    
    **To:**
    ```py
    >>> ps.DataFrame([1, 2], index=ps.Index([1, 2]))
    Traceback (most recent call last):
    ...
    TypeError: The given index cannot be a pandas-on-Spark index. Try pandas index or array-like.
    
    >>> ps.Series([1, 2], index=ps.Index([1, 2]))
    Traceback (most recent call last):
    ...
    TypeError: The given index cannot be a pandas-on-Spark index. Try pandas index or array-like.
    ```
    
    ### How was this patch tested?
    Unit tests.
    
    Closes #36981 from xinrong-databricks/err_pd.
    
    Authored-by: Xinrong Meng <xi...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/pandas/frame.py                |  7 +++++++
 python/pyspark/pandas/series.py               |  8 ++++++++
 python/pyspark/pandas/tests/test_dataframe.py |  9 +++++++++
 python/pyspark/pandas/tests/test_series.py    | 10 ++++++++++
 4 files changed, 34 insertions(+)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index dd1ed489ecd..fc22f2f6989 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -452,6 +452,13 @@ class DataFrame(Frame, Generic[T]):
                 assert not copy
                 pdf = data
             else:
+                from pyspark.pandas.indexes.base import Index
+
+                if isinstance(index, Index):
+                    raise TypeError(
+                        "The given index cannot be a pandas-on-Spark index. "
+                        "Try pandas index or array-like."
+                    )
                 pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
             internal = InternalFrame.from_pandas(pdf)
 
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 352e7dd750b..a7852c110f7 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -405,6 +405,14 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
                 assert not fastpath
                 s = data
             else:
+                from pyspark.pandas.indexes.base import Index
+
+                if isinstance(index, Index):
+                    raise TypeError(
+                        "The given index cannot be a pandas-on-Spark index. "
+                        "Try pandas index or array-like."
+                    )
+
                 s = pd.Series(
                     data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
                 )
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 51ef473159e..f9496f5e0db 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -96,6 +96,15 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
         index_cols = pdf.columns[column_mask]
         self.assert_eq(psdf[index_cols], pdf[index_cols])
 
+    def test_creation_index(self):
+        err_msg = (
+            "The given index cannot be a pandas-on-Spark index. Try pandas index or array-like."
+        )
+        with self.assertRaisesRegex(TypeError, err_msg):
+            ps.DataFrame([1, 2], index=ps.Index([1, 2]))
+        with self.assertRaisesRegex(TypeError, err_msg):
+            ps.DataFrame([1, 2], index=ps.MultiIndex.from_tuples([(1, 3), (2, 4)]))
+
     def _check_extension(self, psdf, pdf):
         if LooseVersion("1.1") <= LooseVersion(pd.__version__) < LooseVersion("1.2.2"):
             self.assert_eq(psdf, pdf, check_exact=False)
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 7631444ee5d..f513ea11bd8 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -54,6 +54,16 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
     def psser(self):
         return ps.from_pandas(self.pser)
 
+    def test_creation_index(self):
+        err_msg = (
+            "The given index cannot be a pandas-on-Spark index. Try pandas index or array-like."
+        )
+        with self.assertRaisesRegex(TypeError, err_msg):
+            ps.Series([1, 2], index=ps.Index([1, 2]))
+
+        with self.assertRaisesRegex(TypeError, err_msg):
+            ps.Series([1, 2], index=ps.MultiIndex.from_tuples([(1, 3), (2, 4)]))
+
     def test_series_ops(self):
         pser = self.pser
         psser = self.psser


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org