You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/06 06:36:13 UTC

[spark] branch master updated: [SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c6f0b9  [SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
5c6f0b9 is described below

commit 5c6f0b9263f29f805f386237448f671dea3ad6c5
Author: dchvn nguyen <dg...@viettel.com.vn>
AuthorDate: Wed Oct 6 15:35:32 2021 +0900

    [SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
    
    ### What changes were proposed in this pull request?
    Add a `dtypes` property to `ps.MultiIndex` that returns the data type of each index level as a pandas Series.
    
    ### Why are the changes needed?
    pandas' `MultiIndex` exposes `dtypes`, but `ps.MultiIndex` does not, so accessing it raises an `AttributeError`.
    
    Before this PR:
    
    ```python
    >>> idx = pd.MultiIndex.from_arrays([[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]], names=("zero", "one"))
    >>> pdf = pd.DataFrame(
    ...     {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
    ...     index=idx,
    ... )
    >>> psdf = ps.from_pandas(pdf)
    >>>
    >>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/u02/spark/python/pyspark/pandas/indexes/multi.py", line 917, in __getattr__
        raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))
    AttributeError: 'MultiIndex' object has no attribute 'dtypes'
    >>>
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. After this PR, users can use `MultiIndex.dtypes`, for example:
    
    ``` python
    >>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
    typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
    ```
    
    ### How was this patch tested?
    Added a unit test (`test_multi_index_dtypes`) that compares the result with pandas.
    
    Closes #34179 from dchvn/add_multiindex_dtypes.
    
    Lead-authored-by: dchvn nguyen <dg...@viettel.com.vn>
    Co-authored-by: dch nguyen <dg...@viettel.com.vn>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../source/reference/pyspark.pandas/indexing.rst   |  1 +
 python/pyspark/pandas/indexes/multi.py             | 29 ++++++++++++++++++++++
 python/pyspark/pandas/tests/test_dataframe.py      | 14 +++++++++++
 3 files changed, 44 insertions(+)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 7e796c6..4168b67 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -240,6 +240,7 @@ MultiIndex Properties
    MultiIndex.nlevels
    MultiIndex.levshape
    MultiIndex.values
+   MultiIndex.dtypes
 
 MultiIndex components
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index cff3e26..896ea2a 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -375,6 +375,35 @@ class MultiIndex(Index):
     def name(self, name: Name) -> None:
         raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
 
+    @property
+    def dtypes(self) -> pd.Series:
+        """Return the dtypes as a Series for the underlying MultiIndex.
+
+        .. versionadded:: 3.3.0
+
+        Returns
+        -------
+        pd.Series
+            The data type of each level, labelled by the level name. A level
+            without a name is labelled ``level_<position>``.
+
+        Examples
+        --------
+        >>> psmidx = ps.MultiIndex.from_arrays(
+        ...     [[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]],
+        ...     names=("zero", "one"),
+        ... )
+        >>> psmidx.dtypes
+        zero    int64
+        one     int64
+        dtype: object
+
+        Unnamed levels get a positional placeholder label.
+
+        >>> ps.MultiIndex.from_arrays([[0, 1], [2, 3]]).dtypes
+        level_0    int64
+        level_1    int64
+        dtype: object
+        """
+        labels = []
+        for position, name in enumerate(self._internal.index_names):
+            if name is None:
+                # An unnamed level is stored as None; calling len() on it would
+                # raise TypeError. Use the same placeholder label that pandas'
+                # own MultiIndex.dtypes uses for unnamed levels.
+                labels.append("level_{}".format(position))
+            elif len(name) > 1:
+                # Multi-label name: keep the whole tuple as the label.
+                labels.append(name)
+            else:
+                # Single-label name: unwrap the one-element tuple.
+                labels.append(name[0])
+        return pd.Series(
+            [field.dtype for field in self._internal.index_fields],
+            index=pd.Index(labels),
+        )
+
     def _verify_for_rename(self, name: List[Name]) -> List[Label]:  # type: ignore[override]
         if is_list_like(name):
             if self._internal.index_level != len(name):
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 1ae009c..800fa46 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -6000,6 +6000,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
             expected_pdf = pd.DataFrame({"A": [None, 0], "B": [4.0, 1.0], "C": [3, 3]})
             self.assert_eq(expected_pdf, psdf1.combine_first(psdf2))
 
+    def test_multi_index_dtypes(self):
+        # SPARK-36930: Support ps.MultiIndex.dtypes
+        level_values = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
+
+        # First plain string level names, then multi-label (tuple) level names.
+        for level_names in (
+            ("number", "color"),
+            [("zero", "first"), ("one", "second")],
+        ):
+            pandas_midx = pd.MultiIndex.from_arrays(level_values, names=level_names)
+            spark_midx = ps.from_pandas(pandas_midx)
+
+            self.assert_eq(spark_midx.dtypes, pandas_midx.dtypes)
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.test_dataframe import *  # noqa: F401

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org