You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/06 06:36:13 UTC
[spark] branch master updated: [SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5c6f0b9 [SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
5c6f0b9 is described below
commit 5c6f0b9263f29f805f386237448f671dea3ad6c5
Author: dchvn nguyen <dg...@viettel.com.vn>
AuthorDate: Wed Oct 6 15:35:32 2021 +0900
[SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
### What changes were proposed in this pull request?
Add a `dtypes` property to `ps.MultiIndex` that returns the data type of each index level as a pandas Series.
### Why are the changes needed?
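pandas-on-Spark `MultiIndex` has no `dtypes` attribute, so code that relies on it (for example, building a `ps.DataFrame[...]` type hint from `psdf.index.dtypes`) fails with an `AttributeError`.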
Before this PR:
```python
>>> idx = pd.MultiIndex.from_arrays([[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]], names=("zero", "one"))
>>> pdf = pd.DataFrame(
... {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
... index=idx,
... )
>>> psdf = ps.from_pandas(pdf)
>>>
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/u02/spark/python/pyspark/pandas/indexes/multi.py", line 917, in __getattr__
raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))
AttributeError: 'MultiIndex' object has no attribute 'dtypes'
>>>
```
### Does this PR introduce _any_ user-facing change?
Yes. After this PR, users can access `MultiIndex.dtypes`, for example:
```python
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```
### How was this patch tested?
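Added a unit test (`test_multi_index_dtypes` in `python/pyspark/pandas/tests/test_dataframe.py`) that compares the result with pandas' `MultiIndex.dtypes`.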
Closes #34179 from dchvn/add_multiindex_dtypes.
Lead-authored-by: dchvn nguyen <dg...@viettel.com.vn>
Co-authored-by: dch nguyen <dg...@viettel.com.vn>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
python/pyspark/pandas/indexes/multi.py | 29 ++++++++++++++++++++++
python/pyspark/pandas/tests/test_dataframe.py | 14 +++++++++++
3 files changed, 44 insertions(+)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 7e796c6..4168b67 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -240,6 +240,7 @@ MultiIndex Properties
MultiIndex.nlevels
MultiIndex.levshape
MultiIndex.values
+ MultiIndex.dtypes
MultiIndex components
~~~~~~~~~~~~~~~~~~~~~
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index cff3e26..896ea2a 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -375,6 +375,35 @@ class MultiIndex(Index):
def name(self, name: Name) -> None:
raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
+ @property
+ def dtypes(self) -> pd.Series:
+ """Return the dtypes as a Series for the underlying MultiIndex.
+
+ .. versionadded:: 3.3.0
+
+ Returns
+ -------
+ pd.Series
+ The data type of each level.
+
+ Examples
+ --------
+ >>> psmidx = ps.MultiIndex.from_arrays(
+ ... [[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]],
+ ... names=("zero", "one"),
+ ... )
+ >>> psmidx.dtypes
+ zero int64
+ one int64
+ dtype: object
+ """
+ return pd.Series(
+ [field.dtype for field in self._internal.index_fields],
+ index=pd.Index(
+ [name if len(name) > 1 else name[0] for name in self._internal.index_names]
+ ),
+ )
+
def _verify_for_rename(self, name: List[Name]) -> List[Label]: # type: ignore[override]
if is_list_like(name):
if self._internal.index_level != len(name):
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index 1ae009c..800fa46 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -6000,6 +6000,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
expected_pdf = pd.DataFrame({"A": [None, 0], "B": [4.0, 1.0], "C": [3, 3]})
self.assert_eq(expected_pdf, psdf1.combine_first(psdf2))
+ def test_multi_index_dtypes(self):
+ # SPARK-36930: Support ps.MultiIndex.dtypes
+ arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
+ pmidx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
+ psmidx = ps.from_pandas(pmidx)
+
+ self.assert_eq(psmidx.dtypes, pmidx.dtypes)
+
+ # multiple labels
+ pmidx = pd.MultiIndex.from_arrays(arrays, names=[("zero", "first"), ("one", "second")])
+ psmidx = ps.from_pandas(pmidx)
+
+ self.assert_eq(psmidx.dtypes, pmidx.dtypes)
+
if __name__ == "__main__":
from pyspark.pandas.tests.test_dataframe import * # noqa: F401
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org