Posted to commits@spark.apache.org by ue...@apache.org on 2021/07/24 00:20:29 UTC

[spark] branch branch-3.2 updated: [SPARK-36264][PYTHON] Add reorder_categories to CategoricalAccessor and CategoricalIndex

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new ab5224c  [SPARK-36264][PYTHON] Add reorder_categories to CategoricalAccessor and CategoricalIndex
ab5224c is described below

commit ab5224c45b7ffa26502938cb5fcfea37977d9f3e
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Fri Jul 23 17:19:20 2021 -0700

    [SPARK-36264][PYTHON] Add reorder_categories to CategoricalAccessor and CategoricalIndex
    
    ### What changes were proposed in this pull request?
    
    Add `reorder_categories` to `CategoricalAccessor` and `CategoricalIndex`.
    
    ### Why are the changes needed?
    
    We should implement `reorder_categories` in `CategoricalAccessor` and `CategoricalIndex`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, users will be able to use `reorder_categories`.
    
    ### How was this patch tested?
    
    Added some tests.
    
    Closes #33499 from ueshin/issues/SPARK-36264/reorder_categories.
    
    Authored-by: Takuya UESHIN <ue...@databricks.com>
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
    (cherry picked from commit e12bc4d31df7f32f4ecf079d8ace6fd34df770d7)
    Signed-off-by: Takuya UESHIN <ue...@databricks.com>
---
 .../source/reference/pyspark.pandas/indexing.rst   |  1 +
 .../source/reference/pyspark.pandas/series.rst     |  1 +
 python/pyspark/pandas/categorical.py               | 93 +++++++++++++++++++++-
 python/pyspark/pandas/indexes/category.py          | 52 ++++++++++++
 python/pyspark/pandas/missing/indexes.py           |  1 -
 .../pyspark/pandas/tests/indexes/test_category.py  | 21 +++++
 python/pyspark/pandas/tests/test_categorical.py    | 37 +++++++++
 7 files changed, 202 insertions(+), 4 deletions(-)
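
Before the patch itself, a minimal usage sketch of the new API, adapted from the
docstring examples added in this commit (assumes a running Spark session and that
pyspark.pandas imports cleanly; the output shown in comments is illustrative):

    import pyspark.pandas as ps

    # Series accessor: return a copy with the categories in the new order,
    # optionally marking the dtype as ordered.
    s = ps.Series(list("abbccc"), dtype="category")
    s.cat.reorder_categories(["c", "b", "a"], ordered=True)
    # Categories (3, object): ['c' < 'b' < 'a']

    # CategoricalIndex: returns a new index; inplace=True is not supported here.
    idx = ps.CategoricalIndex(list("abbccc"))
    idx.reorder_categories(["c", "b", "a"])
    # CategoricalIndex([...], categories=['c', 'b', 'a'], ordered=False, dtype='category')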

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 7115018..ebf332e 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -174,6 +174,7 @@ Categorical components
    CategoricalIndex.codes
    CategoricalIndex.categories
    CategoricalIndex.ordered
+   CategoricalIndex.reorder_categories
    CategoricalIndex.add_categories
    CategoricalIndex.remove_categories
    CategoricalIndex.remove_unused_categories
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index c3a86bc..95e102f 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -399,6 +399,7 @@ the ``Series.cat`` accessor.
    Series.cat.categories
    Series.cat.ordered
    Series.cat.codes
+   Series.cat.reorder_categories
    Series.cat.add_categories
    Series.cat.remove_categories
    Series.cat.remove_unused_categories
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index cc0c68c..ce9b3ed 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -587,9 +587,96 @@ class CategoricalAccessor(object):
             return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
 
     def reorder_categories(
-        self, new_categories: pd.Index, ordered: bool = None, inplace: bool = False
-    ) -> "ps.Series":
-        raise NotImplementedError()
+        self,
+        new_categories: Union[pd.Index, List],
+        ordered: Optional[bool] = None,
+        inplace: bool = False,
+    ) -> Optional["ps.Series"]:
+        """
+        Reorder categories as specified in new_categories.
+
+        `new_categories` need to include all old categories and no new category
+        items.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, optional
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
+        inplace : bool, default False
+           Whether or not to reorder the categories inplace or return a copy of
+           this categorical with reordered categories.
+
+        Returns
+        -------
+        cat : Series or None
+            Categorical with reordered categories or None if ``inplace=True``.
+
+        Raises
+        ------
+        ValueError
+            If the new categories do not contain all old category items or any
+            new ones
+
+        Examples
+        --------
+        >>> s = ps.Series(list("abbccc"), dtype="category")
+        >>> s  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+
+        >>> s.cat.reorder_categories(['c', 'b', 'a'], ordered=True)  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['c' < 'b' < 'a']
+        """
+        if not is_list_like(new_categories):
+            raise TypeError(
+                "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
+            )
+        elif len(set(new_categories)) != len(set(self.categories)) or any(
+            cat not in self.categories for cat in new_categories
+        ):
+            raise ValueError("items in new_categories are not the same as in old categories")
+
+        if ordered is None:
+            ordered = self.ordered
+
+        if new_categories == list(self.categories) and ordered == self.ordered:
+            if inplace:
+                return None
+            else:
+                psser = self._data
+                return psser._with_new_scol(
+                    psser.spark.column, field=psser._internal.data_fields[0]
+                )
+        else:
+            dtype = CategoricalDtype(categories=new_categories, ordered=ordered)
+            psser = self._data.astype(dtype)
+
+            if inplace:
+                internal = self._data._psdf._internal.with_new_spark_column(
+                    self._data._column_label,
+                    psser.spark.column,
+                    field=psser._internal.data_fields[0],
+                )
+                self._data._psdf._update_internal_frame(internal)
+                return None
+            else:
+                return psser
 
     def set_categories(
         self,
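
Implementation note: as the hunk above shows, a non-trivial reorder is carried out by
casting the underlying Series to a new CategoricalDtype. A rough sketch of that
equivalence (not part of the public API contract, just what the patch does internally):

    import pyspark.pandas as ps
    from pandas.api.types import CategoricalDtype

    s = ps.Series(list("abbccc"), dtype="category")

    # Same categories, new order: effectively what reorder_categories returns.
    dtype = CategoricalDtype(categories=["c", "b", "a"], ordered=True)
    s.astype(dtype)
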
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index cc23fab..51b14bc 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -386,6 +386,58 @@ class CategoricalIndex(Index):
 
         return CategoricalIndex(self.to_series().cat.remove_unused_categories()).rename(self.name)
 
+    def reorder_categories(
+        self,
+        new_categories: Union[pd.Index, Any, List],
+        ordered: Optional[bool] = None,
+        inplace: bool = False,
+    ) -> Optional["CategoricalIndex"]:
+        """
+        Reorder categories as specified in new_categories.
+
+        `new_categories` need to include all old categories and no new category
+        items.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, optional
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
+        inplace : bool, default False
+           Whether or not to reorder the categories inplace or return a copy of
+           this categorical with reordered categories.
+
+        Returns
+        -------
+        cat : CategoricalIndex or None
+            Categorical with reordered categories or None if ``inplace=True``.
+
+        Raises
+        ------
+        ValueError
+            If the new categories do not contain all old category items or any
+            new ones
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc"))
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+        >>> idx.reorder_categories(['c', 'b', 'a'])  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['c', 'b', 'a'], ordered=False, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(
+            self.to_series().cat.reorder_categories(new_categories=new_categories, ordered=ordered)
+        ).rename(self.name)
+
     def __getattr__(self, item: str) -> Any:
         if hasattr(MissingPandasLikeCategoricalIndex, item):
             property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
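
Note that CategoricalIndex.reorder_categories delegates to the Series accessor and, like
its sibling category methods on indexes, rejects in-place modification. A small sketch of
the error cases, mirroring the tests added later in this patch (error messages taken from
the code above):

    import pyspark.pandas as ps

    idx = ps.CategoricalIndex([1, 2, 3])
    try:
        idx.reorder_categories([3, 2, 1], inplace=True)
    except ValueError as e:
        print(e)  # cannot use inplace with CategoricalIndex

    s = ps.Series([1, 2, 3], dtype="category")
    try:
        # new_categories must be exactly the old categories, only reordered.
        s.cat.reorder_categories([1, 2])
    except ValueError as e:
        print(e)  # items in new_categories are not the same as in old categories
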
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index c81657f..0f1c316 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -123,7 +123,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
 class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
 
     # Functions
-    reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
     set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
     map = _unsupported_function("map", cls="CategoricalIndex")
 
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index c13e9ac..a11f36a 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -130,6 +130,27 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
 
         self.assertRaises(ValueError, lambda: psidx.remove_unused_categories(inplace=True))
 
+    def test_reorder_categories(self):
+        pidx = pd.CategoricalIndex([1, 2, 3])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(pidx.reorder_categories([1, 2, 3]), psidx.reorder_categories([1, 2, 3]))
+        self.assert_eq(
+            pidx.reorder_categories([1, 2, 3], ordered=True),
+            psidx.reorder_categories([1, 2, 3], ordered=True),
+        )
+        self.assert_eq(pidx.reorder_categories([3, 2, 1]), psidx.reorder_categories([3, 2, 1]))
+        self.assert_eq(
+            pidx.reorder_categories([3, 2, 1], ordered=True),
+            psidx.reorder_categories([3, 2, 1], ordered=True),
+        )
+
+        self.assertRaises(ValueError, lambda: pidx.reorder_categories([1, 2, 3], inplace=True))
+        self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2]))
+        self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 4]))
+        self.assertRaises(ValueError, lambda: psidx.reorder_categories([1, 2, 2]))
+        self.assertRaises(TypeError, lambda: psidx.reorder_categories(1))
+
     def test_as_ordered_unordered(self):
         pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
         psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index d9e232a..4122efa 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -141,6 +141,43 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
+    def test_reorder_categories(self):
+        pdf, psdf = self.df_pair
+
+        pser = pdf.a
+        psser = psdf.a
+
+        self.assert_eq(
+            pser.cat.reorder_categories([1, 2, 3]), psser.cat.reorder_categories([1, 2, 3])
+        )
+        self.assert_eq(
+            pser.cat.reorder_categories([1, 2, 3], ordered=True),
+            psser.cat.reorder_categories([1, 2, 3], ordered=True),
+        )
+        self.assert_eq(
+            pser.cat.reorder_categories([3, 2, 1]), psser.cat.reorder_categories([3, 2, 1])
+        )
+        self.assert_eq(
+            pser.cat.reorder_categories([3, 2, 1], ordered=True),
+            psser.cat.reorder_categories([3, 2, 1], ordered=True),
+        )
+
+        pser.cat.reorder_categories([1, 2, 3], inplace=True)
+        psser.cat.reorder_categories([1, 2, 3], inplace=True)
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
+        pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
+        psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
+        self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2]))
+        self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2, 4]))
+        self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2, 2]))
+        self.assertRaises(TypeError, lambda: psser.cat.reorder_categories(1))
+        self.assertRaises(TypeError, lambda: psdf.b.cat.reorder_categories("abcd"))
+
     def test_as_ordered_unordered(self):
         pdf, psdf = self.df_pair
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org