You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/27 03:18:27 UTC

[spark] branch branch-3.2 updated: [SPARK-36267][PYTHON] Clean up CategoricalAccessor and CategoricalIndex

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new f278f77  [SPARK-36267][PYTHON] Clean up CategoricalAccessor and CategoricalIndex
f278f77 is described below

commit f278f771e69d19152879dba9b477d9beaa654638
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Tue Jul 27 12:17:18 2021 +0900

    [SPARK-36267][PYTHON] Clean up CategoricalAccessor and CategoricalIndex
    
    ### What changes were proposed in this pull request?
    
    Clean up `CategoricalAccessor` and `CategoricalIndex`.
    
    - Clean up the classes
    - Add deprecation warnings
    - Clean up the docs
    
    ### Why are the changes needed?
    
    To finalize the series of PRs for `CategoricalAccessor` and `CategoricalIndex`, we should clean up the classes.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
    
    Closes #33528 from ueshin/issues/SPARK-36267/cleanup.
    
    Authored-by: Takuya UESHIN <ue...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit c40d9d46f12d5909bfe18be6376d5216ef320782)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../source/reference/pyspark.pandas/indexing.rst   |   4 +-
 .../source/reference/pyspark.pandas/series.rst     |   4 +-
 python/pyspark/pandas/categorical.py               |  87 +++++++++++
 python/pyspark/pandas/indexes/category.py          | 166 +++++++++++++--------
 4 files changed, 196 insertions(+), 65 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index cf898aa..677d80f 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -174,14 +174,14 @@ Categorical components
    CategoricalIndex.codes
    CategoricalIndex.categories
    CategoricalIndex.ordered
+   CategoricalIndex.rename_categories
    CategoricalIndex.reorder_categories
    CategoricalIndex.add_categories
    CategoricalIndex.remove_categories
    CategoricalIndex.remove_unused_categories
+   CategoricalIndex.set_categories
    CategoricalIndex.as_ordered
    CategoricalIndex.as_unordered
-   CategoricalIndex.rename_categories
-   CategoricalIndex.set_categories
 
 .. _api.multiindex:
 
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 717c762..3c7de4d 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -399,14 +399,14 @@ the ``Series.cat`` accessor.
    Series.cat.categories
    Series.cat.ordered
    Series.cat.codes
+   Series.cat.rename_categories
    Series.cat.reorder_categories
    Series.cat.add_categories
    Series.cat.remove_categories
    Series.cat.remove_unused_categories
+   Series.cat.set_categories
    Series.cat.as_ordered
    Series.cat.as_unordered
-   Series.cat.rename_categories
-   Series.cat.set_categories
 
 .. _api.series.plot:
 
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index cae9ab1..c12227e 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
+import warnings
 
 import pandas as pd
 from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like
@@ -184,6 +185,8 @@ class CategoricalAccessor(object):
            Whether or not to add the categories inplace or return a copy of
            this categorical with added categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         Series or None
@@ -195,6 +198,14 @@ class CategoricalAccessor(object):
             If the new categories include old categories or do not validate as
             categories
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> s = ps.Series(list("abbccc"), dtype="category")
@@ -220,6 +231,13 @@ class CategoricalAccessor(object):
         """
         from pyspark.pandas.frame import DataFrame
 
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in add_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         if is_list_like(new_categories):
             categories = list(new_categories)  # type: List
         else:
@@ -367,6 +385,8 @@ class CategoricalAccessor(object):
            Whether or not to remove the categories inplace or return a copy of
            this categorical with removed categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         Series or None
@@ -377,6 +397,14 @@ class CategoricalAccessor(object):
         ValueError
             If the removals are not contained in the categories
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> s = ps.Series(list("abbccc"), dtype="category")
@@ -400,6 +428,13 @@ class CategoricalAccessor(object):
         dtype: category
         Categories (2, object): ['a', 'c']
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in remove_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         if is_list_like(removals):
             categories = [cat for cat in removals if cat is not None]  # type: List
         elif removals is None:
@@ -451,11 +486,21 @@ class CategoricalAccessor(object):
            Whether or not to drop unused categories inplace or return a copy of
            this categorical with unused categories dropped.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         cat : Series or None
             Categorical with unused categories dropped or None if ``inplace=True``.
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
@@ -479,6 +524,13 @@ class CategoricalAccessor(object):
         dtype: category
         Categories (3, object): ['a', 'b', 'c']
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in remove_unused_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         categories = set(self._data.drop_duplicates().to_pandas())
         removals = [cat for cat in self.categories if cat not in categories]
         return self.remove_categories(removals=removals, inplace=inplace)
@@ -510,6 +562,8 @@ class CategoricalAccessor(object):
             Whether or not to rename the categories inplace or return a copy of
             this categorical with renamed categories.
 
+            .. deprecated:: 3.2.0
+
         Returns
         -------
         cat : Series or None
@@ -560,6 +614,13 @@ class CategoricalAccessor(object):
         """
         from pyspark.pandas.frame import DataFrame
 
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in rename_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         if is_dict_like(new_categories):
             categories = [cast(dict, new_categories).get(item, item) for item in self.categories]
         elif callable(new_categories):
@@ -611,6 +672,8 @@ class CategoricalAccessor(object):
            Whether or not to reorder the categories inplace or return a copy of
            this categorical with reordered categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         cat : Series or None
@@ -622,6 +685,14 @@ class CategoricalAccessor(object):
             If the new categories do not contain all old category items or any
             new ones
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> s = ps.Series(list("abbccc"), dtype="category")
@@ -645,6 +716,13 @@ class CategoricalAccessor(object):
         dtype: category
         Categories (3, object): ['c' < 'b' < 'a']
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in reorder_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         if not is_list_like(new_categories):
             raise TypeError(
                 "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
@@ -720,6 +798,8 @@ class CategoricalAccessor(object):
            Whether or not to reorder the categories in-place or return a copy
            of this categorical with reordered categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         Series with reordered categories or None if inplace.
@@ -782,6 +862,13 @@ class CategoricalAccessor(object):
         """
         from pyspark.pandas.frame import DataFrame
 
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in set_categories is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
+
         if not is_list_like(new_categories):
             raise TypeError(
                 "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index cd95e39..e2dbd33 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -221,6 +221,8 @@ class CategoricalIndex(Index):
            Whether or not to add the categories inplace or return a copy of
            this categorical with added categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         CategoricalIndex or None
@@ -232,6 +234,14 @@ class CategoricalIndex(Index):
             If the new categories include old categories or do not validate as
             categories
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> idx = ps.CategoricalIndex(list("abbccc"))
@@ -329,6 +339,8 @@ class CategoricalIndex(Index):
            Whether or not to remove the categories inplace or return a copy of
            this categorical with removed categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         CategoricalIndex or None
@@ -339,6 +351,14 @@ class CategoricalIndex(Index):
         ValueError
             If the removals are not contained in the categories
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> idx = ps.CategoricalIndex(list("abbccc"))
@@ -365,11 +385,21 @@ class CategoricalIndex(Index):
            Whether or not to drop unused categories inplace or return a copy of
            this categorical with unused categories dropped.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         cat : CategoricalIndex or None
             Categorical with unused categories dropped or None if ``inplace=True``.
 
+        See Also
+        --------
+        rename_categories : Rename categories.
+        reorder_categories : Reorder categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        set_categories : Set the categories to the specified ones.
+
         Examples
         --------
         >>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
@@ -386,67 +416,6 @@ class CategoricalIndex(Index):
 
         return CategoricalIndex(self.to_series().cat.remove_unused_categories()).rename(self.name)
 
-    def reorder_categories(
-        self,
-        new_categories: Union[pd.Index, Any, List],
-        ordered: Optional[bool] = None,
-        inplace: bool = False,
-    ) -> Optional["CategoricalIndex"]:
-        """
-        Reorder categories as specified in new_categories.
-
-        `new_categories` need to include all old categories and no new category
-        items.
-
-        Parameters
-        ----------
-        new_categories : Index-like
-           The categories in new order.
-        ordered : bool, optional
-           Whether or not the categorical is treated as a ordered categorical.
-           If not given, do not change the ordered information.
-        inplace : bool, default False
-           Whether or not to reorder the categories inplace or return a copy of
-           this categorical with reordered categories.
-
-        Returns
-        -------
-        cat : CategoricalIndex or None
-            Categorical with removed categories or None if ``inplace=True``.
-
-        Raises
-        ------
-        ValueError
-            If the new categories do not contain all old category items or any
-            new ones
-
-        Examples
-        --------
-        >>> idx = ps.CategoricalIndex(list("abbccc"))
-        >>> idx  # doctest: +NORMALIZE_WHITESPACE
-        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
-                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
-
-        >>> idx.reorder_categories(['c', 'b', 'a'])  # doctest: +NORMALIZE_WHITESPACE
-        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
-                         categories=['c', 'b', 'a'], ordered=False, dtype='category')
-        """
-        if inplace:
-            raise ValueError("cannot use inplace with CategoricalIndex")
-
-        return CategoricalIndex(
-            self.to_series().cat.reorder_categories(new_categories=new_categories, ordered=ordered)
-        ).rename(self.name)
-
-    def __getattr__(self, item: str) -> Any:
-        if hasattr(MissingPandasLikeCategoricalIndex, item):
-            property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
-            if isinstance(property_or_func, property):
-                return property_or_func.fget(self)  # type: ignore
-            else:
-                return partial(property_or_func, self)
-        raise AttributeError("'CategoricalIndex' object has no attribute '{}'".format(item))
-
     def rename_categories(
         self, new_categories: Union[list, dict, Callable], inplace: bool = False
     ) -> Optional["CategoricalIndex"]:
@@ -474,6 +443,8 @@ class CategoricalIndex(Index):
             Whether or not to rename the categories inplace or return a copy of
             this categorical with renamed categories.
 
+            .. deprecated:: 3.2.0
+
         Returns
         -------
         cat : CategoricalIndex or None
@@ -517,6 +488,68 @@ class CategoricalIndex(Index):
             self.name
         )
 
+    def reorder_categories(
+        self,
+        new_categories: Union[pd.Index, Any, List],
+        ordered: Optional[bool] = None,
+        inplace: bool = False,
+    ) -> Optional["CategoricalIndex"]:
+        """
+        Reorder categories as specified in new_categories.
+
+        `new_categories` need to include all old categories and no new category
+        items.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, optional
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
+        inplace : bool, default False
+           Whether or not to reorder the categories inplace or return a copy of
+           this categorical with reordered categories.
+
+           .. deprecated:: 3.2.0
+
+        Returns
+        -------
+        cat : CategoricalIndex or None
+            Categorical with reordered categories or None if ``inplace=True``.
+
+        Raises
+        ------
+        ValueError
+            If the new categories do not contain all old category items or any
+            new ones
+
+        See Also
+        --------
+        rename_categories : Rename categories.
+        add_categories : Add new categories.
+        remove_categories : Remove the specified categories.
+        remove_unused_categories : Remove categories which are not used.
+        set_categories : Set the categories to the specified ones.
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc"))
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+        >>> idx.reorder_categories(['c', 'b', 'a'])  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['c', 'b', 'a'], ordered=False, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(
+            self.to_series().cat.reorder_categories(new_categories=new_categories, ordered=ordered)
+        ).rename(self.name)
+
     def set_categories(
         self,
         new_categories: Union[pd.Index, List],
@@ -557,6 +590,8 @@ class CategoricalIndex(Index):
            Whether or not to reorder the categories in-place or return a copy
            of this categorical with reordered categories.
 
+           .. deprecated:: 3.2.0
+
         Returns
         -------
         CategoricalIndex with reordered categories or None if inplace.
@@ -598,6 +633,15 @@ class CategoricalIndex(Index):
             self.to_series().cat.set_categories(new_categories, ordered=ordered, rename=rename)
         ).rename(self.name)
 
+    def __getattr__(self, item: str) -> Any:
+        if hasattr(MissingPandasLikeCategoricalIndex, item):
+            property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
+            if isinstance(property_or_func, property):
+                return property_or_func.fget(self)  # type: ignore
+            else:
+                return partial(property_or_func, self)
+        raise AttributeError("'CategoricalIndex' object has no attribute '{}'".format(item))
+
 
 def _test() -> None:
     import os

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org