You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/27 03:18:27 UTC
[spark] branch branch-3.2 updated: [SPARK-36267][PYTHON] Clean up
CategoricalAccessor and CategoricalIndex
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new f278f77 [SPARK-36267][PYTHON] Clean up CategoricalAccessor and CategoricalIndex
f278f77 is described below
commit f278f771e69d19152879dba9b477d9beaa654638
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Tue Jul 27 12:17:18 2021 +0900
[SPARK-36267][PYTHON] Clean up CategoricalAccessor and CategoricalIndex
### What changes were proposed in this pull request?
Clean up `CategoricalAccessor` and `CategoricalIndex`.
- Clean up the classes
- Add deprecation warnings
- Clean up the docs
### Why are the changes needed?
To finalize the series of PRs for `CategoricalAccessor` and `CategoricalIndex`, we should clean up the classes.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #33528 from ueshin/issues/SPARK-36267/cleanup.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit c40d9d46f12d5909bfe18be6376d5216ef320782)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../source/reference/pyspark.pandas/indexing.rst | 4 +-
.../source/reference/pyspark.pandas/series.rst | 4 +-
python/pyspark/pandas/categorical.py | 87 +++++++++++
python/pyspark/pandas/indexes/category.py | 166 +++++++++++++--------
4 files changed, 196 insertions(+), 65 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index cf898aa..677d80f 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -174,14 +174,14 @@ Categorical components
CategoricalIndex.codes
CategoricalIndex.categories
CategoricalIndex.ordered
+ CategoricalIndex.rename_categories
CategoricalIndex.reorder_categories
CategoricalIndex.add_categories
CategoricalIndex.remove_categories
CategoricalIndex.remove_unused_categories
+ CategoricalIndex.set_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
- CategoricalIndex.rename_categories
- CategoricalIndex.set_categories
.. _api.multiindex:
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 717c762..3c7de4d 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -399,14 +399,14 @@ the ``Series.cat`` accessor.
Series.cat.categories
Series.cat.ordered
Series.cat.codes
+ Series.cat.rename_categories
Series.cat.reorder_categories
Series.cat.add_categories
Series.cat.remove_categories
Series.cat.remove_unused_categories
+ Series.cat.set_categories
Series.cat.as_ordered
Series.cat.as_unordered
- Series.cat.rename_categories
- Series.cat.set_categories
.. _api.series.plot:
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index cae9ab1..c12227e 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -15,6 +15,7 @@
# limitations under the License.
#
from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
+import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like
@@ -184,6 +185,8 @@ class CategoricalAccessor(object):
Whether or not to add the categories inplace or return a copy of
this categorical with added categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
Series or None
@@ -195,6 +198,14 @@ class CategoricalAccessor(object):
If the new categories include old categories or do not validate as
categories
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> s = ps.Series(list("abbccc"), dtype="category")
@@ -220,6 +231,13 @@ class CategoricalAccessor(object):
"""
from pyspark.pandas.frame import DataFrame
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in add_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
if is_list_like(new_categories):
categories = list(new_categories) # type: List
else:
@@ -367,6 +385,8 @@ class CategoricalAccessor(object):
Whether or not to remove the categories inplace or return a copy of
this categorical with removed categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
Series or None
@@ -377,6 +397,14 @@ class CategoricalAccessor(object):
ValueError
If the removals are not contained in the categories
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> s = ps.Series(list("abbccc"), dtype="category")
@@ -400,6 +428,13 @@ class CategoricalAccessor(object):
dtype: category
Categories (2, object): ['a', 'c']
"""
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in remove_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
if is_list_like(removals):
categories = [cat for cat in removals if cat is not None] # type: List
elif removals is None:
@@ -451,11 +486,21 @@ class CategoricalAccessor(object):
Whether or not to drop unused categories inplace or return a copy of
this categorical with unused categories dropped.
+ .. deprecated:: 3.2.0
+
Returns
-------
cat : Series or None
Categorical with unused categories dropped or None if ``inplace=True``.
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
@@ -479,6 +524,13 @@ class CategoricalAccessor(object):
dtype: category
Categories (3, object): ['a', 'b', 'c']
"""
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in remove_unused_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
categories = set(self._data.drop_duplicates().to_pandas())
removals = [cat for cat in self.categories if cat not in categories]
return self.remove_categories(removals=removals, inplace=inplace)
@@ -510,6 +562,8 @@ class CategoricalAccessor(object):
Whether or not to rename the categories inplace or return a copy of
this categorical with renamed categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
cat : Series or None
@@ -560,6 +614,13 @@ class CategoricalAccessor(object):
"""
from pyspark.pandas.frame import DataFrame
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in rename_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
if is_dict_like(new_categories):
categories = [cast(dict, new_categories).get(item, item) for item in self.categories]
elif callable(new_categories):
@@ -611,6 +672,8 @@ class CategoricalAccessor(object):
Whether or not to reorder the categories inplace or return a copy of
this categorical with reordered categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
cat : Series or None
@@ -622,6 +685,14 @@ class CategoricalAccessor(object):
If the new categories do not contain all old category items or any
new ones
+ See Also
+ --------
+ rename_categories : Rename categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> s = ps.Series(list("abbccc"), dtype="category")
@@ -645,6 +716,13 @@ class CategoricalAccessor(object):
dtype: category
Categories (3, object): ['c' < 'b' < 'a']
"""
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in reorder_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
if not is_list_like(new_categories):
raise TypeError(
"Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
@@ -720,6 +798,8 @@ class CategoricalAccessor(object):
Whether or not to reorder the categories in-place or return a copy
of this categorical with reordered categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
Series with reordered categories or None if inplace.
@@ -782,6 +862,13 @@ class CategoricalAccessor(object):
"""
from pyspark.pandas.frame import DataFrame
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in set_categories is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+
if not is_list_like(new_categories):
raise TypeError(
"Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index cd95e39..e2dbd33 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -221,6 +221,8 @@ class CategoricalIndex(Index):
Whether or not to add the categories inplace or return a copy of
this categorical with added categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
CategoricalIndex or None
@@ -232,6 +234,14 @@ class CategoricalIndex(Index):
If the new categories include old categories or do not validate as
categories
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> idx = ps.CategoricalIndex(list("abbccc"))
@@ -329,6 +339,8 @@ class CategoricalIndex(Index):
Whether or not to remove the categories inplace or return a copy of
this categorical with removed categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
CategoricalIndex or None
@@ -339,6 +351,14 @@ class CategoricalIndex(Index):
ValueError
If the removals are not contained in the categories
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> idx = ps.CategoricalIndex(list("abbccc"))
@@ -365,11 +385,21 @@ class CategoricalIndex(Index):
Whether or not to drop unused categories inplace or return a copy of
this categorical with unused categories dropped.
+ .. deprecated:: 3.2.0
+
Returns
-------
cat : CategoricalIndex or None
Categorical with unused categories dropped or None if ``inplace=True``.
+ See Also
+ --------
+ rename_categories : Rename categories.
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ set_categories : Set the categories to the specified ones.
+
Examples
--------
>>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
@@ -386,67 +416,6 @@ class CategoricalIndex(Index):
return CategoricalIndex(self.to_series().cat.remove_unused_categories()).rename(self.name)
- def reorder_categories(
- self,
- new_categories: Union[pd.Index, Any, List],
- ordered: Optional[bool] = None,
- inplace: bool = False,
- ) -> Optional["CategoricalIndex"]:
- """
- Reorder categories as specified in new_categories.
-
- `new_categories` need to include all old categories and no new category
- items.
-
- Parameters
- ----------
- new_categories : Index-like
- The categories in new order.
- ordered : bool, optional
- Whether or not the categorical is treated as a ordered categorical.
- If not given, do not change the ordered information.
- inplace : bool, default False
- Whether or not to reorder the categories inplace or return a copy of
- this categorical with reordered categories.
-
- Returns
- -------
- cat : CategoricalIndex or None
- Categorical with removed categories or None if ``inplace=True``.
-
- Raises
- ------
- ValueError
- If the new categories do not contain all old category items or any
- new ones
-
- Examples
- --------
- >>> idx = ps.CategoricalIndex(list("abbccc"))
- >>> idx # doctest: +NORMALIZE_WHITESPACE
- CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
- categories=['a', 'b', 'c'], ordered=False, dtype='category')
-
- >>> idx.reorder_categories(['c', 'b', 'a']) # doctest: +NORMALIZE_WHITESPACE
- CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
- categories=['c', 'b', 'a'], ordered=False, dtype='category')
- """
- if inplace:
- raise ValueError("cannot use inplace with CategoricalIndex")
-
- return CategoricalIndex(
- self.to_series().cat.reorder_categories(new_categories=new_categories, ordered=ordered)
- ).rename(self.name)
-
- def __getattr__(self, item: str) -> Any:
- if hasattr(MissingPandasLikeCategoricalIndex, item):
- property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
- if isinstance(property_or_func, property):
- return property_or_func.fget(self) # type: ignore
- else:
- return partial(property_or_func, self)
- raise AttributeError("'CategoricalIndex' object has no attribute '{}'".format(item))
-
def rename_categories(
self, new_categories: Union[list, dict, Callable], inplace: bool = False
) -> Optional["CategoricalIndex"]:
@@ -474,6 +443,8 @@ class CategoricalIndex(Index):
Whether or not to rename the categories inplace or return a copy of
this categorical with renamed categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
cat : CategoricalIndex or None
@@ -517,6 +488,68 @@ class CategoricalIndex(Index):
self.name
)
+ def reorder_categories(
+ self,
+ new_categories: Union[pd.Index, Any, List],
+ ordered: Optional[bool] = None,
+ inplace: bool = False,
+ ) -> Optional["CategoricalIndex"]:
+ """
+ Reorder categories as specified in new_categories.
+
+ `new_categories` need to include all old categories and no new category
+ items.
+
+ Parameters
+ ----------
+ new_categories : Index-like
+ The categories in new order.
+ ordered : bool, optional
+ Whether or not the categorical is treated as an ordered categorical.
+ If not given, do not change the ordered information.
+ inplace : bool, default False
+ Whether or not to reorder the categories inplace or return a copy of
+ this categorical with reordered categories.
+
+ .. deprecated:: 3.2.0
+
+ Returns
+ -------
+ cat : CategoricalIndex or None
+ Categorical with reordered categories or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If the new categories do not contain all old category items or any
+ new ones
+
+ See Also
+ --------
+ rename_categories : Rename categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(list("abbccc"))
+ >>> idx # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+ >>> idx.reorder_categories(['c', 'b', 'a']) # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['c', 'b', 'a'], ordered=False, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return CategoricalIndex(
+ self.to_series().cat.reorder_categories(new_categories=new_categories, ordered=ordered)
+ ).rename(self.name)
+
def set_categories(
self,
new_categories: Union[pd.Index, List],
@@ -557,6 +590,8 @@ class CategoricalIndex(Index):
Whether or not to reorder the categories in-place or return a copy
of this categorical with reordered categories.
+ .. deprecated:: 3.2.0
+
Returns
-------
CategoricalIndex with reordered categories or None if inplace.
@@ -598,6 +633,15 @@ class CategoricalIndex(Index):
self.to_series().cat.set_categories(new_categories, ordered=ordered, rename=rename)
).rename(self.name)
+ def __getattr__(self, item: str) -> Any:
+ if hasattr(MissingPandasLikeCategoricalIndex, item):
+ property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
+ if isinstance(property_or_func, property):
+ return property_or_func.fget(self) # type: ignore
+ else:
+ return partial(property_or_func, self)
+ raise AttributeError("'CategoricalIndex' object has no attribute '{}'".format(item))
+
def _test() -> None:
import os
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org