You are viewing a plain text version of this content; the hyperlink to the canonical version was dropped in conversion. The canonical message is available in the Apache mailing-list archives for commits@spark.apache.org.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/23 05:06:19 UTC
[spark] branch branch-3.2 updated: [SPARK-36261][PYTHON] Add
remove_unused_categories to CategoricalAccessor and CategoricalIndex
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 4abc1d3 [SPARK-36261][PYTHON] Add remove_unused_categories to CategoricalAccessor and CategoricalIndex
4abc1d3 is described below
commit 4abc1d389ed235c064f9e6da01253b47fec30f5b
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Fri Jul 23 14:04:59 2021 +0900
[SPARK-36261][PYTHON] Add remove_unused_categories to CategoricalAccessor and CategoricalIndex
### What changes were proposed in this pull request?
Add `remove_unused_categories` to `CategoricalAccessor` and `CategoricalIndex`.
### Why are the changes needed?
We should implement `remove_unused_categories` in `CategoricalAccessor` and `CategoricalIndex`.
### Does this PR introduce _any_ user-facing change?
Yes, users will be able to use `remove_unused_categories`.
### How was this patch tested?
Added some tests.
Closes #33485 from ueshin/issues/SPARK-36261/remove_unused_categories.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit 2fe12a75206d4dbef6d7678b876c16876136cdd0)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/categorical.py | 43 +++++++++++++++++++++-
python/pyspark/pandas/indexes/category.py | 31 ++++++++++++++++
python/pyspark/pandas/missing/indexes.py | 3 --
.../pyspark/pandas/tests/indexes/test_category.py | 8 ++++
python/pyspark/pandas/tests/test_categorical.py | 20 ++++++++++
7 files changed, 102 insertions(+), 5 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index c2eae08..7115018 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -176,6 +176,7 @@ Categorical components
CategoricalIndex.ordered
CategoricalIndex.add_categories
CategoricalIndex.remove_categories
+ CategoricalIndex.remove_unused_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
CategoricalIndex.rename_categories
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 877902c..c3a86bc 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -401,6 +401,7 @@ the ``Series.cat`` accessor.
Series.cat.codes
Series.cat.add_categories
Series.cat.remove_categories
+ Series.cat.remove_unused_categories
Series.cat.as_ordered
Series.cat.as_unordered
Series.cat.rename_categories
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index c7f0923..cc0c68c 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -439,8 +439,47 @@ class CategoricalAccessor(object):
else:
return psser
- def remove_unused_categories(self) -> "ps.Series":
- raise NotImplementedError()
+ def remove_unused_categories(self, inplace: bool = False) -> Optional["ps.Series"]:
+ """
+ Remove categories which are not used.
+
+ Parameters
+ ----------
+ inplace : bool, default False
+ Whether or not to drop unused categories inplace or return a copy of
+ this categorical with unused categories dropped.
+
+ Returns
+ -------
+ cat : Series or None
+ Categorical with unused categories dropped or None if ``inplace=True``.
+
+ Examples
+ --------
+ >>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
+ >>> s # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (4, object): ['a', 'b', 'c', 'd']
+
+ >>> s.cat.remove_unused_categories() # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (3, object): ['a', 'b', 'c']
+ """
+ categories = set(self._data.drop_duplicates().to_pandas())
+ removals = [cat for cat in self.categories if cat not in categories]
+ return self.remove_categories(removals=removals, inplace=inplace)
def rename_categories(
self, new_categories: Union[list, dict, Callable], inplace: bool = False
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index a745b25..cc23fab 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -355,6 +355,37 @@ class CategoricalIndex(Index):
return CategoricalIndex(self.to_series().cat.remove_categories(removals)).rename(self.name)
+ def remove_unused_categories(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
+ """
+ Remove categories which are not used.
+
+ Parameters
+ ----------
+ inplace : bool, default False
+ Whether or not to drop unused categories inplace or return a copy of
+ this categorical with unused categories dropped.
+
+ Returns
+ -------
+ cat : CategoricalIndex or None
+ Categorical with unused categories dropped or None if ``inplace=True``.
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
+ >>> idx # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c', 'd'], ordered=False, dtype='category')
+
+ >>> idx.remove_unused_categories() # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c'], ordered=False, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return CategoricalIndex(self.to_series().cat.remove_unused_categories()).rename(self.name)
+
def __getattr__(self, item: str) -> Any:
if hasattr(MissingPandasLikeCategoricalIndex, item):
property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index ef65da1..c81657f 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -124,9 +124,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
- remove_unused_categories = _unsupported_function(
- "remove_unused_categories", cls="CategoricalIndex"
- )
set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
map = _unsupported_function("map", cls="CategoricalIndex")
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index a05eaef..c13e9ac 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -122,6 +122,14 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
+ def test_remove_unused_categories(self):
+ pidx = pd.CategoricalIndex([1, 4, 5, 3], categories=[4, 3, 2, 1])
+ psidx = ps.from_pandas(pidx)
+
+ self.assert_eq(pidx.remove_unused_categories(), psidx.remove_unused_categories())
+
+ self.assertRaises(ValueError, lambda: psidx.remove_unused_categories(inplace=True))
+
def test_as_ordered_unordered(self):
pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index e60426a..d9e232a 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -121,6 +121,26 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4, None]))
+ def test_remove_unused_categories(self):
+ pdf, psdf = self.df_pair
+
+ pser = pdf.a
+ psser = psdf.a
+
+ self.assert_eq(pser.cat.remove_unused_categories(), psser.cat.remove_unused_categories())
+
+ pser.cat.add_categories(4, inplace=True)
+ pser.cat.remove_categories(2, inplace=True)
+ psser.cat.add_categories(4, inplace=True)
+ psser.cat.remove_categories(2, inplace=True)
+
+ self.assert_eq(pser.cat.remove_unused_categories(), psser.cat.remove_unused_categories())
+
+ pser.cat.remove_unused_categories(inplace=True)
+ psser.cat.remove_unused_categories(inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
def test_as_ordered_unordered(self):
pdf, psdf = self.df_pair
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org